In [19]:
import os
import pickle
from glob import glob
from tqdm import tqdm
import numpy as np
import pretty_midi

In [20]:
# --------------------------- Pipeline settings ---------------------------
# Sliding-window parameters used when slicing each file's note sequence.
SEQ_LEN = 50                    # number of notes in one input window
MAX_SEQUENCES_PER_FILE = 2000   # cap on windows taken from a single file

# Mood classes. Each name is a sub-folder of DATA_DIR, and its position in
# this list is the integer label stored alongside every training sample.
MOODS = ["happy", "sad", "angry", "quiet"]

# Locations, relative to the notebook's working directory.
DATA_DIR = "../data/reduced_midi"       # input: <DATA_DIR>/<mood>/*.midi
ARTIFACT_DIR = "artifacts"              # input: note_to_idx.pkl lives here
OUTPUT_DIR = "../data/training_data"    # output: X.pkl, y.pkl, mood_labels.pkl
# -------------------------------------------------------------------------

In [21]:
# Create the output directory up front so the pickle writes at the end of the
# notebook cannot fail on a missing folder; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [22]:
# Load the note -> integer-index vocabulary stored in ARTIFACT_DIR.
# NOTE: pickle.load executes arbitrary code embedded in the file, so this
# artifact must come from a trusted source.
vocab_path = os.path.join(ARTIFACT_DIR, "note_to_idx.pkl")
with open(vocab_path, "rb") as vocab_file:
    note_to_idx = pickle.load(vocab_file)

# The number of distinct tokens is simply the size of the mapping.
vocab_size = len(note_to_idx)
print("Vocabulary size:", vocab_size)

Vocabulary size: 128


In [23]:
def extract_notes_from_midi(midi_path):
    """Return the pitch of every non-drum note in a MIDI file, as strings.

    Instruments are visited in the order ``pretty_midi`` exposes them and each
    instrument's notes are taken in their stored order, so the result is a
    per-instrument concatenation (no re-sorting across instruments is done).

    Args:
        midi_path: Path of the MIDI file, handed to ``pretty_midi.PrettyMIDI``.

    Returns:
        A list with ``str(note.pitch)`` for each note of each instrument whose
        ``is_drum`` flag is false.
    """
    parsed = pretty_midi.PrettyMIDI(midi_path)
    return [
        str(midi_note.pitch)
        for track in parsed.instruments
        if not track.is_drum  # drum tracks are skipped entirely
        for midi_note in track.notes
    ]

In [24]:
def _windows_from_encoded(encoded, seq_len, max_windows):
    """Slice an encoded note sequence into (input window, next note) pairs.

    Args:
        encoded: List of integer note indices for one file.
        seq_len: Number of notes in each input window.
        max_windows: Upper bound on the number of windows returned.

    Returns:
        ``(inputs, targets)`` where ``inputs[k]`` is ``encoded[k:k + seq_len]``
        and ``targets[k]`` is ``encoded[k + seq_len]``. Both lists are empty
        when the sequence has ``seq_len`` notes or fewer.
    """
    # A negative bound (sequence too short) makes range() empty.
    n_windows = min(len(encoded) - seq_len, max_windows)
    inputs = [encoded[start:start + seq_len] for start in range(n_windows)]
    targets = [encoded[start + seq_len] for start in range(n_windows)]
    return inputs, targets


# Flat, parallel lists: X[k] is a window of SEQ_LEN note indices, y[k] is the
# note that follows it, and mood_labels[k] is the integer mood of its file.
X, y, mood_labels = [], [], []

# (path, error) for every MIDI file that could not be parsed, so that dropped
# files are visible instead of being skipped silently.
failed_files = []

# The position of a mood in MOODS is its integer label.
for mood_id, mood in enumerate(MOODS):
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the sample order reproducible between runs and machines.
    files = sorted(glob(os.path.join(DATA_DIR, mood, "*.midi")))

    print(f"Processing {mood}: {len(files)} files")

    for file in tqdm(files):
        try:
            notes = extract_notes_from_midi(file)
        except Exception as exc:
            # Best-effort: a corrupt file must not abort the whole run, but
            # remember it so the loss can be reported below.
            failed_files.append((file, repr(exc)))
            continue

        # Notes missing from the vocabulary are dropped.
        encoded = [note_to_idx[n] for n in notes if n in note_to_idx]

        inputs, targets = _windows_from_encoded(
            encoded, SEQ_LEN, MAX_SEQUENCES_PER_FILE
        )
        X.extend(inputs)
        y.extend(targets)
        mood_labels.extend([mood_id] * len(targets))

if failed_files:
    print(f"Skipped {len(failed_files)} unreadable MIDI files; first few:")
    for failed_path, failed_error in failed_files[:5]:
        print(f"  {failed_path}: {failed_error}")


Processing happy: 3000 files


100%|██████████| 3000/3000 [03:22<00:00, 14.84it/s]


Processing sad: 2998 files


100%|██████████| 2998/2998 [02:52<00:00, 17.42it/s]


Processing angry: 2999 files


100%|██████████| 2999/2999 [02:29<00:00, 20.02it/s]


Processing quiet: 2989 files


100%|██████████| 2989/2989 [01:22<00:00, 36.06it/s]


In [25]:
# Pack the accumulated Python lists into NumPy arrays with small integer
# dtypes: 16-bit for note indices, 8-bit for mood labels.
X = np.array(X, dtype=np.int16)
y = np.array(y, dtype=np.int16)
mood_labels = np.array(mood_labels, dtype=np.int8)

# Report the resulting shapes as a quick sanity check.
for _caption, _array in (
    ("X shape:", X),
    ("y shape:", y),
    ("Mood labels shape:", mood_labels),
):
    print(_caption, _array.shape)

X shape: (16716936, 50)
y shape: (16716936,)
Mood labels shape: (16716936,)


In [26]:
# Write each array to its own pickle file inside OUTPUT_DIR.
for _filename, _array in (
    ("X.pkl", X),
    ("y.pkl", y),
    ("mood_labels.pkl", mood_labels),
):
    with open(os.path.join(OUTPUT_DIR, _filename), "wb") as _handle:
        pickle.dump(_array, _handle)

In [27]:
# Confirm what landed in the output directory and how many samples were built.
saved_files = os.listdir(OUTPUT_DIR)
print("Saved files:", saved_files)
n_samples = len(X)
print("Total training samples:", n_samples)

Saved files: ['mood_labels.pkl', 'X.pkl', 'y.pkl']
Total training samples: 16716936
