In [28]:
import os
import pickle
from glob import glob
from tqdm import tqdm
import pretty_midi

In [29]:
# ========== CONFIG ==========
# Root folder of the MIDI dataset (relative to the working directory).
# Each mood has its own subfolder here, which is searched for "*.midi" files.
DATA_DIR = "../data/reduced_midi"
# Output folder for the pickled note list and the vocabulary mappings.
ARTIFACT_DIR = "artifacts"

# Mood subfolder names under DATA_DIR, processed in this order.
MOODS = ["happy", "sad", "angry", "quiet"]
# ===========================

In [30]:
# Create the artifact folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

In [31]:
def extract_notes_from_midi(midi_path):
    """
    Read a MIDI file and collect the pitch of every non-drum note.

    Instruments are visited in the order pretty_midi exposes them, and each
    instrument's notes in their stored order; instruments flagged as drums
    are skipped entirely.

    Returns a flat list of pitches, each converted to a string.
    """
    score = pretty_midi.PrettyMIDI(midi_path)
    return [
        str(event.pitch)
        for track in score.instruments
        if not track.is_drum
        for event in track.notes
    ]

In [32]:
# Gather the pitch tokens of every MIDI file, mood by mood, into one flat list.
all_notes = []
# (path, error repr) for every file that could not be parsed, so skipped
# files are visible after the run instead of vanishing silently.
failed_files = []

for mood in MOODS:
    # glob returns paths in a filesystem-dependent order; sorting makes the
    # order of all_notes reproducible across runs and machines.
    files = sorted(glob(os.path.join(DATA_DIR, mood, "*.midi")))
    print(f"Processing {mood}: {len(files)} files")

    for file in tqdm(files):
        try:
            notes = extract_notes_from_midi(file)
        except Exception as exc:
            # Best-effort: one corrupt file must not abort a long run,
            # but remember it so the loss can be inspected afterwards.
            failed_files.append((file, repr(exc)))
            continue
        all_notes.extend(notes)

if failed_files:
    print(f"Skipped {len(failed_files)} unreadable files (see failed_files)")

Processing happy: 3000 files


  0%|          | 0/3000 [00:00<?, ?it/s]

100%|██████████| 3000/3000 [09:25<00:00,  5.30it/s]


Processing sad: 2998 files


100%|██████████| 2998/2998 [10:44<00:00,  4.65it/s]  


Processing angry: 2999 files


100%|██████████| 2999/2999 [11:01<00:00,  4.54it/s]


Processing quiet: 2989 files


100%|██████████| 2989/2989 [07:21<00:00,  6.77it/s]


In [33]:
# Quick sanity check: overall token count and number of distinct pitch tokens.
total_events = len(all_notes)
distinct_notes = set(all_notes)

print("Total note events:", total_events)
print("Unique notes:", len(distinct_notes))

Total note events: 22113921
Unique notes: 128


In [34]:
# Build the vocabulary of distinct pitch tokens.
# Pitches are stored as strings, so a plain sorted() would order them
# lexicographically ("100" before "2"). Sorting with key=int keeps the tokens
# as strings but makes the indices follow ascending pitch.
# NOTE: this changes the index assignment compared with artifacts produced by
# the earlier lexicographic ordering; regenerate anything built on old pickles.
vocab = sorted(set(all_notes), key=int)

# Forward and inverse lookup tables between pitch token and integer index.
note_to_idx = {note: idx for idx, note in enumerate(vocab)}
idx_to_note = {idx: note for note, idx in note_to_idx.items()}

print("Vocabulary size:", len(vocab))

Vocabulary size: 128


In [35]:
# Persist the note sequence and both lookup tables, one pickle file each.
artifacts_to_save = (
    ("all_notes.pkl", all_notes),
    ("note_to_idx.pkl", note_to_idx),
    ("idx_to_note.pkl", idx_to_note),
)

for filename, payload in artifacts_to_save:
    target_path = os.path.join(ARTIFACT_DIR, filename)
    with open(target_path, "wb") as handle:
        pickle.dump(payload, handle)

In [36]:
# Show what ended up in the artifact folder as a final confirmation.
saved_files = os.listdir(ARTIFACT_DIR)
print("Artifacts saved:")
print(saved_files)

Artifacts saved:
['all_notes.pkl', 'idx_to_note.pkl', 'note_to_idx.pkl']
