In [2]:
import zipfile
from collections import defaultdict

def list_sampled_zip_contents(zip_path, max_files_per_dir=5):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()

        # Group files by directory
        files_by_dir = defaultdict(list)
        for file in file_list:
            if file.endswith('/'):
                continue  # Skip folder entries
            dir_path = '/'.join(file.split('/')[:-1])
            files_by_dir[dir_path].append(file)

        # Print up to `max_files_per_dir` per folder
        for dir_path, files in files_by_dir.items():
            print(f"\n📁 Directory: {dir_path or '[root]'}")
            for f in files[:max_files_per_dir]:
                print(f"  └─ {f}")

# Example usage
list_sampled_zip_contents("maestro-v3.0.0-midi.zip")



📁 Directory: maestro-v3.0.0/2004
  └─ maestro-v3.0.0/2004/MIDI-Unprocessed_XP_08_R1_2004_01-02_ORIG_MID--AUDIO_08_R1_2004_01_Track01_wav.midi
  └─ maestro-v3.0.0/2004/MIDI-Unprocessed_XP_09_R1_2004_05_ORIG_MID--AUDIO_09_R1_2004_06_Track06_wav.midi
  └─ maestro-v3.0.0/2004/MIDI-Unprocessed_XP_14_R1_2004_01-03_ORIG_MID--AUDIO_14_R1_2004_01_Track01_wav.midi
  └─ maestro-v3.0.0/2004/MIDI-Unprocessed_XP_01_R1_2004_01-02_ORIG_MID--AUDIO_01_R1_2004_03_Track03_wav.midi
  └─ maestro-v3.0.0/2004/MIDI-Unprocessed_SMF_13_01_2004_01-05_ORIG_MID--AUDIO_13_R1_2004_09_Track09_wav.midi

📁 Directory: maestro-v3.0.0/2011
  └─ maestro-v3.0.0/2011/MIDI-Unprocessed_22_R2_2011_MID--AUDIO_R2-D5_11_Track11_wav.midi
  └─ maestro-v3.0.0/2011/MIDI-Unprocessed_02_R1_2011_MID--AUDIO_R1-D1_09_Track09_wav.midi
  └─ maestro-v3.0.0/2011/MIDI-Unprocessed_17_R1_2011_MID--AUDIO_R1-D7_03_Track03_wav.midi
  └─ maestro-v3.0.0/2011/MIDI-Unprocessed_21_R1_2011_MID--AUDIO_R1-D8_09_Track09_wav.midi
  └─ maestro-v3.0.0/2011/MIDI

In [3]:
import zipfile
import os

# Define the path to the zip file and the extraction directory
zip_file_path = "maestro-v3.0.0-midi.zip"
extract_dir = "maestro-v3.0.0"

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Extracted {zip_file_path} to {extract_dir}")


Extracted maestro-v3.0.0-midi.zip to maestro-v3.0.0


In [4]:
import pandas as pd

# Load metadata CSV
metadata_path = 'maestro-v3.0.0/maestro-v3.0.0/maestro-v3.0.0.csv'
df = pd.read_csv(metadata_path)

# Check structure
df.head()


Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508


In [5]:
# Split the dataset
train_df = df[df['split'] == 'train']
val_df = df[df['split'] == 'validation']
test_df = df[df['split'] == 'test']


In [6]:
pip install miditok


Collecting miditok
  Downloading miditok-3.0.5.post1-py3-none-any.whl.metadata (10 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.7-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.7 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.5.post1-py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.3/158.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading symusic-0.5.7-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pySmartDL-1.3.4-py3-none-any.whl (20 kB)
Installing collected packages: pySmartDL, symusic, miditok
Successfully installed miditok-3.0.5.post1 pySmartDL-1.3.4 symusic-0.5.7


In [7]:
from miditok import REMI
from pathlib import Path

tokenizer = REMI()
tokenized_out = Path("maestro-tokenized/train")
tokenized_out.mkdir(parents=True, exist_ok=True)

midi_root = Path("maestro-v3.0.0/maestro-v3.0.0")
subdirectories = [folder for folder in midi_root.iterdir() if folder.is_dir()]

midi_paths = []

for subdir in subdirectories:
    for midi_file in subdir.glob("*.midi"):
        if midi_file.exists():
            midi_paths.append(midi_file)
        else:
            print(f"Skipped (file not found): {midi_file}")

for midi_path in midi_paths:
    try:
        encoded = tokenizer.encode(midi_path)

        if isinstance(encoded, list):
            for i, tokens in enumerate(encoded):
                out_path = tokenized_out / f"{midi_path.stem}_track{i}.json"
                tokenizer.save_tokens(tokens, out_path)
        else:
            out_path = tokenized_out / f"{midi_path.stem}.json"
            tokenizer.save_tokens(encoded, out_path)

    except Exception as e:
        print(f"Failed to tokenize {midi_path.name}: {e}")


In [8]:
from pathlib import Path
import json

tokenized_dir = Path("maestro-tokenized/train")
token_files = list(tokenized_dir.glob("*.json"))

print(f"✅ Total encoded files found: {len(token_files)}\n")

# Optionally, print a few sample filenames and their token count
for tf in token_files[:5]:  # limit to first 5 for preview
    try:
        with open(tf, 'r') as f:
            data = json.load(f)
        print(f"{tf.name}: {len(data.get('ids', [])) if 'ids' in data else 'No ids'} tokens")
    except Exception as e:
        print(f"❌ Error reading {tf.name}: {e}")


✅ Total encoded files found: 1276

MIDI-Unprocessed_01_R2_2006_01_ORIG_MID--AUDIO_01_R2_2006_03_Track03_wav_track0.json: 32035 tokens
MIDI-Unprocessed_XP_03_R1_2004_01-02_ORIG_MID--AUDIO_03_R1_2004_02_Track02_wav_track0.json: 58643 tokens
MIDI-Unprocessed_R1_D2-13-20_mid--AUDIO-from_mp3_13_R1_2015_wav--2_track0.json: 14384 tokens
MIDI-Unprocessed_02_R1_2011_MID--AUDIO_R1-D1_08_Track08_wav_track0.json: 11589 tokens
MIDI-Unprocessed_Schubert4-6_MID--AUDIO_10_R2_2018_wav_track0.json: 78372 tokens


In [9]:
import json
import torch
from torch.utils.data import Dataset
import random

class SafeMIDITokenDataset(Dataset):
    def __init__(self, json_paths, max_seq_len=1024):
        self.json_paths = json_paths
        self.max_seq_len = max_seq_len

    def __len__(self):
        return 10000  # Arbitrary, acts like an infinite stream

    def __getitem__(self, idx):
        while True:
            path = random.choice(self.json_paths)
            try:
                with open(path) as f:
                    tokens = json.load(f).get("ids", [])

                if len(tokens) > self.max_seq_len + 1:
                    offset = random.randint(0, len(tokens) - self.max_seq_len - 1)
                    input_seq = tokens[offset : offset + self.max_seq_len]
                    target_seq = tokens[offset + 1 : offset + self.max_seq_len + 1]
                    return torch.tensor(input_seq), torch.tensor(target_seq)
            except Exception as e:
                print(f"Skipping {path}: {e}")


In [10]:
from torch.utils.data import DataLoader

train_dataset = SafeMIDITokenDataset(token_files, max_seq_len=512)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0)


In [11]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048):
        super().__init__()
        self.pe = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.pe, mean=0, std=0.02)  # Learnable init

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MusicTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.fc_out = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, vocab_size)
        )

    def forward(self, src):
        src = self.embedding(src)
        src = self.dropout(src)
        src = self.pos_encoder(src)
        src = self.norm(src)

        out = self.transformer(src)
        out = self.norm(out)

        return self.fc_out(out)


In [12]:
vocab_size = tokenizer.vocab_size
model = MusicTransformer(vocab_size).cuda()




In [13]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train_one_epoch():
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")


In [14]:
for epoch in range(20):
    print(f"Epoch {epoch+1}")
    train_one_epoch()


Epoch 1
Train Loss: 3.2334
Epoch 2
Train Loss: 3.1766
Epoch 3
Train Loss: 3.1752
Epoch 4
Train Loss: 3.1722
Epoch 5
Train Loss: 3.1728
Epoch 6
Train Loss: 3.1734
Epoch 7
Train Loss: 3.1718
Epoch 8
Train Loss: 3.1691
Epoch 9
Train Loss: 3.1658
Epoch 10
Train Loss: 3.1710
Epoch 11
Train Loss: 3.1714
Epoch 12
Train Loss: 3.1711
Epoch 13
Train Loss: 3.1739
Epoch 14
Train Loss: 3.1668
Epoch 15
Train Loss: 3.1711
Epoch 16
Train Loss: 3.1694
Epoch 17
Train Loss: 3.1678
Epoch 18
Train Loss: 3.1696
Epoch 19
Train Loss: 3.1721
Epoch 20
Train Loss: 3.1687


In [15]:
import torch

def top_k_logits(logits, k):
    """Apply top-k filtering to logits."""
    values, indices = torch.topk(logits, k)
    mask = logits < values[..., -1, None]
    logits[mask] = float('-inf')
    return logits

def generate_tokens(
    model,
    tokenizer,
    max_length=1024,
    temperature=1.0,
    top_k=20,
    start_token=None,
    device='cuda'
):
    model.eval()
    start_token_id = start_token if start_token is not None else tokenizer.vocab.get('Bar_None')
    if start_token_id is None:
        raise ValueError("'Bar_None' not found in tokenizer vocab and no start_token provided.")

    generated = [start_token_id]
    input_tensor = torch.tensor([generated], dtype=torch.long).to(device)

    for _ in range(max_length):
        with torch.no_grad():
            output = model(input_tensor)
            logits = output[0] if isinstance(output, tuple) else output

            if logits.dim() == 3:
                logits = logits[:, -1, :]  # shape: [1, vocab]
            logits = logits / temperature

            # Top-k sampling
            if top_k > 0:
                logits = top_k_logits(logits, k=top_k)

            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()

        generated.append(next_token)

        # Prepare context window
        context = generated[-tokenizer.max_seq_len:] if hasattr(tokenizer, "max_seq_len") else generated
        input_tensor = torch.tensor([context], dtype=torch.long).to(device)

        # Optional early stop
        if next_token == tokenizer.vocab.get("EndOfTrack", -1):
            break

    return generated


In [16]:
!pip install miditoolkit


Collecting miditoolkit
  Downloading miditoolkit-1.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting mido>=1.1.16 (from miditoolkit)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading miditoolkit-1.0.1-py3-none-any.whl (24 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido, miditoolkit
Successfully installed miditoolkit-1.0.1 mido-1.3.3


In [17]:
def tokens_to_midi(token_ids, tokenizer, output_midi_path):
    import mido
    from mido import MidiFile, MidiTrack, Message, MetaMessage

    id_to_token = {v: k for k, v in tokenizer.vocab.items()}
    tokens = [id_to_token[i] for i in token_ids]

    midi = MidiFile(ticks_per_beat=480)
    track = MidiTrack()
    midi.tracks.append(track)
    track.append(MetaMessage('set_tempo', tempo=mido.bpm2tempo(120)))

    time_per_tick = 1  # this will be scaled later
    abs_time = 0
    current_position = 0
    note_buffer = []

    i = 0
    while i < len(tokens):
        token = tokens[i]

        if token.startswith("Bar"):
            current_position = 0

        elif token.startswith("Position_"):
            current_position = int(token.split("_")[1])

        elif token.startswith("Pitch_"):
            pitch = int(token.split("_")[1])
            if (i + 2) < len(tokens) and tokens[i+1].startswith("Velocity_") and tokens[i+2].startswith("Duration_"):
                velocity = int(tokens[i+1].split("_")[1])
                duration_str = tokens[i+2].split("_")[1]
                # convert Duration_0.2.8 to ticks
                beats = [int(x) for x in duration_str.split(".")]
                duration_ticks = (
                    beats[0] * 480 +
                    beats[1] * (480 // 4) +
                    beats[2] * (480 // 16)
                )
                # Note ON
                track.append(Message('note_on', note=pitch, velocity=velocity, time=abs_time))
                # Note OFF after duration
                track.append(Message('note_off', note=pitch, velocity=velocity, time=duration_ticks))
                abs_time = 0  # reset time since we already added it

                i += 2  # skip velocity and duration

        i += 1

    midi.save(output_midi_path)


In [18]:
# Generation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gen_ids = generate_tokens(model, tokenizer, max_length=2048, temperature=1.0, device=device)



In [19]:
print("Generated Token IDs:", gen_ids)

Generated Token IDs: [4, 190, 43, 107, 130, 201, 56, 107, 125, 218, 49, 106, 125, 203, 49, 108, 125, 218, 46, 109, 127, 4, 190, 50, 108, 125, 215, 51, 110, 125, 206, 39, 104, 125, 211, 55, 109, 125, 216, 48, 113, 130, 212, 49, 107, 127, 4, 189, 43, 106, 125, 216, 44, 103, 127, 206, 53, 114, 125, 208, 43, 103, 125, 4, 189, 55, 115, 128, 200, 53, 104, 125, 215, 49, 102, 125, 208, 50, 108, 126, 201, 55, 106, 128, 214, 59, 111, 125, 209, 48, 105, 125, 215, 52, 108, 125, 198, 50, 108, 130, 206, 52, 112, 125, 4, 191, 41, 102, 125, 212, 51, 110, 125, 203, 44, 102, 132, 199, 46, 114, 125, 219, 41, 107, 125, 203, 39, 107, 127, 215, 52, 110, 126, 202, 53, 112, 125, 216, 48, 100, 130, 211, 45, 111, 126, 201, 53, 109, 129, 219, 48, 107, 126, 201, 60, 109, 125, 4, 189, 55, 110, 126, 212, 47, 105, 132, 215, 53, 106, 125, 214, 60, 110, 126, 219, 60, 108, 125, 196, 47, 117, 125, 215, 43, 104, 130, 201, 52, 108, 126, 204, 49, 110, 126, 197, 57, 103, 125, 208, 50, 104, 129, 211, 51, 110, 125, 204, 46, 1

In [20]:
id_to_token = {v: k for k, v in tokenizer.vocab.items()}
print([id_to_token[t] for t in gen_ids[:100]])


['Bar_None', 'Position_1', 'Pitch_59', 'Velocity_59', 'Duration_0.6.8', 'Position_12', 'Pitch_72', 'Velocity_59', 'Duration_0.1.8', 'Position_29', 'Pitch_65', 'Velocity_55', 'Duration_0.1.8', 'Position_14', 'Pitch_65', 'Velocity_63', 'Duration_0.1.8', 'Position_29', 'Pitch_62', 'Velocity_67', 'Duration_0.3.8', 'Bar_None', 'Position_1', 'Pitch_66', 'Velocity_63', 'Duration_0.1.8', 'Position_26', 'Pitch_67', 'Velocity_71', 'Duration_0.1.8', 'Position_17', 'Pitch_55', 'Velocity_47', 'Duration_0.1.8', 'Position_22', 'Pitch_71', 'Velocity_67', 'Duration_0.1.8', 'Position_27', 'Pitch_64', 'Velocity_83', 'Duration_0.6.8', 'Position_23', 'Pitch_65', 'Velocity_59', 'Duration_0.3.8', 'Bar_None', 'Position_0', 'Pitch_59', 'Velocity_55', 'Duration_0.1.8', 'Position_27', 'Pitch_60', 'Velocity_43', 'Duration_0.3.8', 'Position_17', 'Pitch_69', 'Velocity_87', 'Duration_0.1.8', 'Position_19', 'Pitch_59', 'Velocity_43', 'Duration_0.1.8', 'Bar_None', 'Position_0', 'Pitch_71', 'Velocity_91', 'Duration_0.4

In [21]:
output_midi_path = "generated_music_sample.mid"
tokens_to_midi(gen_ids, tokenizer, output_midi_path)

In [22]:
import torch

# Assuming `model` is your trained model
model_path = "trained_model.pth"  # Path where you want to save the model

# Save the model's state_dict (recommended way to save in PyTorch)
torch.save(model.state_dict(), model_path)

print(f"Model saved to {model_path}")


Model saved to trained_model.pth
