# 0) requirements

In [1]:
# Install required package (if not already installed)

!pip install miditok kagglehub mido pydub


Collecting miditok
  Downloading miditok-3.0.5.post1-py3-none-any.whl.metadata (10 kB)
Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.7-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.7 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.5.post1-py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.3/158.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading symusic-0.5.7-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m00:01

## b) import dependencies

In [2]:
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import mido
import matplotlib.pyplot as plt

from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI
from pathlib import Path

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [3]:
from miditok import REMI, TokenizerConfig
tokenizer = REMI.from_pretrained("Richatte2000/tokenizer_midi_piano")

tokenizer.json:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

  super().__init__(tokenizer_config, params)


In [4]:
# The following code will only execute
# successfully when compression is complete
"""
import kagglehub

# Download latest version
path = kagglehub.dataset_download("pierrepauchet/midi-piano-chunks")

print("Path to dataset files:", path)
"""

'\nimport kagglehub\n\n# Download latest version\npath = kagglehub.dataset_download("pierrepauchet/midi-piano-chunks")\n\nprint("Path to dataset files:", path)\n'

In [4]:
from miditok.pytorch_data import DatasetMIDI, DataCollator
from torch.utils.data import DataLoader
from pathlib import Path 

train_dataset = DatasetMIDI(files_paths=list(Path("/kaggle/input/midi-piano-chunks/train").resolve().glob("**/*.mid")),
                            tokenizer=tokenizer,
                            max_seq_len=512,
                            bos_token_id=tokenizer.pad_token_id,
                            eos_token_id=tokenizer["BOS_None"],
)

print("Train dataset loaded")
val_dataset = DatasetMIDI(files_paths=list(Path("/kaggle/input/midi-piano-chunks/val").resolve().glob("**/*.mid")),
                            tokenizer=tokenizer,
                            max_seq_len=512,
                            bos_token_id=tokenizer.pad_token_id,
                            eos_token_id=tokenizer["BOS_None"],
)
print("Val dataset loaded")
test_dataset = DatasetMIDI(files_paths=list(Path("/kaggle/input/midi-piano-chunks/test").resolve().glob("**/*.mid")),
                            tokenizer=tokenizer,
                            max_seq_len=512,
                            bos_token_id=tokenizer.pad_token_id,
                            eos_token_id=tokenizer["BOS_None"]
)
print("Test dataset loaded")


Train dataset loaded
Val dataset loaded
Test dataset loaded


In [5]:
# Récupération des tokens spéciaux
special_tokens = tokenizer.special_tokens
special_tokens_ids = tokenizer.special_tokens_ids
pad_token, bos_token, eos_token, mask_token = special_tokens
pad_token_id, bos_token_id, eos_token_id, mask_token_id = special_tokens_ids

In [6]:
import numpy as np
import torch
from miditok.pytorch_data import DataCollator

class DataCollatorForInfilling(DataCollator):
    """
    Data collator qui hérite du DataCollator de miditok et qui ajoute une corruption :
    pour chaque exemple, on masque UNE séquence contiguë de n tokens (n ~ Poisson(15)).
    La séquence est choisie aléatoirement parmi tous les tokens valides (excluant BOS et EOS).
    Les tokens masqués dans l'input sont remplacés par mask_token_id et dans les labels,
    ces positions conservent la valeur originale (les autres positions sont mises à -100).
    """
    def __init__(self, pad_token_id, mask_token_id, poisson_lambda=15, copy_inputs_as_labels=True, shift_labels=True):
        super().__init__(pad_token_id, copy_inputs_as_labels=copy_inputs_as_labels, shift_labels=shift_labels)
        self.pad_token_id = pad_token_id
        self.mask_token_id = mask_token_id
        self.poisson_lambda = poisson_lambda

    def __call__(self, batch):

        batch = super().__call__(batch)
   
        inputs = batch["input_ids"].clone()
        labels = inputs.clone()
       
        for i in range(inputs.size(0)):
            seq = inputs[i]
            
            valid_positions = (seq != self.pad_token_id).nonzero(as_tuple=False).view(-1)
            
            valid_positions = valid_positions[(valid_positions != 0) & (valid_positions != (seq.size(0) - 1))]
            if len(valid_positions) == 0:
                continue

            
            n_mask = np.random.poisson(self.poisson_lambda)
            available_length = valid_positions[-1].item() - valid_positions[0].item() + 1
            
            span_length = min(n_mask, available_length)
            if span_length <= 0:
                continue

            start_idx = np.random.randint(valid_positions[0].item(), valid_positions[-1].item() - span_length + 2)
            
            for j in range(start_idx, start_idx + span_length):
                labels[i, j] = inputs[i, j]      
                inputs[i, j] = self.mask_token_id 

        batch["input_ids"] = inputs
        batch["labels"] = labels
        return batch


In [7]:
collator = DataCollatorForInfilling(
    pad_token_id=tokenizer.pad_token_id,
    mask_token_id=mask_token_id,
    poisson_lambda=15,
    copy_inputs_as_labels=True,
    shift_labels=True
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=collator)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collator)

In [11]:
sample = next(iter(train_loader))

print("Inpus ID : ",sample['input_ids'][0])
print("-------------------")

print("LABELS : ",sample['labels'][0])
print("-------------------")

print("Attention MASK : ", sample['attention_mask'][0])
print("-------------------")

L_input,L_label,L_attention = sample['input_ids'][0], sample['labels'][0], sample['attention_mask'][0]

#Boucle pour check
for i in range(0,len(L_input)):
    if L_input[i] == mask_token_id:
        print("input : ",L_input[i] ,"labels : ",L_label[i] ,"attention mask : ",L_attention[i] )

Inpus ID :  tensor([ 3323,   526,  1848,  4089,   533,  1473,   634,  1464,   463,  1377,
          466,  1693,   607,  1192,  4827,  4944,   589,  1272,   474,  1357,
          463,  1624,  7537,  5803,   581,  1340,   474,  1289,   509,  2657,
         4681,  8078,  3427,  6163,  5944,   589,  3065,   507,  2989,   489,
         1583,  4835,  4987,  6177,  6163,   530,  4051,   530,  1557,  2570,
         2890,  2709,  4801,   474,  1468,  3946,  6039,  3563,  5369,   549,
         4048,   466,  3998, 17288,  5284, 10866,  3374,   521,  6123,   453,
         3725,   453,  1423,  5388,  3424,  5385,   558,  4670,   466,  4741,
          493,  1316,   516,  1897,  4533,  3500,  3235,   535,  1581,  8800,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,   460,
         3301,   849,  9718,  2938,  6027,  1974, 16684,  2077,  1994,  1748,
         7319,   534,  2306,  6015,  6287, 11772,  3

## bart training

In [19]:
#############################
# Définition du modèle BART de base (non pré-entraîné)
#############################
from transformers import BartConfig, BartForConditionalGeneration

config = BartConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=1024,
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    d_model=512,
    bos_token_id=bos_token_id,
    eos_token_id=eos_token_id,
    pad_token_id=pad_token_id,
    mask_token_id=mask_token_id
)
#model = BartForConditionalGeneration(config)
#model = model.to(device)

model = BartForConditionalGeneration.from_pretrained("Richatte2000/model-from-trained-trained-epoch-0").to(device)



config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/323M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [20]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)




In [21]:
#############################
# Boucle d'entraînement avec évaluation sur la validation et push sur Hugging Face
#############################
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in progress_bar:
        # Déplacement des données vers le device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        # Clipping des gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        total_train_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})
    
    avg_train_loss = total_train_loss / len(train_loader)
    
    # Évaluation sur le jeu de validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_val_loss += outputs.loss.item()
    avg_val_loss = total_val_loss / len(val_loader)
    
    print(f"\nEpoch {epoch+1} terminé : Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f}\n")
    
    # Push du modèle sur Hugging Face Hub
    # Remplacez "USERNAME/REPO_NAME" et "YOUR_TOKEN" par vos identifiants et token.
    model.push_to_hub("Richatte2000/model-from-trained-trained-epoch-"+str(epoch), use_auth_token="hidden", commit_message=f"Epoch {epoch+1}")
    model.save_pretrained("model-trained-epoch-"+str(epoch))


Epoch 1/10: 100%|██████████| 4615/4615 [2:09:45<00:00,  1.69s/it, loss=0.167]  



Epoch 1 terminé : Train Loss = 0.2011 | Val Loss = 0.1914





model.safetensors:   0%|          | 0.00/323M [00:00<?, ?B/s]

Epoch 2/10:   6%|▌         | 285/4615 [07:49<1:58:59,  1.65s/it, loss=0.176]


KeyboardInterrupt: 

# Entrainement en remplacant une lkiste masqué par un seul mask

In [24]:
import numpy as np
import torch
from miditok.pytorch_data import DataCollator

# Retrieve special tokens
special_tokens = tokenizer.special_tokens
special_tokens_ids = tokenizer.special_tokens_ids
pad_token, bos_token, eos_token, mask_token = special_tokens
pad_token_id, bos_token_id, eos_token_id, mask_token_id = special_tokens_ids

class DataCollatorForInfilling(DataCollator):
    """
    Data collator that inherits from miditok's DataCollator and adds corruption:
    For each example, mask ONE contiguous sequence of n tokens (n ~ Poisson(15)).
    The sequence is randomly chosen from all valid tokens (excluding BOS and EOS).

    Modifications:
      - In the input, the masked sequence is replaced by a SINGLE mask_token.
      - The "missing" tokens (span_length - 1) are added at the END as pad tokens,
        to maintain the same length as the original.
      - In the labels, the position corresponding to the first masked token keeps the original value,
        while other positions in the masked span are set to -100.
    """
    def __init__(self, pad_token_id, mask_token_id, poisson_lambda=15, copy_inputs_as_labels=True, shift_labels=True):
        super().__init__(pad_token_id, copy_inputs_as_labels=copy_inputs_as_labels, shift_labels=shift_labels)
        self.pad_token_id = pad_token_id
        self.mask_token_id = mask_token_id
        self.poisson_lambda = poisson_lambda

    def __call__(self, batch):
        # Apply the base collator for padding and label shifting if requested
        batch = super().__call__(batch)
        inputs = batch["input_ids"].clone()
        labels = batch["input_ids"].clone()  # Start with a copy of inputs

        # For each example in the batch
        for i in range(inputs.size(0)):
            seq = inputs[i]
            L = seq.size(0)
            # Get valid positions (non-padding)
            valid_positions = (seq != self.pad_token_id).nonzero(as_tuple=False).view(-1)
            # Exclude the first and last token (often BOS and EOS)
            valid_positions = valid_positions[(valid_positions != 0) & (valid_positions != (L - 1))]
            if len(valid_positions) == 0:
                continue

            # Number of tokens to mask according to a Poisson distribution
            n_mask = np.random.poisson(self.poisson_lambda)

            # Calculate the available length in the contiguous sequence of valid tokens
            available_length = valid_positions[-1].item() - valid_positions[0].item() + 1
            span_length = min(n_mask, available_length)
            if span_length <= 0:
                continue

            # Randomly choose a start index such that the masked block stays within valid positions
            start_idx = np.random.randint(valid_positions[0].item(), valid_positions[-1].item() - span_length + 2)

            # Modify the input
            input_before = seq[:start_idx]
            input_after = seq[start_idx + span_length:]
            new_seq = torch.cat([input_before, torch.tensor([self.mask_token_id], dtype=seq.dtype), input_after])
            # Calculate the number of removed tokens (span_length - 1) and add pad tokens at the END
            num_removed = span_length - 1
            if num_removed > 0:
                padding = torch.full((num_removed,), self.pad_token_id, dtype=seq.dtype)
                new_seq = torch.cat([new_seq, padding])
            # Ensure the new sequence has the same length as the original
            if new_seq.size(0) != L:
                if new_seq.size(0) > L:
                    new_seq = new_seq[:L]
                else:
                    pad_extra = torch.full((L - new_seq.size(0),), self.pad_token_id, dtype=seq.dtype)
                    new_seq = torch.cat([new_seq, pad_extra])
            inputs[i] = new_seq

        batch["input_ids"] = inputs
        batch["labels"] = labels
        return batch

collator = DataCollatorForInfilling(
    pad_token_id=tokenizer.pad_token_id,
    mask_token_id=mask_token_id,
    poisson_lambda=15,
    copy_inputs_as_labels=True,
    shift_labels=True
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=collator)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collator)

sample = next(iter(train_loader))

print("Input IDs: ", sample['input_ids'][0])
print("-------------------")

print("Labels: ", sample['labels'][0])
print("-------------------")

print("Attention Mask: ", sample['attention_mask'][0])
print("-------------------")

L_input, L_label, L_attention = sample['input_ids'][0], sample['labels'][0], sample['attention_mask'][0]

# Loop to check masked positions
for i in range(len(L_input)):
    if L_input[i] == mask_token_id:
        print("Input: ", L_input[i], "Label: ", L_label[i], "Attention Mask: ", L_attention[i])


Inpus ID :  tensor([ 1988,  5920,  4992,  8891,   563,  2702,   563,  1583, 15900,   561,
         1010,   515,  1599,  2247,  5111, 14773,  1451,  3251,  3245,  5413,
          559,  3086, 16772,  4514, 17712, 13090,  6576,   748,  1031,  7039,
          571,  1877,  6113,  7349,  8651, 12964,   607,  1262,  3505,  3132,
          530,  4555,   547,  3065,  7940,  7483,   524,  2116,   522,  1529,
         2671, 17614,  3172, 13894,  1859,  4165,  1103,  1066,   788,   472,
         7296,  6463,  5064,  6044,  8856,   593, 14807,   607,  2225,   595,
         1045,  4990,  8618,  6706,  5064,   593,  1042,   628,  2756,   530,
          987,  5078,  5166,   504,  1121,   560,  1445,   674,  1934,   499,
         1216,  4287,  7929,  7249,  5475,  3380,   608,  1975,  8208,  3348,
         2822,  1190,  4546, 14137,  2364,   421,  9024,  1968,   434, 13066,
         1850,  3310,  1621,   434,  8709,   996,   431,     3,   547,  2637,
         5272,  7394,  5052,   506,   989,  6402,   

In [25]:
#############################
# Définition du modèle BART de base (non pré-entraîné)
#############################
from transformers import BartConfig, BartForConditionalGeneration

config = BartConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=1024,
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    d_model=512,
    bos_token_id=bos_token_id,
    eos_token_id=eos_token_id,
    pad_token_id=pad_token_id,
    mask_token_id=mask_token_id
)
#model = BartForConditionalGeneration(config)
#model = model.to(device)

model = BartForConditionalGeneration.from_pretrained("Richatte2000/model-from-trained-trained-epoch-0").to(device)



config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/323M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [28]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=8e-4, weight_decay=0.01)
num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)




In [None]:
#############################
# Boucle d'entraînement avec évaluation sur la validation et push sur Hugging Face
#############################
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in progress_bar:
        # Déplacement des données vers le device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        # Clipping des gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        total_train_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})
    
    avg_train_loss = total_train_loss / len(train_loader)
    
    # Évaluation sur le jeu de validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_val_loss += outputs.loss.item()
    avg_val_loss = total_val_loss / len(val_loader)
    
    print(f"\nEpoch {epoch+1} terminé : Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f}\n")
    
    # Push du modèle sur Hugging Face Hub
    # Remplacez "USERNAME/REPO_NAME" et "YOUR_TOKEN" par vos identifiants et token.
    model.push_to_hub("Richatte2000/model-mask-one-epoch-"+str(epoch), commit_message=f"Epoch {epoch+1}")
    model.save_pretrained("model-trained-epoch-"+str(epoch))


Epoch 1/10: 100%|██████████| 4615/4615 [2:06:40<00:00,  1.65s/it, loss=0.968]  



Epoch 1 terminé : Train Loss = 1.7935 | Val Loss = 0.8704





README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/323M [00:00<?, ?B/s]

Epoch 2/10:  85%|████████▌ | 3944/4615 [1:48:11<18:17,  1.64s/it, loss=0.363]  