# **Import libraries**

In [4]:
!pip install mido



In [5]:
import mido # easy to use python MIDI library
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import random
import pandas as pd

from mido import MidiFile, MidiTrack, Message

from sklearn import model_selection

import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

# **Hyperparameters**

In [16]:
# --- Optimisation ---
num_epochs, batch_size = 100, 1024
learning_rate = 0.05

# --- Input window and embeddings ---
sequence_length = 16                          # number of context events fed to the model
embedding_dim_note = embedding_dim_time = 128

# --- Recurrent core ---
hidden_size, num_layers = 512, 1

# --- Output vocabularies ---
num_classes_note = 128                        # presumably MIDI pitches 0..127 -- TODO confirm
num_classes_time = 18                         # time buckets 0..17 (bucketed times are capped at 17)

# **Load data**

In [8]:
# The CSV files have no header row; every row is read as plain values and
# exposed both as a DataFrame and as the underlying NumPy array.
df_train = pd.read_csv('train_note_time.csv', header=None)
array_train = df_train.values

df_val = pd.read_csv('val_note_time.csv', header=None)
array_val = df_val.values

In [9]:
# Training batches are drawn in a fresh random order every epoch.
train_loader = torch.utils.data.DataLoader(dataset=array_train,
                                           batch_size=batch_size,
                                           shuffle=True)

# Accuracy does not depend on batch order, so the validation set is read
# sequentially; this avoids needless shuffling work and RNG consumption.
val_loader = torch.utils.data.DataLoader(dataset=array_val,
                                         batch_size=batch_size,
                                         shuffle=False)

# **Models**

In [10]:
# RNN architecture
class RNN(nn.Module):
  """Two-headed next-event predictor built on ``nn.RNN``.

  Note and time tokens are embedded separately, concatenated along the feature
  axis and fed through the recurrent stack. The output at the last sequence
  position is passed to two independent two-layer heads that return logits over
  the note classes and over the time classes.

  Args:
    num_classes_note: size of the note vocabulary and of the note logits.
    num_classes_time: size of the time vocabulary and of the time logits.
    embedding_dim_note: width of the note embedding.
    embedding_dim_time: width of the time embedding.
    hidden_size: hidden width of the recurrent stack and of both heads.
    num_layers: number of stacked recurrent layers.
    drop_prob: dropout probability handed to the recurrent module.
    drop_fc: dropout probability used inside the time head.
  """

  def __init__(self, num_classes_note, num_classes_time, embedding_dim_note, embedding_dim_time, hidden_size, num_layers, drop_prob=0., drop_fc=0.):
    super().__init__()

    self.embedding_note = nn.Embedding(num_classes_note, embedding_dim_note)
    self.embedding_time = nn.Embedding(num_classes_time, embedding_dim_time)

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # The attribute keeps the name `lstm` (although it is a plain RNN) so that
    # attribute access and state-dict keys stay unchanged.
    self.lstm = nn.RNN(embedding_dim_note + embedding_dim_time, hidden_size, num_layers, dropout=drop_prob, batch_first=True)

    self.fc_note_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_note_2 = nn.Linear(hidden_size, num_classes_note)

    self.fc_time_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_time_2 = nn.Linear(hidden_size, num_classes_time)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_fc)

  def forward(self, inputs):
    """Return ``(note_logits, time_logits)`` for a ``(notes, time)`` pair of index tensors.

    Both inputs are indexed as (batch, sequence); each output has one row of
    logits per batch element.
    """
    notes, time = inputs

    # Embedding layers: each output has shape (batch, sequence_length, embedding_dim)
    embeddings_note = self.embedding_note(notes)
    embeddings_time = self.embedding_time(time)
    x = torch.cat((embeddings_note, embeddings_time), dim=2)

    # Zero initial hidden state, created on the same device/dtype as the input
    # so the module does not depend on a notebook-level `device` variable.
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)

    # Forward propagate the RNN; out has shape (batch_size, seq_length, hidden_size)
    out, _ = self.lstm(x, h0)
    out = out[:, -1, :]  # output at the last element of the sequence

    # Note head
    note = self.fc_note_1(out)
    note = self.relu(note)
    note = self.fc_note_2(note)

    # Time head
    # NOTE(review): dropout is applied in the time head only -- confirm intended.
    time = self.fc_time_1(out)
    time = self.dropout(time)
    time = self.relu(time)
    time = self.fc_time_2(time)

    return note, time

In [11]:
# GRU architecture
class GRU(nn.Module):
  """Two-headed next-event predictor built on ``nn.GRU``.

  Note and time tokens are embedded separately, concatenated along the feature
  axis and fed through the recurrent stack. The output at the last sequence
  position is passed to two independent two-layer heads that return logits over
  the note classes and over the time classes.

  Args:
    num_classes_note: size of the note vocabulary and of the note logits.
    num_classes_time: size of the time vocabulary and of the time logits.
    embedding_dim_note: width of the note embedding.
    embedding_dim_time: width of the time embedding.
    hidden_size: hidden width of the recurrent stack and of both heads.
    num_layers: number of stacked recurrent layers.
    drop_prob: dropout probability handed to the recurrent module.
    drop_fc: dropout probability used inside the time head.
  """

  def __init__(self, num_classes_note, num_classes_time, embedding_dim_note, embedding_dim_time, hidden_size, num_layers, drop_prob=0., drop_fc=0.):
    super().__init__()

    self.embedding_note = nn.Embedding(num_classes_note, embedding_dim_note)
    self.embedding_time = nn.Embedding(num_classes_time, embedding_dim_time)

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # The attribute keeps the name `lstm` (although it is a GRU) so that
    # attribute access and state-dict keys stay unchanged.
    self.lstm = nn.GRU(embedding_dim_note + embedding_dim_time, hidden_size, num_layers, dropout=drop_prob, batch_first=True)

    self.fc_note_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_note_2 = nn.Linear(hidden_size, num_classes_note)

    self.fc_time_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_time_2 = nn.Linear(hidden_size, num_classes_time)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_fc)

  def forward(self, inputs):
    """Return ``(note_logits, time_logits)`` for a ``(notes, time)`` pair of index tensors.

    Both inputs are indexed as (batch, sequence); each output has one row of
    logits per batch element.
    """
    notes, time = inputs

    # Embedding layers: each output has shape (batch, sequence_length, embedding_dim)
    embeddings_note = self.embedding_note(notes)
    embeddings_time = self.embedding_time(time)
    x = torch.cat((embeddings_note, embeddings_time), dim=2)

    # Zero initial hidden state, created on the same device/dtype as the input
    # so the module does not depend on a notebook-level `device` variable.
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)

    # Forward propagate the GRU; out has shape (batch_size, seq_length, hidden_size)
    out, _ = self.lstm(x, h0)
    out = out[:, -1, :]  # output at the last element of the sequence

    # Note head
    note = self.fc_note_1(out)
    note = self.relu(note)
    note = self.fc_note_2(note)

    # Time head
    # NOTE(review): dropout is applied in the time head only -- confirm intended.
    time = self.fc_time_1(out)
    time = self.dropout(time)
    time = self.relu(time)
    time = self.fc_time_2(time)

    return note, time

In [12]:
# LSTM architecture
class LSTM(nn.Module):
  """Two-headed next-event predictor built on ``nn.LSTM``.

  Note and time tokens are embedded separately, concatenated along the feature
  axis and fed through the recurrent stack. The output at the last sequence
  position is passed to two independent two-layer heads that return logits over
  the note classes and over the time classes.

  Args:
    num_classes_note: size of the note vocabulary and of the note logits.
    num_classes_time: size of the time vocabulary and of the time logits.
    embedding_dim_note: width of the note embedding.
    embedding_dim_time: width of the time embedding.
    hidden_size: hidden width of the recurrent stack and of both heads.
    num_layers: number of stacked recurrent layers.
    drop_prob: dropout probability handed to the recurrent module.
    drop_fc: dropout probability used inside the time head.
  """

  def __init__(self, num_classes_note, num_classes_time, embedding_dim_note, embedding_dim_time, hidden_size, num_layers, drop_prob=0., drop_fc=0.):
    super().__init__()

    self.embedding_note = nn.Embedding(num_classes_note, embedding_dim_note)
    self.embedding_time = nn.Embedding(num_classes_time, embedding_dim_time)

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(embedding_dim_note + embedding_dim_time, hidden_size, num_layers, dropout=drop_prob, batch_first=True)

    self.fc_note_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_note_2 = nn.Linear(hidden_size, num_classes_note)

    self.fc_time_1 = nn.Linear(hidden_size, hidden_size)
    self.fc_time_2 = nn.Linear(hidden_size, num_classes_time)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_fc)

  def forward(self, inputs):
    """Return ``(note_logits, time_logits)`` for a ``(notes, time)`` pair of index tensors.

    Both inputs are indexed as (batch, sequence); each output has one row of
    logits per batch element.
    """
    notes, time = inputs

    # Embedding layers: each output has shape (batch, sequence_length, embedding_dim)
    embeddings_note = self.embedding_note(notes)
    embeddings_time = self.embedding_time(time)
    x = torch.cat((embeddings_note, embeddings_time), dim=2)

    # Zero initial hidden and cell states, created on the same device/dtype as
    # the input so the module does not depend on a notebook-level `device`.
    state_shape = (self.num_layers, x.size(0), self.hidden_size)
    h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
    c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

    # Forward propagate the LSTM; out has shape (batch_size, seq_length, hidden_size)
    out, _ = self.lstm(x, (h0, c0))
    out = out[:, -1, :]  # output at the last element of the sequence

    # Note head
    note = self.fc_note_1(out)
    note = self.relu(note)
    note = self.fc_note_2(note)

    # Time head
    # NOTE(review): dropout is applied in the time head only -- confirm intended.
    time = self.fc_time_1(out)
    time = self.dropout(time)
    time = self.relu(time)
    time = self.fc_time_2(time)

    return note, time

# **Training**

## Accuracy 

In [17]:
def validate_model(model, loader):
    """Compute top-1 accuracy (in percent) of the note and time heads over `loader`.

    Each batch is reshaped to (batch, steps, 3); column 0 of the last axis is
    read as the note and column 2 as the raw time (column 1 is not used here).
    The first 16 steps are the model input and step 16 is the target.

    Args:
        model: module called as ``model((notes, times))`` and returning
            ``(note_logits, time_logits)``.
        loader: iterable of 2-D integer tensors, one row per sample.

    Returns:
        ``(accuracy_note, accuracy_time)`` as percentages.

    Raises:
        ValueError: if `loader` yields no samples.

    Note:
        The model is switched to eval mode and is left in eval mode on return.
    """
    # Send inputs to wherever the model lives rather than relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    correct_note = 0
    correct_time = 0
    total = 0

    model.eval()
    with torch.no_grad():
        for batch in loader:
            batch = torch.reshape(batch, (batch.shape[0], -1, 3))
            notes = batch[:, :, 0]
            time = batch[:, :, 2]

            notes_sequence = notes[:, :16].to(model_device)
            notes_target = notes[:, 16].to(model_device)

            # Bucket raw times: for non-negative integers this equals t / 60
            # rounded to the nearest integer (halves up), capped at class 17.
            time = (time * 2) // 60 - (time // 60)
            time = torch.clamp(time, max=17)
            time_sequence = time[:, :16].to(model_device)
            time_target = time[:, 16].to(model_device)

            note_logits, time_logits = model((notes_sequence, time_sequence))

            # Accuracy note
            correct_note += (note_logits.argmax(dim=1) == notes_target).sum().item()
            total += notes_target.size(0)

            # Accuracy time
            correct_time += (time_logits.argmax(dim=1) == time_target).sum().item()

    if total == 0:
        raise ValueError('validate_model received no samples; cannot compute accuracy')

    accuracy_note = 100 * correct_note / total
    accuracy_time = 100 * correct_time / total

    return accuracy_note, accuracy_time

## Training loop

In [18]:
def train_model(model, optimizer, train_loader, val_loader, num_epochs, lr_scheduler=None, display_loss=False):
  """Train a two-headed note/time model and print accuracy after every epoch.

  Each batch is reshaped to (batch, steps, 3); column 0 of the last axis is read
  as the note and column 2 as the raw time (column 1 is not used here). The
  first 16 steps are the model input and step 16 is the target. The optimised
  loss is the sum of the cross-entropy losses of the two heads.

  Args:
    model: module called as ``model((notes, times))`` and returning
      ``(note_logits, time_logits)``.
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of 2-D integer training batches.
    val_loader: iterable of 2-D integer validation batches.
    num_epochs: number of passes over `train_loader`.
    lr_scheduler: when equal to ``'multi_steps'``, every parameter group's
      learning rate is multiplied by 0.1 once, at epoch ``int(num_epochs * 0.5)``.
    display_loss: when true, print the combined loss every 300 steps.

  Returns:
    None. Progress is reported with ``print``.
  """
  criterion_note = nn.CrossEntropyLoss()
  criterion_time = nn.CrossEntropyLoss()

  # Send inputs to wherever the model lives rather than relying on a
  # notebook-level `device` variable.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  best_val_accuracy_note = 0
  best_val_accuracy_time = 0
  best_epoch_note = 0
  # This name used to be misspelled, which left it unbound (NameError in the
  # summary print) whenever the time accuracy never rose above 0.
  best_epoch_time = 0

  for epoch in range(num_epochs):

    model.train()

    #### UPDATE LEARNING RATE ####
    if lr_scheduler == 'multi_steps' and epoch == int(num_epochs * 0.5):
      for param_group in optimizer.param_groups:
        param_group['lr'] *= 0.1

    for i, batch in enumerate(train_loader):
      batch = torch.reshape(batch, (batch.shape[0], -1, 3))
      notes = batch[:, :, 0]
      time = batch[:, :, 2]

      notes_sequence = notes[:, :16].to(model_device)
      notes_target = notes[:, 16].to(model_device)

      # Bucket raw times: for non-negative integers this equals t / 60 rounded
      # to the nearest integer (halves up), capped at class 17.
      time = (time * 2) // 60 - (time // 60)
      time = torch.clamp(time, max=17)
      time_sequence = time[:, :16].to(model_device)
      time_target = time[:, 16].to(model_device)

      inputs = (notes_sequence, time_sequence)

      optimizer.zero_grad()
      note, time = model(inputs)

      # CrossEntropyLoss already averages over the batch, so no extra mean is needed.
      loss_note = criterion_note(note, notes_target)
      loss_time = criterion_time(time, time_target)
      loss = loss_note + loss_time

      loss.backward()
      optimizer.step()

      if i % 300 == 0 and display_loss:
        print(f'Epoch : {epoch}, Step: {i}, Loss: {round(loss.item(), 2)}')

    # Train accuracy
    train_accuracy_note, train_accuracy_time = validate_model(model, train_loader)
    train_accuracy_note = round(train_accuracy_note, 2)
    train_accuracy_time = round(train_accuracy_time, 2)

    # Val accuracy
    val_accuracy_note, val_accuracy_time = validate_model(model, val_loader)
    val_accuracy_note = round(val_accuracy_note, 2)
    val_accuracy_time = round(val_accuracy_time, 2)

    if val_accuracy_note > best_val_accuracy_note:
      best_val_accuracy_note = val_accuracy_note
      best_epoch_note = epoch

    if val_accuracy_time > best_val_accuracy_time:
      best_val_accuracy_time = val_accuracy_time
      best_epoch_time = epoch

    # The reported losses are those of the last training batch of the epoch.
    print('################')
    print(f'Epoch: {epoch}, Loss note: {round(loss_note.item(), 2)}, Loss time: {round(loss_time.item(), 2)}')
    print(f'Epoch : {epoch}, Train accuracy note : {train_accuracy_note} %, Val accuracy note : {val_accuracy_note} %')
    print(f'Best val accuracy at epoch {best_epoch_note}: {best_val_accuracy_note} %')
    print('------')
    print(f'Epoch : {epoch}, Train accuracy time : {train_accuracy_time} %, Val accuracy time : {val_accuracy_time} %')
    print(f'Best val accuracy at epoch {best_epoch_time}: {best_val_accuracy_time} %')

# **Experiments**

In [19]:
# Vanilla RNN run (marked "best result" by the author): 3 stacked layers,
# SGD with Nesterov momentum, learning rate cut 10x at the halfway epoch.
model = RNN(
    num_classes_note,
    num_classes_time,
    embedding_dim_note=128,
    embedding_dim_time=128,
    hidden_size=512,
    num_layers=3,
).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9, nesterov=True)

train_model(
    model,
    optimizer,
    train_loader,
    val_loader,
    num_epochs=100,
    lr_scheduler='multi_steps',
)

In [None]:
# GRU run (marked "best result" by the author): 3 stacked layers,
# SGD with Nesterov momentum, learning rate cut 10x at the halfway epoch.
model = GRU(
    num_classes_note,
    num_classes_time,
    embedding_dim_note=128,
    embedding_dim_time=128,
    hidden_size=512,
    num_layers=3,
).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9, nesterov=True)

train_model(
    model,
    optimizer,
    train_loader,
    val_loader,
    num_epochs=100,
    lr_scheduler='multi_steps',
)

In [None]:
# LSTM run (marked "best result" by the author): 3 stacked layers,
# SGD with Nesterov momentum, learning rate cut 10x at the halfway epoch.
model = LSTM(
    num_classes_note,
    num_classes_time,
    embedding_dim_note=128,
    embedding_dim_time=128,
    hidden_size=512,
    num_layers=3,
).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9, nesterov=True)

train_model(
    model,
    optimizer,
    train_loader,
    val_loader,
    num_epochs=100,
    lr_scheduler='multi_steps',
)

# **Music generation**

## Mido utils

In [22]:
def data_to_track(data):
    """Build a ``MidiTrack`` holding one ``note_on`` message per row of `data`.

    Every row is indexed as ``row[0]`` (note), ``row[1]`` (velocity) and
    ``row[2]`` (time); all messages are emitted on channel 0.
    """
    midi_track = MidiTrack()
    midi_track.extend(
        Message('note_on', channel=0, note=row[0], velocity=row[1], time=row[2])
        for row in data
    )
    return midi_track
    

In [23]:
def save_track(track, path):
    """Write `track` to `path` as the only track of a new ``MidiFile``."""
    midi_file = MidiFile()
    midi_file.tracks.append(track)
    midi_file.save(path)

## Generation

In [26]:
# Random generation: sampling settings and the seed event.
n_predictions = 1000           # number of events to sample
temp_note = temp_time = 1.5    # logits are divided by these before the softmax
initial_note = 60              # note that seeds the sequence
velocity = 50                  # the same velocity is written for every event

# Histories fed back into the model, plus the (note, velocity, time) rows
# that are later turned into a MIDI track.
list_notes, list_times = [initial_note], [0]
data = [[initial_note, velocity, 0]]

In [27]:
# Autoregressive sampling: feed the last (up to) 16 generated events back into
# the model and draw the next note / time bucket from the tempered softmax.
model.eval()  # make sure dropout is disabled while sampling
with torch.no_grad():  # no gradients are needed for generation
  for i in range(n_predictions):
    list_notes_input = list_notes[-16:]
    list_times_input = list_times[-16:]

    notes_input = torch.reshape(torch.tensor(list_notes_input), (1, -1)).to(device)
    times_input = torch.reshape(torch.tensor(list_times_input), (1, -1)).to(device)

    # Same bucketing as in training: t / 60 rounded to the nearest integer
    # (halves up) for non-negative integers, capped at class 17.
    times_input = (times_input * 2) // 60 - (times_input // 60)
    times_input = torch.clamp(times_input, max=17)

    inputs = (notes_input, times_input)

    note, time = model(inputs)

    # Sample note
    array_proba_note = torch.softmax(note / temp_note, 1).cpu().numpy()[0]
    note_sampled = int(np.random.choice(range(num_classes_note), p=array_proba_note))

    # Sample time
    array_proba_time = torch.softmax(time / temp_time, 1).cpu().numpy()[0]
    time_sampled = int(np.random.choice(range(num_classes_time), p=array_proba_time))

    # Convert the sampled bucket back to a raw time value (bucket * 60).
    final_time = time_sampled * 60

    # Plain Python ints are stored so the histories and the MIDI rows do not
    # mix NumPy scalars with built-in integers.
    data.append([note_sampled, velocity, final_time])

    list_notes.append(note_sampled)
    list_times.append(final_time)

In [28]:
# Turn the generated (note, velocity, time) rows into a track and write it to disk.
track = data_to_track(data)
save_track(track, path='file.mid')