First baseline approach, in which we perform multilabel classification at each timestep. It differs from lstm.ipynb and lstm2.ipynb in its approach to training: here I create big batches and do not backpropagate after processing every single sequence.

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn

import torch.utils.data as data
import os
import numpy as np
from tqdm import tqdm

import pypianoroll

In [2]:
#Some constants shared by the cells below
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 0.001
TRAIN_BATCH_SIZE = 60
VAL_BATCH_SIZE = 30
DATA_PATH = '../data/Nottingham/' # I was using a very small dataset here, just to check whether this approach works without
#spending hours on training only to find out that the approach is wrong
NUM_EPOCHS = 5
POSITIVE_WEIGHT = 2 # A note is more likely to be silent than played, so I've introduced a small positive weight so that the model
#doesn't converge to predicting all 0's

In [3]:
def path_to_pianoroll(path, resolution = 8):
    """Read a MIDI file and return its binarised 88-key piano roll.

    Args:
        path: location of the MIDI file, handed to ``pypianoroll.read``.
        resolution: time resolution handed to ``pypianoroll.read``. A bigger
            value gives a more detailed representation but also longer
            sequences, which may be hard for the LSTM to handle.

    Returns:
        Array indexed as [length_of_sequence, number_of_notes] in which every
        entry greater than 0 has been replaced by 1 (multilabel target per step).
    """
    multitrack = pypianoroll.read(path, resolution=resolution)

    #Blend all tracks together and keep only columns 21..108, i.e. the 88 useful notes
    blended = multitrack.blend()[:, 21:109]

    #Multilabel classification at each step needs a binary roll: any positive value becomes 1,
    #everything else is kept as it was
    binary_roll = np.where(blended > 0, 1, blended).astype(blended.dtype)

    return binary_roll
    

In [4]:
def collate(batch):
    """Collate function for the DataLoader: decides how a batch is assembled.

    Args:
        batch: list of (input, target) tuples, one per sample.

    Returns:
        Two-element list [packed_inputs, packed_targets]. Both are built with
        ``pack_sequence(..., enforce_sorted=False)``, so the variable-length
        sequences need neither padding nor pre-sorting by length.
    """
    #Position 0 of every sample is the input, position 1 is the target;
    #pack each group separately, inputs first
    return [
        nn.utils.rnn.pack_sequence([sample[position] for sample in batch], enforce_sorted=False)
        for position in (0, 1)
    ]

In [6]:
class NotesGenerationDataset(data.Dataset):
    """Dataset of piano rolls, one sample per file found under ``path``.

    The text representation is skipped on purpose: working directly on the
    piano roll is less work, and it can still be converted to text if needed.
    """

    def __init__(self, path):
        """Recursively collect every file below ``path``.

        No validation happens here: every file is assumed to be a valid MIDI file.
        """
        self.path = path
        self.full_filenames = [
            os.path.join(folder, name)
            for folder, _, names in os.walk(path)
            for name in names
        ]

    def __len__(self):
        """Number of files collected at construction time."""
        return len(self.full_filenames)

    def __getitem__(self, index):
        """Return the (input, ground_truth) float32 tensors for file ``index``.

        Both come from the same piano roll and are shifted by one step with
        respect to one another, so step ``t`` of the input is paired with
        step ``t + 1`` of the roll as its target.
        """
        roll = path_to_pianoroll(self.full_filenames[index])

        inputs = torch.tensor(roll[:-1, :], dtype=torch.float32)
        targets = torch.tensor(roll[1:, :], dtype=torch.float32)
        return inputs, targets

In [8]:
#Training split: every file found under DATA_PATH/train
trainset = NotesGenerationDataset(os.path.join(DATA_PATH, "train"))

#Of course we want a big batch_size. However, one training sample takes quite a lot of memory.
#We will use torch.cuda.amp.autocast() during training so that we can make bigger batches
trainset_loader = torch.utils.data.DataLoader(trainset, batch_size=TRAIN_BATCH_SIZE,
                                              shuffle=True, drop_last=True, collate_fn=collate)

#Validation split: every file found under DATA_PATH/valid
valset = NotesGenerationDataset(os.path.join(DATA_PATH, "valid"))

#Validation batches are neither shuffled nor dropped, so every validation file is evaluated
valset_loader = torch.utils.data.DataLoader(valset, batch_size=VAL_BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=collate)

In [9]:
#Small sanity check that the train and validation sets do not share any file.
#Note that this compares full paths only, so the same song stored under two different paths is not detected.
train_songs = set(trainset.full_filenames)
for song in valset.full_filenames:
    assert not song in train_songs

In [10]:
class RNN(nn.Module):
    """Note encoder -> stacked LSTM -> per-note logits.

    The network predicts, for every timestep, one logit per note (multilabel
    classification), to be used with a sigmoid / BCE-with-logits loss.

    Args:
        input_size: number of different notes in one input timestep.
        hidden_size: width of the note encoding and of every LSTM layer.
        num_classes: number of logits produced per timestep.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_classes, n_layers=2):
        super().__init__()

        self.input_size = input_size  # amount of different notes
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.n_layers = n_layers

        #First we need a layer that encodes our sparse 0/1 note vector into a better representation
        self.notes_encoder = nn.Linear(in_features=input_size, out_features=hidden_size)

        #batch_first is left at its default, so plain tensors are [seq_len, batch, features]
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)

        #At the end we want to get a vector with the logits of all notes
        self.logits_fc = nn.Linear(hidden_size, num_classes)

    def forward(self, inp, hidden=None):
        """Run the network on a packed batch (training) or a plain tensor (inference).

        Args:
            inp: either a ``PackedSequence`` of note vectors or a tensor laid
                out as [seq_len, batch, input_size].
            hidden: optional LSTM state ``(h, c)`` to start from, given in
                batch order; ``None`` starts from zeros.

        Returns:
            ``(logits, hidden)``. For packed input, ``logits`` is the flat
            [total_steps, num_classes] tensor in the same packed order as
            ``inp.data`` (so it lines up with the ``.data`` of a target packed
            from sequences of the same lengths). For tensor input it is
            [seq_len, batch, num_classes]. ``hidden`` is the final LSTM state
            in batch order.
        """
        if isinstance(inp, nn.utils.rnn.PackedSequence):
            #PackedSequence.data is the flat tensor [samples, num_of_notes], so the Linear layer
            #can be applied to it directly without padding anything.
            notes_encoded = self.notes_encoder(inp.data)

            #Rebuilding a PackedSequence by hand is not recommended by the PyTorch documentation,
            #but it saves us from creating padded sequences. The permutation indices MUST be
            #carried over: batches are packed with enforce_sorted=False, and without the indices
            #the LSTM would neither permute a supplied `hidden` nor return the final state
            #in the original batch order.
            rnn_in = nn.utils.rnn.PackedSequence(
                notes_encoded,
                inp.batch_sizes,
                inp.sorted_indices,
                inp.unsorted_indices,
            )
            outputs, hidden = self.lstm(rnn_in, hidden)

            #Again we go from PackedSequence to a flat tensor
            logits = self.logits_fc(outputs.data)
        else:
            #With a tensor at the input this is pretty straightforward
            notes_encoded = self.notes_encoder(inp)
            outputs, hidden = self.lstm(notes_encoded, hidden)
            logits = self.logits_fc(outputs)

        return logits, hidden

In [11]:
#Sanity check about PackedSequences: rebuilding a PackedSequence from the data and batch_sizes of an existing one
#should give back the same data.
#NOTE(review): inp2 is constructed from inp.data itself, so the assert below holds by construction; it does not
#compare sorted_indices / unsorted_indices.
inp, targets = next(iter(trainset_loader))

batch_sizes = inp.batch_sizes
inp2 = nn.utils.rnn.PackedSequence(inp.data, batch_sizes)
assert torch.all(torch.eq(inp.data, inp2.data))



In [12]:
#88 notes in, 88 logits out, 256 hidden units
rnn = RNN(input_size=88, hidden_size=256, num_classes=88)
rnn = rnn.to(DEVICE)

#Multilabel loss on raw logits; every note gets the same positive weight POSITIVE_WEIGHT.
#NOTE(review): POSITIVE_WEIGHT is a Python int, so the dtype of this tensor depends on how the installed
#torch version infers it in torch.full - confirm it is the intended floating-point weight.
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.full((88,), POSITIVE_WEIGHT, device=DEVICE))

optimizer = torch.optim.Adam(rnn.parameters(), lr=LEARNING_RATE)

#Gradient scaler used together with torch.cuda.amp.autocast() in the training loop
scaler = torch.cuda.amp.GradScaler()

In [13]:
def validate():
    """Evaluate the global ``rnn`` on ``valset_loader`` and return the mean batch loss.

    The model is put into eval mode for the pass and switched back to train
    mode before returning. The result is the plain average of the per-batch
    ``criterion`` values (batches are not weighted by their size).
    """
    rnn.eval()
    progress = tqdm(valset_loader, leave=True)
    batch_losses = []

    with torch.no_grad():
        for packed_inp, packed_target in progress:
            packed_inp = packed_inp.to(DEVICE)
            packed_target = packed_target.to(DEVICE)

            logits, _ = rnn(packed_inp)

            #The logits are a flat tensor, so they are compared with the flat .data of the packed target
            batch_loss = criterion(logits, packed_target.data).item()

            batch_losses.append(batch_loss)
            progress.set_postfix(loss=batch_loss)

    rnn.train()
    return sum(batch_losses) / len(batch_losses)

In [14]:
clip = 1.0 #Batch normalization is tricky to implement with RNNs, so instead we use gradient clipping.
#clip_grad_norm_ clips by the total gradient norm, i.e. it rescales all gradients together rather than cutting single values
best_val_loss = float("inf")

#Per-epoch mean training loss and validation loss
loss_list = []
val_list = []

for epoch_number in range(NUM_EPOCHS):
    loop = tqdm(trainset_loader, leave=True)
    losses = []
    for idx, (inp, target) in enumerate(loop):
        
        inp, target = inp.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad() # remember to do this every time so that gradients do not accumulate

        #Forward pass and loss under autocast (mixed precision)
        with torch.cuda.amp.autocast(): 
            logits, _ = rnn(inp)
            #logits are flat, so they are compared with the flat .data of the packed target
            loss = criterion(logits, target.data)
             
        scaler.scale(loss).backward()
        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
        torch.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
        
        scaler.step(optimizer)
        scaler.update()
    
        
        losses.append(loss.item())
        loop.set_postfix(loss=loss.item())
    
    #Mean of the per-batch losses of this epoch
    train_loss = sum(losses)/len(losses)
    loss_list.append(train_loss)
    current_val_loss = validate()
    val_list.append(current_val_loss)
    
    print(f"Epoch {epoch_number}, train_loss: {train_loss}, val_loss: {current_val_loss}")
    #Keep only the weights with the best validation loss seen so far
    if current_val_loss < best_val_loss:
        
        torch.save(rnn.state_dict(), 'music_rnn.pth')
        best_val_loss = current_val_loss

100%|██████████| 11/11 [00:55<00:00,  5.05s/it, loss=0.238]
100%|██████████| 6/6 [00:29<00:00,  4.87s/it, loss=0.228]


Epoch 0, train_loss: 0.5099352300167084, val_loss: 0.2149465282758077


100%|██████████| 11/11 [00:39<00:00,  3.55s/it, loss=0.161]
100%|██████████| 6/6 [00:07<00:00,  1.32s/it, loss=0.166]


Epoch 1, train_loss: 0.17619910023429178, val_loss: 0.15660164753595987


100%|██████████| 11/11 [00:30<00:00,  2.74s/it, loss=0.149]
100%|██████████| 6/6 [00:07<00:00,  1.21s/it, loss=0.161]


Epoch 2, train_loss: 0.1536434834653681, val_loss: 0.15122093260288239


100%|██████████| 11/11 [00:30<00:00,  2.79s/it, loss=0.151]
100%|██████████| 6/6 [00:08<00:00,  1.37s/it, loss=0.158]


Epoch 3, train_loss: 0.15057085183533755, val_loss: 0.14899204423030218


100%|██████████| 11/11 [00:31<00:00,  2.86s/it, loss=0.149]
100%|██████████| 6/6 [00:07<00:00,  1.29s/it, loss=0.158]

Epoch 4, train_loss: 0.1496228358962319, val_loss: 0.14815168579419455





In [15]:
def sample_from_piano_rnn(sample_length=4, temperature=1, starting_sequence=None):
    """Sample a binary piano roll from the global ``rnn``, one timestep at a time.

    Args:
        sample_length: number of timesteps to generate after the starting sequence.
        temperature: divisor applied to the logits before the sigmoid; must be
            greater than 0. Values below 1 make the note probabilities more
            peaked, values above 1 make them more uniform.
        starting_sequence: optional priming piano roll, array-like or tensor
            laid out as [timesteps, num_of_notes] (or
            [timesteps, 1, num_of_notes]). When ``None``, a single default
            timestep with notes 40, 50 and 56 switched on is used.

    Returns:
        numpy array [len(starting sequence) + sample_length, num_of_notes]
        holding the starting sequence followed by the sampled timesteps.

    Raises:
        ValueError: if ``temperature`` is not positive or ``starting_sequence``
            has an unsupported shape.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")

    if starting_sequence is None:
        #Set some default starting sequence if none was given
        current_sequence_input = torch.zeros(1, 1, 88, dtype=torch.float32, device=DEVICE)
        current_sequence_input[0, 0, [40, 50, 56]] = 1
    else:
        current_sequence_input = torch.as_tensor(
            starting_sequence, dtype=torch.float32, device=DEVICE
        )
        if current_sequence_input.dim() == 2:
            #[timesteps, notes] -> [timesteps, batch=1, notes], the layout the LSTM expects
            current_sequence_input = current_sequence_input.unsqueeze(1)
        if (
            current_sequence_input.dim() != 3
            or current_sequence_input.shape[0] == 0
            or current_sequence_input.shape[1] != 1
        ):
            raise ValueError(
                "starting_sequence must have shape [timesteps, num_of_notes] "
                "or [timesteps, 1, num_of_notes] with at least one timestep"
            )

    final_output_sequence = [current_sequence_input.squeeze(1)]

    hidden = None
    with torch.no_grad():
        for _ in range(sample_length):

            output, hidden = rnn(current_sequence_input, hidden)

            #Only the last step predicts the next frame. This matters on the first iteration when
            #a longer starting sequence was fed in; afterwards the input is a single timestep.
            last_logits = output[-1:]

            #Dividing by the temperature before the sigmoid makes the probabilities either more
            #peaked or more uniform, because the sigmoid is not linear w.r.t. its input.
            probabilities = torch.sigmoid(last_logits.div(temperature))

            #One row [P(note off), P(note on)] per note -> tensor [num_of_notes, 2]
            dist = torch.stack((1 - probabilities, probabilities), dim=-1).reshape(-1, 2)

            #multinomial gives [num_of_notes, 1]; the next input has to be [1, 1, num_of_notes]
            current_sequence_input = torch.multinomial(dist, 1).reshape(1, 1, -1).to(torch.float32)

            final_output_sequence.append(current_sequence_input.squeeze(1))

    sampled_sequence = torch.cat(final_output_sequence, dim=0).cpu().numpy()

    return sampled_sequence

In [24]:
#Generate 200 new timesteps; the low temperature pushes the note probabilities towards 0 or 1
sample = sample_from_piano_rnn(sample_length=200, temperature=0.05)

In [2]:
np.sum(sample) # Just to check how many notes are played within the sampled timesteps.

In [26]:
#Embed the sampled 88-note roll into a 128-pitch roll (columns 21..108) and write it out as MIDI.
#The number of rows follows the sample itself instead of a hard-coded 201, so changing
#sample_length (or priming with a longer starting sequence) cannot break the assignment below.
roll = np.zeros((sample.shape[0], 128))
roll[:, 21:109] = sample
roll[roll == 1] = 100  #turn the 0/1 flags into the value 100 (presumably the note velocity - TODO confirm)
#NOTE(review): resolution=3 here differs from the resolution=8 used when reading the training data
#in path_to_pianoroll - confirm the playback speed is what is intended.
track = pypianoroll.Multitrack(resolution=3)
track.append(pypianoroll.StandardTrack(pianoroll=roll))
pypianoroll.write("baseline1_song2.mid", track)