In [1]:


%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn

import torch.utils.data as data
import random
import numpy as np
from tqdm import tqdm
import pypianoroll
import os
import sys
sys.path.append("../")


from midiToTxt import converter2

In [2]:
def midi_folder_to_txt(midi_folder, destination, song_separator="\n"):
    """Convert every file under ``midi_folder`` into a single text corpus file.

    The folder is walked recursively. Each file is passed to
    ``BetterMidiToTxtConverter.midi_to_str`` and the returned text is written to
    ``destination`` followed by ``song_separator``.

    Args:
        midi_folder: Root folder that is walked recursively.
        destination: Path of the text file to (over)write.
        song_separator: String written after every converted song.
    """
    converter = converter2.BetterMidiToTxtConverter()

    # open(..., 'w') does not create missing parent folders, so make sure the
    # output folder exists before writing.
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)

    with open(destination, 'w') as dest_file:
        for root, subdirs, files in os.walk(midi_folder):
            # os.walk yields entries in arbitrary order; sorting in place makes
            # both the descent order and the song order reproducible.
            subdirs.sort()
            for file_name in sorted(files):
                final_path = os.path.join(root, file_name)

                dest_file.write(converter.midi_to_str(final_path))
                dest_file.write(song_separator)
                
    

In [42]:
# --- Paths -------------------------------------------------------------------
BASE_FOLDER = "../data"
MIDI_FOLDER = os.path.join(BASE_FOLDER, "Nottingham")
TXT_FOLDER = os.path.join(BASE_FOLDER, "Nottingham_txt")

# --- Reproducibility ---------------------------------------------------------
# Seed every RNG the notebook relies on (DataLoader shuffling, weight
# initialisation, multinomial sampling) so a fresh run gives the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Training hyper-parameters -----------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 0.001
TRAIN_BATCH_SIZE = 30
VAL_BATCH_SIZE = 30
NUM_EPOCHS = 10
POSITIVE_WEIGHT = 1  # NOTE(review): not referenced by any cell visible in this notebook
CLIP_VALUE = 1.0  # max gradient norm used for gradient clipping

In [4]:
# Convert each sub-folder of MIDI_FOLDER into "<subfolder>.txt" inside TXT_FOLDER.
# The output folder is created first because opening a file for writing does not
# create missing directories.
os.makedirs(TXT_FOLDER, exist_ok=True)

for subfolder in sorted(os.listdir(MIDI_FOLDER)):
    subfolder_path = os.path.join(MIDI_FOLDER, subfolder)
    # os.listdir also returns plain files; converting those would only produce
    # empty .txt files that the vocabulary builder later reads.
    if not os.path.isdir(subfolder_path):
        continue
    midi_folder_to_txt(subfolder_path, os.path.join(TXT_FOLDER, subfolder + ".txt"))



In [5]:
class Vocabulary:
    """Bidirectional mapping between text tokens and integer ids.

    The vocabulary is built from every file found (recursively) under
    ``folder_with_txt``. Tokens are separated by single spaces; songs are
    separated by ``song_separator``.

    Attributes set by ``build_mappings``:
        unique_words: set of all distinct tokens in the corpus.
        vocab_length: number of distinct tokens.
        int_to_word: dict mapping id -> token.
        word_to_int: dict mapping token -> id.
    """

    def __init__(self, folder_with_txt, song_separator="\n"):
        self.folder_with_txt = folder_with_txt
        self.song_separator = song_separator
        self.build_mappings()

    def build_mappings(self):
        """Read the corpus and (re)build the token <-> id dictionaries."""
        file_texts = []
        for root, subdirs, files in os.walk(self.folder_with_txt):
            for file_name in files:
                final_path = os.path.join(root, file_name)
                with open(final_path, 'r') as txt_file:
                    # Song boundaries become ordinary token boundaries.
                    file_texts.append(txt_file.read().replace(self.song_separator, " "))
        # Join once instead of re-concatenating the growing corpus per file.
        corpus = "".join(file_texts)

        self.unique_words = set(corpus.strip().split(" "))
        self.vocab_length = len(self.unique_words)
        # Iteration order of a set of strings changes between interpreter runs
        # (hash randomisation). Sorting gives every token a stable id, so saved
        # model weights stay valid after a kernel restart.
        self.int_to_word = dict(enumerate(sorted(self.unique_words)))
        self.word_to_int = {word: index for index, word in self.int_to_word.items()}

    def tokenize_song(self, song):
        """Split a song string into its space-separated tokens."""
        return song.strip().split(" ")

    def numberalize_song(self, song):
        """Convert a song string into a list of token ids.

        Raises:
            KeyError: if the song contains a token that is not in the vocabulary.
        """
        numberalized = []

        for token in self.tokenize_song(song):
            try:
                numberalized.append(self.word_to_int[token])
            except KeyError:
                raise KeyError(f"token {token!r} is not in the vocabulary") from None

        return numberalized

    def numberlized_to_text(self, numberalized):
        """Convert an iterable of token ids back into a space-separated string."""
        return " ".join(self.int_to_word[token] for token in numberalized)
            
    

In [31]:
class NotesGenerationDataset(data.Dataset):
    """Next-token prediction dataset built from a text file of songs.

    Each item is an ``(input, target)`` pair of 1-D tensors of token ids, where
    ``target`` is ``input`` shifted one token to the left.

    Args:
        path: Text file holding songs separated by ``song_separator``.
        vocab: Object exposing ``numberalize_song(song) -> list of int``.
        song_separator: String that separates songs inside the file.
    """

    def __init__(self, path, vocab, song_separator = "\n"):
        self.path = path
        self.vocab = vocab
        self.numberalized_songs = []
        self.song_separator = song_separator

        with open(path, "r") as f:
            text = f.read().strip()

        for song in text.split(song_separator):
            if not song.strip():
                # Consecutive separators yield blank "songs"; nothing to encode.
                continue
            numberalized = vocab.numberalize_song(song)
            if len(numberalized) < 2:
                # A single token cannot form an (input, target) pair and would
                # produce zero-length tensors that break pack_sequence.
                continue
            self.numberalized_songs.append(numberalized)

    def __len__(self):
        return len(self.numberalized_songs)

    def __getitem__(self, index):
        numberalized_song = self.numberalized_songs[index]
        # Token ids are returned instead of one-hot vectors: the model looks
        # them up with nn.Embedding. Targets are int64 (torch.long).
        model_input = torch.tensor(numberalized_song[:-1], dtype=torch.int)
        target = torch.tensor(numberalized_song[1:], dtype=torch.long)
        return model_input, target

In [32]:
def collate(batch):
    """Collate function for the DataLoader.

    ``batch`` is a list of ``(input, target)`` tensor pairs of varying length.
    Both sides are packed with ``pack_sequence`` so no padding is required.

    Returns:
        A two-element list ``[packed_inputs, packed_targets]``.
    """
    def pack_field(position):
        # Gather the tensor at `position` from every sample and pack them.
        sequences = [sample[position] for sample in batch]
        return nn.utils.rnn.pack_sequence(sequences, enforce_sorted=False)

    packed_inputs = pack_field(0)
    packed_targets = pack_field(1)
    return [packed_inputs, packed_targets]

In [33]:
# One vocabulary shared by both splits, built from every text file in TXT_FOLDER.
vocab = Vocabulary(TXT_FOLDER)

# Training split: reshuffled every epoch, incomplete last batch dropped.
# A single training sample takes quite a lot of memory; mixed precision
# (torch.cuda.amp.autocast) is used during training to allow bigger batches.
trainset = NotesGenerationDataset(os.path.join(TXT_FOLDER, "train.txt"), vocab)
trainset_loader = data.DataLoader(
    trainset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate,
)

# Validation split: fixed order, every sample kept.
valset = NotesGenerationDataset(os.path.join(TXT_FOLDER, "valid.txt"), vocab)
valset_loader = data.DataLoader(
    valset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    collate_fn=collate,
)

In [34]:
# Quick look at the training split: number of songs and the shapes of the
# first (input, target) pair.
print(len(trainset))
first_input, first_target = trainset[0]
first_input.shape, first_target.shape

694


(torch.Size([689]), torch.Size([689]))

In [35]:
# Number of distinct tokens in the corpus; passed to the model as num_classes.
vocab.vocab_length

91

In [36]:
class RNN(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear logits.

    Args:
        hidden_size: Size of both the token embedding and the LSTM hidden state.
        num_classes: Vocabulary size (number of embeddings and of output logits).
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, hidden_size, num_classes, n_layers=2):

        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.n_layers = n_layers

        # nn.Embedding plays the role of a linear layer applied to one-hot
        # vectors, but is implemented as a lookup table over token ids.
        self.notes_encoder = nn.Embedding(num_embeddings=num_classes, embedding_dim=hidden_size)

        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)

        # Projects every LSTM output to one logit per vocabulary entry.
        self.logits_fc = nn.Linear(hidden_size, num_classes)

    def forward(self, inp, hidden=None):
        """Compute next-token logits.

        Args:
            inp: Either a ``PackedSequence`` of token ids or a tensor of token
                ids laid out ``[seq_len, batch]`` (the LSTM is not batch_first).
            hidden: Optional ``(h, c)`` LSTM state to continue from.

        Returns:
            ``(logits, hidden)``. For packed input ``logits`` has shape
            ``[total_tokens, num_classes]`` in packed order; for tensor input it
            has shape ``[seq_len, batch, num_classes]``.
        """
        if isinstance(inp, nn.utils.rnn.PackedSequence):
            # PackedSequence.data is the flat 1-D tensor of all token ids, so
            # the embedding can be applied to it directly (no padding needed).
            notes_encoded = self.notes_encoder(inp.data)

            # Rebuild the packed sequence around the embedded data. The sort
            # permutation must be carried over: without it the LSTM returns
            # (and expects) hidden states in length-sorted order instead of
            # the original batch order.
            rnn_in = nn.utils.rnn.PackedSequence(
                notes_encoded,
                inp.batch_sizes,
                inp.sorted_indices,
                inp.unsorted_indices,
            )
            outputs, hidden = self.lstm(rnn_in, hidden)

            # Back from PackedSequence to a flat tensor, still in packed order.
            logits = self.logits_fc(outputs.data)

        else:
            # Plain tensor input: embed, run the LSTM, project.
            notes_encoded = self.notes_encoder(inp)
            outputs, hidden = self.lstm(notes_encoded, hidden)
            logits = self.logits_fc(outputs)

        return logits, hidden

In [37]:
# Model sized to the vocabulary, moved to the selected device.
rnn = RNN(hidden_size=256, num_classes=vocab.vocab_length).to(DEVICE)

# Targets are token ids, so this is plain multi-class cross entropy.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LEARNING_RATE)

# Loss scaling is only meaningful for CUDA mixed precision. Disabling it
# explicitly on CPU makes the scaler a transparent pass-through instead of
# relying on it to warn and switch itself off.
scaler = torch.cuda.amp.GradScaler(enabled=DEVICE.type == "cuda")

In [38]:
#! sanity check of the network
# inp, targets = next(iter(trainset_loader))
# logits, _ =rnn.forward(inp.to(DEVICE))

In [39]:
def validate(rnn, criterion, loader, device):
    """Evaluate ``rnn`` on ``loader`` and return the mean per-batch loss.

    Args:
        rnn: Model returning ``(logits, hidden)`` for a packed input.
        criterion: Loss called as ``criterion(logits, target.data)``.
        loader: Iterable of ``(input, target)`` PackedSequence pairs.
        device: Device the batches are moved to.

    Returns:
        Average of the batch losses as a float, or ``nan`` if ``loader``
        yields no batches.
    """
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally switching the model to training mode.
    was_training = rnn.training
    rnn.eval()
    loop = tqdm(loader, leave=True)

    losses = []

    try:
        with torch.no_grad():
            for inp, target in loop:
                inp, target = inp.to(device), target.to(device)
                logits, _ = rnn(inp)

                # target is a PackedSequence; its .data lines up with the
                # packed-order logits returned by the model.
                loss = criterion(logits, target.data).item()

                losses.append(loss)

                loop.set_postfix(loss = loss)
    finally:
        # Restore the mode even if evaluation raised.
        rnn.train(was_training)

    if not losses:
        return float("nan")
    return sum(losses) / len(losses)

In [40]:
def train(rnn, optimizer, criterion, loader, device, clip_value, grad_scaler=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        rnn: Model returning ``(logits, hidden)`` for a packed input.
        optimizer: Optimizer over ``rnn.parameters()``.
        criterion: Loss called as ``criterion(logits, target.data)``.
        loader: Iterable of ``(input, target)`` PackedSequence pairs.
        device: Device the batches are moved to.
        clip_value: Maximum gradient norm passed to ``clip_grad_norm_``.
        grad_scaler: Optional ``GradScaler``. When omitted, the notebook-level
            ``scaler`` is used, which matches the previous behaviour.

    Returns:
        Average of the batch losses as a float, or ``nan`` if ``loader``
        yields no batches.
    """
    if grad_scaler is None:
        grad_scaler = scaler  # notebook-level GradScaler created with the model

    # CUDA autocast is only useful (and only warning-free) on a CUDA device.
    use_amp = torch.device(device).type == "cuda"

    # Do not rely on a previous call having left the model in training mode.
    rnn.train()
    loop = tqdm(loader, leave=True)

    losses = []

    for inp, target in loop:
        inp, target = inp.to(device), target.to(device)  # target is a PackedSequence
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            logits, _ = rnn(inp)

            loss = criterion(logits, target.data)

        grad_scaler.scale(loss).backward()
        # Unscale the gradients in place so clipping sees their true magnitude.
        grad_scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(rnn.parameters(), clip_value)

        grad_scaler.step(optimizer)
        grad_scaler.update()

        loss = loss.item()
        losses.append(loss)
        loop.set_postfix(loss=loss)

    if not losses:
        return float("nan")
    return sum(losses) / len(losses)

In [43]:
clip = CLIP_VALUE  # legacy alias; train() below is given CLIP_VALUE directly
best_val_loss = float("inf")

# Per-epoch history of the mean training and validation losses.
train_losses = []
val_losses = []

for epoch_number in range(NUM_EPOCHS):
    train_loss = train(rnn, optimizer, criterion, trainset_loader, DEVICE, CLIP_VALUE)
    train_losses.append(train_loss)

    val_loss = validate(rnn, criterion, valset_loader, DEVICE)
    val_losses.append(val_loss)

    print(f"Epoch {epoch_number}:\ntrain_loss: {train_loss}\n val_loss: {val_loss}")

    # Track the best validation loss seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Checkpointing stays disabled, as in the original notebook;
        # uncomment the next line to keep the best weights on disk.
        # torch.save(rnn.state_dict(), 'music_rnn.pth')

100%|██████████| 23/23 [00:06<00:00,  3.30it/s, loss=1.05]
100%|██████████| 6/6 [00:00<00:00,  6.50it/s, loss=1.11] 


Epoch 0:
train_loss: 1.1172740874083147
 val_loss: 1.0471152861913045


100%|██████████| 23/23 [00:07<00:00,  3.26it/s, loss=1.03] 
100%|██████████| 6/6 [00:00<00:00,  6.51it/s, loss=1.02] 


Epoch 1:
train_loss: 1.0309797525405884
 val_loss: 0.9718188246091207


100%|██████████| 23/23 [00:06<00:00,  3.35it/s, loss=0.934]
100%|██████████| 6/6 [00:00<00:00,  6.51it/s, loss=0.952]


Epoch 2:
train_loss: 0.9602655872054722
 val_loss: 0.9144785006841024


100%|██████████| 23/23 [00:06<00:00,  3.32it/s, loss=0.874]
100%|██████████| 6/6 [00:00<00:00,  6.53it/s, loss=0.915]


Epoch 3:
train_loss: 0.9089342226152834
 val_loss: 0.8779982030391693


100%|██████████| 23/23 [00:06<00:00,  3.31it/s, loss=0.831]
100%|██████████| 6/6 [00:00<00:00,  6.48it/s, loss=0.881]


Epoch 4:
train_loss: 0.8727929825368135
 val_loss: 0.8451782564322153


100%|██████████| 23/23 [00:06<00:00,  3.33it/s, loss=0.856]
100%|██████████| 6/6 [00:00<00:00,  6.50it/s, loss=0.835]


Epoch 5:
train_loss: 0.8462154943010082
 val_loss: 0.8104888498783112


100%|██████████| 23/23 [00:06<00:00,  3.36it/s, loss=0.804]
100%|██████████| 6/6 [00:00<00:00,  6.51it/s, loss=0.808]


Epoch 6:
train_loss: 0.8084866663684016
 val_loss: 0.7785990635553995


100%|██████████| 23/23 [00:06<00:00,  3.32it/s, loss=0.758]
100%|██████████| 6/6 [00:00<00:00,  6.40it/s, loss=0.787]


Epoch 7:
train_loss: 0.7796819546948308
 val_loss: 0.7589213252067566


100%|██████████| 23/23 [00:06<00:00,  3.36it/s, loss=0.713]
100%|██████████| 6/6 [00:00<00:00,  6.46it/s, loss=0.757]


Epoch 8:
train_loss: 0.7556408203166464
 val_loss: 0.7353298862775167


100%|██████████| 23/23 [00:06<00:00,  3.31it/s, loss=0.736]
100%|██████████| 6/6 [00:00<00:00,  6.47it/s, loss=0.734]

Epoch 9:
train_loss: 0.7354539036750793
 val_loss: 0.7118591070175171





In [87]:
def sample_from_piano_rnn(rnn, vocab : "Vocabulary", sample_length=4, temperature=1, starting_sequence=None):
    """Autoregressively sample token ids from a trained model.

    Args:
        rnn: Model called as ``rnn(tokens, hidden) -> (logits, hidden)`` with
            ``tokens`` laid out ``[seq_len, batch=1]``.
        vocab: Vocabulary providing ``word_to_int`` and ``numberalize_song``.
        sample_length: Number of new tokens to draw.
        temperature: Logits are divided by this value before the softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.
        starting_sequence: Optional primer. Either a space-separated token
            string or an iterable of token ids. Defaults to the single token
            ``"n72"``.

    Returns:
        List of token ids: the primer followed by ``sample_length`` samples.

    Raises:
        ValueError: if ``temperature`` is not positive or the primer is empty.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    if starting_sequence is None:
        primer_ids = [vocab.word_to_int["n72"]]
    elif isinstance(starting_sequence, str):
        primer_ids = vocab.numberalize_song(starting_sequence)
    else:
        primer_ids = [int(token_id) for token_id in starting_sequence]

    if not primer_ids:
        raise ValueError("starting_sequence must contain at least one token")

    # Run on whatever device the model lives on rather than a notebook global.
    first_param = next(rnn.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Shape [seq_len, 1]: the whole primer is fed in the first step.
    current_sequence_input = torch.tensor(primer_ids, dtype=torch.long, device=device).unsqueeze(1)

    final_output_sequence = list(primer_ids)

    was_training = rnn.training
    rnn.eval()
    hidden = None
    try:
        with torch.no_grad():
            for _ in range(sample_length):
                logits, hidden = rnn(current_sequence_input, hidden)
                # Only the prediction after the last fed token matters: [1, num_classes].
                last_step_logits = logits[-1]
                probabilities = torch.softmax(last_step_logits.div(temperature), dim=1)
                # multinomial returns shape [1, 1], which is exactly the
                # [seq_len=1, batch=1] input needed for the next step.
                selected = torch.multinomial(probabilities, 1)
                current_sequence_input = selected

                final_output_sequence.append(selected.item())
    finally:
        rnn.train(was_training)

    return final_output_sequence

In [88]:
# Draw 201 new tokens after the default "n72" seed token. Logits are divided by
# the temperature, so 0.5 sharpens the distribution towards the likeliest tokens.
sample = sample_from_piano_rnn(rnn,vocab,sample_length=201, temperature=0.5)

In [89]:
# Map the sampled token ids back to a space-separated token string.
song = vocab.numberlized_to_text(sample)

In [90]:
# Turn the generated token string back into MIDI with the project converter.
converter = converter2.BetterMidiToTxtConverter()
# NOTE(review): (200, 128) is presumably the (time steps, pitches) size of the
# piano roll the converter should allocate — confirm against converter2.
converter.set_biggest_roll((200,128))
# Presumably writes the result to "sample3.mid" in the working directory — confirm.
converter.str_to_midi(song, "sample3.mid")