In [28]:
import numpy as np
import torch
from torch.utils.data import DataLoader

import os

# Dataset 

In [3]:
dataset_dir = "../Dataset/"
files = os.listdir(dataset_dir)

dataset_files = [ os.path.join(dataset_dir, file) for file in files]

In [4]:
songs = {}
for dataset_filename in dataset_files:
    abc_notation_file = open(dataset_filename, 'r')
    songs[os.path.basename(dataset_filename)] = abc_notation_file.read()
    abc_notation_file.close()

In [7]:
#File to train
train_list = list(songs.keys())
musical_train_file = train_list[0]

## Vectorize the text

Before we begin training our RNN model, we'll need to create a numerical representation of our text-based dataset. To do this, we'll generate two lookup tables: one that maps characters to numbers, and a second that maps numbers back to characters.

In [8]:
# Find all unique characters in the joined string
vocab = sorted(set(songs[musical_train_file]))
print("There are", len(vocab), "unique characters in the dataset")

There are 83 unique characters in the dataset


In [9]:
# Create a mapping from character to unique index.
# For example, to get the index of the character "d", 
#   we can evaluate `char2idx["d"]`.  
char2idx = {u:i for i, u in enumerate(vocab)}

# Create a mapping from indices to characters. This is
#   the inverse of char2idx and allows us to convert back
#   from unique index to the character in our vocabulary.
idx2char = np.array(vocab)

In [10]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '#' :   4,
  "'" :   5,
  '(' :   6,
  ')' :   7,
  ',' :   8,
  '-' :   9,
  '.' :  10,
  '/' :  11,
  '0' :  12,
  '1' :  13,
  '2' :  14,
  '3' :  15,
  '4' :  16,
  '5' :  17,
  '6' :  18,
  '7' :  19,
  ...
}


In [11]:
def vectorize_string(string):
  vectorized_output = np.array([char2idx[char] for char in string])
  return vectorized_output

In [12]:
print(vectorize_string(songs[musical_train_file]))

[49 22 14 ... 22 82  2]


In [13]:
print(len(songs[musical_train_file]))
print(len(vectorize_string(songs[musical_train_file])))

200425
200425


## PyTorch Dataset

In [37]:
class MusicalDataset(torch.nn.Module):
    def __init__(self, abc_string, seq_lenght):
        self.dataset = abc_string
        self.seq_lenght = seq_lenght

        vocab = self.vocabulary(abc_string)
        
        self.char2idx, self.idx2char = self.mapping(vocab)
        
        self.vectorized_dataset = self.vectorize_string(self.dataset)
    
    def __len__(self):
        '''
            Why -1...?
            Suppose seq_length is 4 and our text is "Hello". Then, our
            input sequence (x) is "Hell" and the target sequence (y) is "ello".
        '''
        return len(self.dataset) - self.seq_lenght - 1
        
    def __getitem__(self, idx):
        x = self.vectorized_dataset[idx : idx + self.seq_lenght]
        y = self.vectorized_dataset[idx + 1 : idx + self.seq_lenght + 1]

        return [torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)]
    
    def vectorize_string(self, string):
        '''
            Vectorize (convert to numerical) a string using the
            mapping created by the notation presented in the dataset.
            
            @return a numpy array with `N` elements, where `N` is
                the number of characters in the input string
        '''
        return np.array([char2idx[char] for char in string])
    
    def vocabulary(self, string):
        '''
            Return the vocabulary used in the input string, i.e
            a set of no duplicated elements.
            
            @param string: the dataset with several songs written using
                a specific anotation
        '''
        return sorted(set(string))
    
    def mapping(self, vocab):
        '''
         Create a mapping from character to unique index and from 
         indices to characters. 
         
         @param vocab: A set with no duplicate elements which represents
             the vocabulary of our anotation (all unique characters).
         @return Mapping contained in a list [char2idx, idx2char]        
        '''
        char2idx = {u:i for i, u in enumerate(vocab)}
        idx2char = np.array(vocab)
        
        return [char2idx, idx2char]

        

In [44]:
test = MusicalDataset(songs[musical_train_file], 8)
print(len(test))

dataloader = DataLoader(test, batch_size=2, shuffle=False, num_workers=0)
print(len(dataloader))

idx = 0
for inputs, targets in dataloader:
    idx += 1

print(targets)

200416
100208
tensor([[27, 15,  1, 26, 82, 32, 15, 22],
        [15,  1, 26, 82, 32, 15, 22, 82]])


In [46]:
x, y = next(iter(dataloader))
print(x.shape)

torch.Size([2, 8])


# The Recurrent Neural Network (RNN) model

The model is based off the LSTM architecture, where we use a state vector to maintain information about the temporal relationships between consecutive characters.

<img src="https://raw.githubusercontent.com/aamini/introtodeeplearning/2019/lab1/img/lstm_unrolled-01-01.png" alt="Drawing"/>

Interesante añadir una descripción del funcionamiento de los *emmbeding layers* para que quede claro su funcionamiento. 

Arreglar la imagen para que quede más explicito las dimensiones entre capas.

Añadir los detalles de los pasos del LSTM: olvidar, añadir...

In [None]:
def MusicalLSTMModel(torch.nn.Module):
    '''Container module with an encoder, a recurrent module, and a decoder.'''

    def __init__(self, vocab_size, embedding_dim, hidden_state_dim, rnn_units, dropout=0.05):
        super(LSTMModel, self).__init__()

        self.encoder = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_state_dim, rnn_units, dropout=dropout)
        self.decoder = nn.Linear(hidden_state_dim, vocab_size)

    def forward(self, x, hidden):
        emb = self.encoder(x)
        output, hidden = self.rnn(emb, hidden)
        output = self.decoder(output)
        return output, hidden