# MusicVAE

This is my implementation of MusicVAE, the model described in the paper
[A Hierarchical Latent Vector Model for Learning Long-Term Structure in Music](https://arxiv.org/abs/1803.05428). <br>
It was written with the help of the code found here: https://github.com/Variational-Autoencoder/MusicVAE

In [26]:
import torch
import numpy as np
import torch.nn as nn
from torch.nn.functional import softplus
#DATA IMPORTING LIBRARIES
# Add the src folder to the path
import sys
sys.path.insert(0, '../src/')

from data.dataloader import MidiDataset
from data.bar_transform import BarTransform
from torch.utils.data import Dataset, DataLoader

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader, random_split


# Half of these imports are not needed yet, but they may become useful for visualizing the latent space.
import matplotlib.pyplot as plt
from IPython.display import Image, Audio, display, clear_output
import numpy as np
from sklearn.decomposition import PCA
%matplotlib nbagg
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
sns.set_palette(sns.dark_palette("purple"))

from midi_builder import MidiBuilder
builder = MidiBuilder()

In [27]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Data-layout constants shared by the cells below.
NOTESPERBAR = 16  # total notes in one bar

totalbars = 16  # total bars as input
NUM_PITCHES = 60 + 1  # all possible notes to play, +1 for silences

TOTAL_NOTES = NOTESPERBAR * totalbars  # notes in one input sample (bars * notes per bar)

num_features = NUM_PITCHES  # size of input feature vector

batch_size = 64  # actual batch size

TEACHER_FORCING = True  # not used yet, but it will be needed

# This assignment was left without a value (a SyntaxError). It now points at
# the piano-roll CSV that the data-loading cell reads.
csv_filename = './piano_rolls.csv'

## Loading data

In [24]:
# Load the piano-roll dataset, split it into train/test subsets and wrap both
# in DataLoaders. BarTransform is configured with `totalbars` bars and
# `NUM_PITCHES` note columns.
transform = BarTransform(bars=totalbars, note_count=NUM_PITCHES)
midi_dataset = MidiDataset(csv_file='./piano_rolls.csv', transform=transform)
midi_dataset.get_mem_usage()

random_seed = 42
test_split = 0.2  # fraction of the dataset held out for testing
shuffle = True

if random_seed is not None:
    np.random.seed(random_seed)
    # random_split and DataLoader shuffling draw from torch's RNG, not
    # NumPy's, so torch has to be seeded too for the split to be reproducible.
    torch.manual_seed(random_seed)

dataset_size = len(midi_dataset)            # number of pieces in the dataset
test_size = int(test_split * dataset_size)  # test subset length
train_size = dataset_size - test_size       # train subset length

train_dataset, test_dataset = random_split(midi_dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, shuffle=shuffle, batch_size=batch_size, num_workers=4)
test_loader = DataLoader(test_dataset, shuffle=shuffle, batch_size=batch_size, num_workers=4)

print("Train size: {}, Test size: {}".format(train_size, test_size))

Train size: 6230, Test size: 1557


In [23]:
# Shape of the first training sample's piano roll. The recorded output below is
# (256, 61), which matches (TOTAL_NOTES, NUM_PITCHES).
train_dataset[0]['piano_rolls'].shape

(256, 61)

## MusicVAE Model

In [25]:
# Model hyper-parameters, grouped by the component they configure.

# --- Encoder ---
input_size = NUM_PITCHES        # features per time step fed to the encoder LSTM
encoder_hidden_size = 256       # hidden size of the encoder
n_layers_encoder = 1

# --- Conductor ---
conductor_hidden_size = 256     # hidden size of the conductor
n_layers_conductor = 2

# --- Decoder ---
decoder_hidden_size = 64        # hidden size of the decoder
# Input size of the decoder. Presumably the quantity the model cells call
# `decoders_initial_size` -- TODO confirm.
decoder_initial_state = 32
n_layers_decoder = 1
sequence_length = 16            # notes per decoder

# --- Latent space / regularisation ---
latent_features = 64            # latent space dimension
dropout_rate = 0.2

In [None]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder producing a reparameterised latent code.

    The layer sizes are read from the notebook-level hyper-parameters
    (`input_size`, `encoder_hidden_size`, `n_layers_encoder`,
    `decoder_initial_state`, `dropout_rate`) and tensors are created on the
    notebook-level `device`.

    Args:
        latent_features: dimension of the latent space. The parameter used to
            be misspelled `latent_featurest`, which made the constructor
            silently ignore its argument and fall back to the global.
    """

    def __init__(self, latent_features):
        super(Encoder, self).__init__()
        self.latent_features = latent_features

        self.encoder = nn.LSTM(
            batch_first=True,
            input_size=input_size,
            hidden_size=encoder_hidden_size,
            num_layers=n_layers_encoder,
            bidirectional=True)
        # Forward and backward LSTM outputs are concatenated, hence the factor
        # 2 on the input; the output holds mu and log_var side by side.
        self.encoderOut = nn.Linear(2 * encoder_hidden_size, 2 * latent_features)
        # Projects the sampled latent code to the size given by
        # `decoder_initial_state`.
        self.linear_z = nn.Linear(latent_features, decoder_initial_state)
        # forward() calls self.worddropout, so it has to exist here; same
        # layer type and rate as in the draft VariationalAutoencoder.
        self.worddropout = nn.Dropout2d(p=dropout_rate)

    def init_hidden(self, batch_size):
        """Return zero-initialised `(h0, c0)` states for the encoder LSTM.

        Each state has shape
        `(2 * n_layers_encoder, batch_size, encoder_hidden_size)`; the factor
        2 accounts for the two directions of the bidirectional LSTM.
        """
        state_shape = (2 * n_layers_encoder, batch_size, encoder_hidden_size)
        h0 = torch.zeros(state_shape, device=device)
        c0 = torch.zeros(state_shape, device=device)
        return h0, c0

    def epsilon(self, epsilon_size):
        """Sample standard-normal noise of shape `(epsilon_size, 1, latent_features)`."""
        # Don't propagate gradients through randomness
        with torch.no_grad():
            epsilon = torch.randn(epsilon_size, 1, self.latent_features, device=device)
        return epsilon

    def forward(self, x):
        """Encode a batch of sequences.

        Args:
            x: tensor of shape `(batch, seq_len, input_size)` (the LSTM is
                built with `batch_first=True`).

        Returns:
            Tuple `(z, mu, log_var)`. `mu` and `log_var` have shape
            `(batch, seq_len, latent_features)`; `z` has shape
            `(batch, seq_len, decoder_initial_state)`.
        """
        batch_size = x.size(0)
        x = self.worddropout(x)
        h0, c0 = self.init_hidden(batch_size)
        x, _ = self.encoder(x, (h0, c0))
        x = self.encoderOut(x)
        mu, log_var = torch.chunk(x, 2, dim=-1)
        log_var = softplus(log_var)  # Make sure that the log variance is positive

        # NOTE(review): if log_var is meant to be log(sigma^2), the standard
        # deviation would be exp(0.5 * log_var). The original scaling is kept
        # because the loss that consumes log_var is not visible here -- confirm.
        sigma = torch.exp(2 * log_var)
        # One noise vector per sample, broadcast over the time dimension.
        epsilon = self.epsilon(mu.size(0))
        z = mu + epsilon * sigma
        z = self.linear_z(z)
        return z, mu, log_var
        

In [None]:
class Decoder(nn.Module):
    """Conductor and note-decoder LSTMs of the hierarchical decoder.

    Only the layers are declared so far; no `forward` is defined in this
    class. Sizes come from the notebook-level `decoder_initial_state` and
    `NUM_PITCHES` (the original referenced an undefined
    `decoders_initial_size`, which raised NameError on construction).
    """

    def __init__(self):
        super(Decoder, self).__init__()
        # Conductor: input and hidden size are both decoder_initial_state.
        self.conductor = nn.LSTM(decoder_initial_state,
                                 decoder_initial_state,
                                 num_layers=1,
                                 batch_first=True)

        # Note decoder: its input is NUM_PITCHES + decoder_initial_state wide.
        self.decoder = nn.LSTM(NUM_PITCHES + decoder_initial_state,
                               decoder_initial_state,
                               num_layers=1,
                               batch_first=True)
  

## DRAFTS

In [None]:
class VariationalAutoencoder(nn.Module):
    """Draft of the full MusicVAE model: encoder, conductor and decoder layers.

    Only the layers are declared so far; no `forward` is defined in this
    class. Layer sizes come from the notebook-level hyper-parameters
    (`input_size`, `encoder_hidden_size`, `n_layers_encoder`,
    `decoder_initial_state`, `dropout_rate`, `NUM_PITCHES`). The original
    referenced an undefined `decoders_initial_size`, which raised NameError
    on construction.

    Args:
        latent_features: dimension of the latent space.
        teacher_forcing: stored as `self.teacher_forcing`; not used by the
            code in this class yet.
        eps_i: stored as `self.eps_i`; not used by the code in this class yet.
    """

    def __init__(self, latent_features, teacher_forcing, eps_i):
        super(VariationalAutoencoder, self).__init__()
        self.teacher_forcing = teacher_forcing
        self.eps_i = eps_i

        self.latent_features = latent_features

        # --- Encoder ---
        self.encoder = nn.LSTM(
            batch_first=True,
            input_size=input_size,
            hidden_size=encoder_hidden_size,
            num_layers=n_layers_encoder,
            bidirectional=True)

        # Bidirectional outputs are concatenated (factor 2 on the input); the
        # output holds mu and log_var side by side (factor 2 on the output).
        self.encoderOut = nn.Linear(2 * encoder_hidden_size, 2 * latent_features)
        self.linear_z = nn.Linear(latent_features, decoder_initial_state)

        self.dropout = nn.Dropout(p=dropout_rate)
        self.worddropout = nn.Dropout2d(p=dropout_rate)

        # --- Conductor ---
        self.conductor = nn.LSTM(decoder_initial_state,
                                 decoder_initial_state,
                                 num_layers=1,
                                 batch_first=True)

        # --- Note decoder ---
        self.decoder = nn.LSTM(NUM_PITCHES + decoder_initial_state,
                               decoder_initial_state,
                               num_layers=1,
                               batch_first=True)

        # Maps the decoder LSTM's hidden size to NUM_PITCHES outputs.
        self.linear = nn.Linear(decoder_initial_state, NUM_PITCHES)

   
       
        
            