# MusicVAE

This is my implementation of the MusicVAE model described in the paper 
https://arxiv.org/abs/1803.05428 <br>
It was written with the help of the code found here: https://github.com/Variational-Autoencoder/MusicVAE

In [1]:
import torch
import numpy as np
import torch.nn as nn
from torch.nn.functional import softplus
#DATA IMPORTING LIBRARIES
# Add the src folder to the path
import sys
sys.path.insert(0, '../src/')

from data.dataloader import MidiDataset
from data.bar_transform import BarTransform
from torch.utils.data import Dataset, DataLoader

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader, random_split


#half of this is not yet needed but maybe it will be to visualize the latent space
import matplotlib.pyplot as plt
from IPython.display import Image, Audio, display, clear_output
import numpy as np
from sklearn.decomposition import PCA
%matplotlib nbagg
%matplotlib inline
import seaborn as sns
# Plot styling: "whitegrid" axes style with a palette derived from dark purple.
sns.set_style("whitegrid")
sns.set_palette(sns.dark_palette("purple"))

# Module-level MidiBuilder instance (not referenced by the other cells shown here).
from midi_builder import MidiBuilder
builder = MidiBuilder()

In [2]:
# --- Sequence geometry ---
NOTESPERBAR = 16  # note steps that make up a single bar
totalbars = 16  # bars contained in one input sequence
TOTAL_NOTES = NOTESPERBAR * totalbars  # note steps in one full input sequence

# --- Feature layout ---
NUM_PITCHES = 60 + 1  # every playable pitch, plus one extra slot for silence
num_features = NUM_PITCHES  # width of one input feature vector

# --- Training settings ---
batch_size = 64  # samples per mini-batch
TEACHER_FORCING = True  # handed to the Decoder constructor further down


In [3]:
# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")

In [9]:
# Hyper-parameters and data settings shared by the data pipeline and the model.

params = dict(
    input_size = NUM_PITCHES, # size of one input feature vector
    encoder_hidden_size = 256, # hidden size of the encoder 
    conductor_hidden_size = 256, # hidden size of the conductor
    decoder_hidden_size = 64, # hidden size of the decoder
    decoder_initial_state = 32, # input size of the decoder

    n_layers_encoder = 1,
    n_layers_conductor = 2,
    n_layers_decoder = 1,

    latent_features = 64, # latent space dimension
    sequence_length = 16, # notes per decoder

    dropout_rate = 0.2,
    device = device,
    
    csv_filename = './piano_rolls.csv',
    
    NOTESPERBAR = NOTESPERBAR,
    totalbars = totalbars, #total bars as input 
    NUM_PITCHES = NUM_PITCHES, # all possible notes to play +1 for silences   
    TOTAL_NOTES = TOTAL_NOTES,
    num_features = num_features, #size of input feature vector
    batch_size = batch_size, #actual batchsize
    TEACHER_FORCING = TEACHER_FORCING # reuse the module-level switch instead of repeating the literal
)

# The model cells refer to this dictionary as `vae_params`, while the
# data-loading cell reads it as `params`. Bind both names to the same object
# so that every cell resolves to this configuration.
vae_params = params

LOADING DATA

In [10]:
# Load the piano-roll dataset and split it into train / test DataLoaders.
transform = BarTransform(bars=params['totalbars'], note_count=params['NUM_PITCHES'])
midi_dataset = MidiDataset(csv_file=params['csv_filename'], transform=transform)
midi_dataset.get_mem_usage()

random_seed = 42
test_split = 0.2
shuffle = True

if random_seed is not None:
    np.random.seed(random_seed)
    # random_split and DataLoader shuffling draw from torch's RNG, not
    # NumPy's, so torch must be seeded too for the split to be reproducible.
    torch.manual_seed(random_seed)

dataset_size = len(midi_dataset)            # number of pieces in the dataset
test_size = int(test_split * dataset_size)  # number of held-out samples
train_size = dataset_size - test_size       # number of training samples

train_dataset, test_dataset = random_split(midi_dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset,
                          shuffle=shuffle,
                          batch_size=params['batch_size'],
                          num_workers=4)
test_loader = DataLoader(test_dataset,
                         shuffle=shuffle,
                         batch_size=params['batch_size'],
                         num_workers=4)

print("Train size: {}, Test size: {}".format(train_size, test_size))

  mask |= (ar1 == a)


Train size: 6230, Test size: 1557


In [11]:
# Inspect the shape of the first training sample's piano roll.
train_dataset[0]['piano_rolls'].shape

(256, 61)

## MusicVAE Model

In [12]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder that samples one latent vector per time step.

    The LSTM output at every time step is projected to ``2 * latent_features``
    values, split into ``mu`` and ``log_var``, and used to draw ``z`` with the
    reparameterisation trick. ``z`` is finally projected to
    ``decoder_hidden_size`` so the decoder can consume it directly.

    Args:
        vae_params: mapping with the keys ``latent_features``,
            ``decoder_hidden_size``, ``encoder_hidden_size``, ``NUM_PITCHES``,
            ``device``, ``dropout_rate``, ``input_size`` and
            ``n_layers_encoder``.
    """

    def __init__(self, vae_params):
        super(Encoder, self).__init__()
        self.latent_features = vae_params['latent_features']
        self.decoder_initial_size = vae_params['decoder_hidden_size']
        self.encoder_hidden_size = vae_params['encoder_hidden_size']
        self.n_layers = vae_params['n_layers_encoder']
        self.NUM_PITCHES = vae_params['NUM_PITCHES']
        self.device = vae_params['device']
        self.dropout_rate = vae_params['dropout_rate']
        self.worddropout = nn.Dropout2d(p = self.dropout_rate)

        self.encoder = nn.LSTM(
            input_size = vae_params['input_size'],
            hidden_size = vae_params['encoder_hidden_size'],
            num_layers = self.n_layers,
            batch_first = True,
            bidirectional = True)
        # Bidirectional output is 2 * hidden wide; it is split into mu / log_var.
        self.encoderOut = nn.Linear(2 * self.encoder_hidden_size, 2 * self.latent_features)
        self.linear_z = nn.Linear(self.latent_features, self.decoder_initial_size)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for the bidirectional LSTM.

        The leading dimension is ``2 * n_layers_encoder`` (two directions per
        layer), so stacked encoders receive correctly shaped states as well.
        """
        state_shape = (2 * self.n_layers, batch_size, self.encoder_hidden_size)
        h0 = torch.zeros(state_shape, device = self.device)
        c0 = torch.zeros(state_shape, device = self.device)
        return h0, c0

    def epsilon(self, epsilon_size):
        """Draw standard-normal noise of shape ``(epsilon_size, 1, latent_features)``.

        The singleton middle dimension makes the same noise vector broadcast
        over every time step of a sample.
        """
        # Don't propagate gradients through randomness
        with torch.no_grad():
            epsilon = torch.randn(epsilon_size, 1, self.latent_features, device = self.device)
        return epsilon

    def forward(self, x):
        """Encode ``x`` and sample the latent sequence.

        Args:
            x: input tensor whose first dimension is the batch (the LSTM is
                built with ``batch_first=True``) and whose last dimension is
                ``input_size``.

        Returns:
            Tuple ``(z, mu, log_var)``: ``z`` is the sampled latent projected
            to ``decoder_hidden_size``; ``mu`` and ``log_var`` are the
            per-time-step posterior parameters (``log_var`` after softplus).
        """
        batch_size = x.size(0)
        x = self.worddropout(x)
        h0, c0 = self.init_hidden(batch_size)
        x, _ = self.encoder(x, (h0, c0))
        x = self.encoderOut(x)
        mu, log_var = torch.chunk(x, 2, dim = -1)
        log_var = softplus(log_var) # softplus keeps this value positive

        # Same scale transform that ELBO_loss applies to log_var.
        sigma = torch.exp(2 * log_var)
        epsilon = self.epsilon(mu.size(0))
        z = mu + epsilon * sigma
        z = self.linear_z(z)
        return z, mu, log_var
        

In [34]:
class Decoder(nn.Module):
    """Hierarchical decoder: a conductor LSTM plus a note-level LSTM.

    For every bar the conductor consumes one latent vector and emits an
    embedding; the note-level LSTM then produces the notes of that bar from
    the embedding concatenated with the previous note.

    Args:
        vae_parameters: mapping with the keys ``latent_features``,
            ``decoder_hidden_size``, ``encoder_hidden_size``, ``NUM_PITCHES``,
            ``TOTAL_NOTES`` and ``device``. Optional keys: ``NOTESPERBAR``
            (default 16), ``totalbars`` (default
            ``TOTAL_NOTES // NOTESPERBAR``) and ``TEACHER_FORCING``
            (default True).
        eps_i: threshold compared against a uniform draw in [0, 1) for each
            bar; the bar is teacher-forced when the draw is ``<= eps_i``.
    """

    def __init__(self, vae_parameters, eps_i = 1):
        super(Decoder, self).__init__()
        # Read everything from the constructor argument rather than from
        # notebook globals, so the module works with any configuration dict.
        self.latent_features = vae_parameters['latent_features']
        self.decoder_initial_size = vae_parameters['decoder_hidden_size']
        self.encoder_hidden_size = vae_parameters['encoder_hidden_size']
        self.NUM_PITCHES = vae_parameters['NUM_PITCHES']
        self.TOTAL_NOTES = vae_parameters['TOTAL_NOTES']
        self.device = vae_parameters['device']
        self.TEACHER_FORCING = vae_parameters.get('TEACHER_FORCING', True)
        self.notes_per_bar = vae_parameters.get('NOTESPERBAR', 16)
        self.total_bars = vae_parameters.get(
            'totalbars', self.TOTAL_NOTES // self.notes_per_bar)
        self.eps_i = eps_i

        self.conductor = nn.LSTM(
            input_size = self.decoder_initial_size,
            hidden_size = self.decoder_initial_size,
            num_layers = 1,
            batch_first = True
        )

        # Input is the previous note concatenated with the conductor embedding.
        self.decoder = nn.LSTM(
            input_size = self.NUM_PITCHES + self.decoder_initial_size,
            hidden_size = self.decoder_initial_size,
            num_layers = 1,
            batch_first = True)

        self.linear = nn.Linear(self.decoder_initial_size, self.NUM_PITCHES)

    def use_teacher_forcing(self):
        """Return True when a uniform draw in [0, 1) is ``<= eps_i``."""
        return np.random.rand(1)[0] <= self.eps_i

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for the conductor LSTM."""
        state_shape = (1, batch_size, self.decoder_initial_size)
        init_conductor = torch.zeros(state_shape, device = self.device)
        c_conductor = torch.zeros(state_shape, device = self.device)
        return init_conductor, c_conductor

    def forward(self, z, x):
        """Decode the latent sequence ``z`` into per-step pitch distributions.

        Args:
            z: latent tensor indexed as ``z[:, bar, :]``; its last dimension
                must equal ``decoder_hidden_size`` (the conductor input size).
            x: ground-truth notes, batch first, used as teacher-forcing input
                after being shifted one step to the right.

        Returns:
            dict with ``'x_hat'`` (tensor of shape
            ``(batch, TOTAL_NOTES, NUM_PITCHES)`` holding softmax outputs)
            and ``'z'`` (the latent tensor that was passed in).
        """
        batch_size = x.size(0)
        steps = self.notes_per_bar

        conductor_hidden = self.init_hidden(batch_size)
        notes = torch.zeros(batch_size, self.TOTAL_NOTES, self.NUM_PITCHES, device = self.device)
        # All-zero "previous note" used before the very first step.
        note = torch.zeros(batch_size, 1, self.NUM_PITCHES, device = self.device)
        # Shift the targets right by one step so step t sees note t-1.
        the_input = torch.cat([note, x], dim = 1)

        for bar in range(self.total_bars):
            start = bar * steps
            stop = start + steps
            embedding, conductor_hidden = self.conductor(
                z[:, bar, :].view(batch_size, 1, -1), conductor_hidden)
            # The note-level LSTM starts every bar from a fresh random state.
            decoder_hidden = (
                torch.randn(1, batch_size, self.decoder_initial_size, device = self.device),
                torch.randn(1, batch_size, self.decoder_initial_size, device = self.device))

            if self.use_teacher_forcing():
                # Generate the whole bar in one call from the ground truth.
                tiled = embedding.expand(batch_size, steps, embedding.shape[2])
                e = torch.cat([tiled, the_input[:, start:stop, :]], dim = -1)
                hidden_seq, decoder_hidden = self.decoder(e, decoder_hidden)
                probs = torch.softmax(self.linear(hidden_seq), dim = 2)
                notes[:, start:stop, :] = probs
                # Keep the last prediction so that a following free-running
                # bar continues from it.
                note = probs[:, -1:, :]
            else:
                for step in range(steps):
                    # Concat embedding with previous note
                    e = torch.cat([embedding, note], dim = -1)
                    # Generate a single note (for each batch)
                    hidden_step, decoder_hidden = self.decoder(e, decoder_hidden)
                    probs = torch.softmax(self.linear(hidden_step), dim = 2)
                    # Index by bar and step so that the output lands on the
                    # right time step even when earlier bars were
                    # teacher-forced.
                    notes[:, start + step, :] = probs[:, 0, :]
                    note = probs

        outputs = {}
        outputs['x_hat'] = notes
        outputs['z'] = z
        return outputs
       

In [35]:
# Build the encoder from the shared hyper-parameter dict and move it to the selected device.
encoder = Encoder(vae_params).to(device)

In [36]:
# NOTE(review): the second positional parameter of Decoder.__init__ is eps_i,
# so the TEACHER_FORCING flag is what gets stored as eps_i here.
decoder = Decoder(vae_params, TEACHER_FORCING).to(device)

In [57]:
#directly taken from notebook, probably some adaptation might be needed

from torch.nn.functional import binary_cross_entropy
from torch import optim
from torch.distributions.normal import Normal
from torch.distributions.kl import kl_divergence


def ELBO_loss(y, t, mu, log_var, weight):
    """Return the negative ELBO together with the raw and weighted KL terms.

    Args:
        y: reconstruction, same shape as ``t``. It is used as the input of
            ``binary_cross_entropy``, so its values must lie in [0, 1].
        t: target tensor, same shape as ``y``.
        mu: mean of the approximate posterior q(z|x).
        log_var: tensor from which the posterior scale is derived as
            ``exp(2 * log_var)`` (the same transform the Encoder uses when
            sampling ``z``).
        weight: multiplier applied to the KL term (e.g. for KL warm-up).

    Returns:
        Tuple ``(-ELBO, mean KL, weight * mean KL)``.
    """
    # Reconstruction term, log p(x|z): sum over everything but the batch axis.
    likelihood = -binary_cross_entropy(y, t, reduction="none")
    likelihood = likelihood.view(likelihood.size(0), -1).sum(1)

    # Regularisation term: KL divergence between the approximate posterior
    # q(z|x) = N(mu, sigma) and the standard-normal prior p(z) = N(0, I).
    sigma = torch.exp(log_var * 2)
    q = Normal(mu, sigma)
    # Build the prior from mu so that it always shares mu's device and dtype
    # instead of depending on a global CUDA flag.
    p = Normal(torch.zeros_like(mu), torch.ones_like(mu))

    # kl_divergence(q, p) computes KL(q || p).
    kl_div = kl_divergence(q, p)
    mean_kl = kl_div.mean()  # mean instead of sum

    # Combine both terms (mean over the batch); the KL weight enables warm-up.
    ELBO = torch.mean(likelihood) - (weight * mean_kl)

    # Notice the minus sign: we minimise the negative ELBO.
    return -ELBO, mean_kl, weight * mean_kl



# Train encoder and decoder jointly with a single Adam optimizer.
# NOTE: from here on `params` is the list of trainable tensors, no longer the
# hyper-parameter dictionary defined near the top of the notebook.
loss_function = ELBO_loss
params = [*encoder.parameters(), *decoder.parameters()]
optimizer = optim.Adam(params, lr=0.001)

Testing if forward pass works

In [59]:
from torch.autograd import Variable  # no longer needed below; kept in case other cells still use it

# Build a dummy batch of two random one-hot piano rolls: for every time step
# exactly one randomly chosen pitch is set to 1.
d = np.zeros((2, TOTAL_NOTES, NUM_PITCHES))
for sample_idx in range(2):
    a = np.random.randint(NUM_PITCHES, size = TOTAL_NOTES)
    d[sample_idx, np.arange(TOTAL_NOTES), a] = 1
data, data1 = d[0], d[1]

print(d.shape)
# Plain tensors carry autograd information themselves, so no Variable wrapper.
x = torch.tensor(d, dtype = torch.float32, device = device)
z, mu, log_var = encoder(x)
outputs = decoder(z, x)


x_hat = outputs["x_hat"]
z = outputs["z"]

loss, kl, klw = loss_function(x_hat, x, mu, log_var, 1)

print('x shape:', x.shape)
# The label below says z_shape, but the value printed is the shape of x_hat.
print('z_shape', x_hat.shape)
print('z shape:', z.shape)
print(loss)
print(kl)


(2, 256, 61)
x shape: torch.Size([2, 256, 61])
z_shape torch.Size([2, 256, 61])
z shape: torch.Size([2, 256, 64])
tensor(1312.7104, grad_fn=<NegBackward>)
tensor(6.1399, grad_fn=<MeanBackward0>)


## DRAFTS

In [None]:
class VariationalAutoencoder(nn.Module):
    """Draft of a single module that holds encoder, conductor and decoder layers.

    Only the constructor is written in this draft. Besides its own arguments
    it reads ``input_size``, ``encoder_hidden_size``, ``n_layers_encoder``,
    ``decoders_initial_size``, ``dropout_rate`` and ``NUM_PITCHES`` as global
    names, so those must exist before the class is instantiated.

    Args:
        latent_features: size of the latent vector.
        teacher_forcing: stored on the instance as ``self.teacher_forcing``.
        eps_i: stored on the instance as ``self.eps_i``.
    """
    def __init__(self, latent_features, teacher_forcing, eps_i):
        super(VariationalAutoencoder, self).__init__()
        self.teacher_forcing = teacher_forcing
        self.eps_i = eps_i
        
        self.latent_features = latent_features
        
        # Bidirectional LSTM encoder.
        self.encoder = nn.LSTM(
            batch_first = True,
            input_size = input_size,
            hidden_size = encoder_hidden_size,
            num_layers = n_layers_encoder,
            bidirectional = True)
        
        # Maps the 2 * hidden wide bidirectional output to 2 * latent_features values.
        self.encoderOut = nn.Linear(2 * encoder_hidden_size, 2 * latent_features)
        # Maps a latent vector to the size the conductor takes as input.
        self.linear_z = nn.Linear(latent_features, decoders_initial_size)
        
        self.dropout = nn.Dropout(p = dropout_rate)
        self.worddropout = nn.Dropout2d(p = dropout_rate)
        
        # Conductor LSTM: input and hidden size are both decoders_initial_size.
        self.conductor = nn.LSTM(decoders_initial_size, 
                                 decoders_initial_size, 
                                 num_layers = 1,
                                 batch_first = True)
        
        # Note-level LSTM: input width is NUM_PITCHES + decoders_initial_size.
        self.decoder = nn.LSTM(NUM_PITCHES + decoders_initial_size, 
                               decoders_initial_size, 
                               num_layers = 1,
                               batch_first = True)
        
        # Projects the decoder hidden state to one value per pitch.
        self.linear = nn.Linear(decoders_initial_size, NUM_PITCHES)

   
       
        
            