In [1]:
import numpy as np
import glob
import pypianoroll as ppr
import time
import music21
import os
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from utils.utils import *
#np.set_printoptions(threshold=np.inf)
#torch.set_printoptions(threshold=50000)

In [2]:
# ---------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------
# Optimisation settings.
learning_rate = 5e-4   # step size passed to the optimizer
epochs = 1             # number of train + test passes to run

# Batching / reporting settings.
batch_size = 98        # samples per DataLoader batch
log_interval = 1       # print the training loss every `log_interval` batches

# Load MIDI files from npz

In [3]:
# Load the train/test splits stored in the .npz archive.
# The context manager releases the archive handle even when a key lookup
# raises; indexing the archive reads each array into memory, so both arrays
# stay usable after the archive is closed.
with np.load('../WikifoniaPartlyNoTranspose.npz') as data:
    midiDatasetTrain = data['train']
    midiDatasetTest = data['test']

print("Training set: {}".format(midiDatasetTrain.shape))
print("Test set: {}".format(midiDatasetTest.shape))

Training set: (4056, 1, 96, 60)
Test set: (1034, 1, 96, 60)


In [4]:
#print(getSlicedPianorollMatrix('WikifoniaServer/train80/Ahmad-Jamal---Poinciana.mid').shape)

In [5]:
# Presumably the size of the full MIDI pitch axis (128 pitches); it is not
# referenced again in the visible cells.
fullPitch = 128
# The training array is 4-D; keep its last two dimensions so single samples can
# be reshaped to the model's input layout later on.
# NOTE(review): presumably (time steps per slice, reduced pitch range) — confirm
# against the preprocessing that produced the .npz file.
_, _, length, reducedPitch = midiDatasetTrain.shape

# CDVAE

In [6]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# torch.as_tensor wraps a NumPy array without copying and returns an existing
# tensor unchanged, so this cell can be re-run safely. torch.from_numpy raises
# once the name already holds a tensor from a previous run.
midiDatasetTrain = torch.as_tensor(midiDatasetTrain)
# Training keeps shuffle + drop_last: every batch then has exactly batch_size
# samples, so BatchNorm never sees a batch of one in training mode.
trainLoader = torch.utils.data.DataLoader(midiDatasetTrain, batch_size=batch_size, shuffle=True, drop_last=True)

midiDatasetTest = torch.as_tensor(midiDatasetTest)
# Evaluation must be deterministic and cover every sample. Shuffling combined
# with drop_last would silently discard a different random subset on each pass.
testLoader = torch.utils.data.DataLoader(midiDatasetTest, batch_size=batch_size, shuffle=False, drop_last=False)

In [14]:
class CDVAE(nn.Module):
    """Convolutional autoencoder for single-channel 96x60 inputs.

    The encoder reduces a ``(N, 1, 96, 60)`` batch to a 100-dimensional code
    with four strided convolutions followed by three fully connected layers.
    The decoder mirrors those stages with transposed convolutions and returns
    a tensor with the same shape as the input.

    Despite the class name, ``forward`` is deterministic: there is no sampling
    step and no variance head; the code itself is returned as ``mu``.

    Sub-module names and layer order are part of the ``state_dict`` layout and
    must stay stable for previously saved weights to load.
    """

    def __init__(self):
        super().__init__()

        ###ENCODER###
        # Feature-map sizes for a (N, 1, 96, 60) input are noted per layer.
        self.encode1 = nn.Sequential(
            nn.Conv2d(1,100,(16,5),stride=(16,5),padding=0),    # -> (N, 100, 6, 12)
            nn.BatchNorm2d(100),
            nn.ELU(),
            nn.Conv2d(100,200,(2,1),stride=(2,1),padding=0),    # -> (N, 200, 3, 12)
            nn.BatchNorm2d(200),
            nn.ELU(),
            nn.Conv2d(200,400,(2,2),stride=(1,2),padding=0),    # -> (N, 400, 2, 6)
            nn.BatchNorm2d(400),
            nn.ELU(),
            nn.Conv2d(400,800,(2,2),stride=(2,2),padding=0),    # -> (N, 800, 1, 3)
            nn.BatchNorm2d(800),
            nn.ELU()
        )

        # 800 channels * 1 * 3 positions = 2400 features per sample.
        self.encode2 = nn.Sequential(
            nn.Linear(2400,800),
            nn.BatchNorm1d(800),
            nn.ELU(),
            nn.Linear(800,400),
            nn.BatchNorm1d(400),
            nn.ELU(),
            nn.Linear(400,100),
            nn.BatchNorm1d(100),
            nn.ELU()
        )

        ###DECODER###
        self.decode1 = nn.Sequential(
            nn.Linear(100,400),
            nn.BatchNorm1d(400),
            nn.ELU(),
            nn.Linear(400,800),
            nn.BatchNorm1d(800),
            nn.ELU(),
            nn.Linear(800,2400),
            nn.BatchNorm1d(2400),
            nn.ELU()
        )
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(800,400,(2,2),stride=(2,2),padding=0),   # -> (N, 400, 2, 6)
            nn.BatchNorm2d(400),
            nn.ELU(),
            nn.ConvTranspose2d(400,200,(2,2),stride=(1,2),padding=0),   # -> (N, 200, 3, 12)
            nn.BatchNorm2d(200),
            nn.ELU(),
            nn.ConvTranspose2d(200,100,(2,1),stride=(2,1),padding=0),   # -> (N, 100, 6, 12)
            nn.BatchNorm2d(100),
            nn.ELU(),
            nn.ConvTranspose2d(100,1,(16,5),stride=(16,5),padding=0),   # -> (N, 1, 96, 60)
            nn.BatchNorm2d(1),
            nn.ELU()
        )

    def encoder(self, x):
        """Map a ``(N, 1, 96, 60)`` batch to ``(N, 100)`` codes.

        Raises:
            RuntimeError: if the convolutional stack does not yield exactly
                2400 features per sample (i.e. the input has the wrong size).
        """
        hEnc = self.encode1(x)
        # Flatten per sample while keeping the batch dimension fixed.
        # A batch-inferring view(-1, 2400) would silently fold surplus
        # features into extra "samples" for wrongly sized inputs; with the
        # batch axis pinned, the first Linear layer rejects them instead.
        hEnc = hEnc.flatten(start_dim=1)
        hEnc = self.encode2(hEnc)
        return hEnc

    def decoder(self, z):
        """Map ``(N, 100)`` codes back to ``(N, 1, 96, 60)`` reconstructions."""
        hDec = self.decode1(z)
        # (N, 2400) -> (N, 800, 3) -> (N, 800, 1, 3): the layout produced by
        # the last encoder convolution.
        hDec = hDec.view(hDec.size()[0],800,-1).unsqueeze(2)
        hDec = self.decode2(hDec)
        return hDec

    def forward(self, x):
        """Return ``(reconstruction, code)`` for the input batch ``x``."""
        mu = self.encoder(x)
        return self.decoder(mu), mu
    
# Build the network, move it to the selected device, then give its
# parameters to an Adam optimizer.
model = CDVAE()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)


def loss_function(recon_x, x, mu, dataset):
    """Cosine-distance reconstruction loss, summed over the batch.

    Every sample and its reconstruction are flattened to vectors and compared
    with cosine similarity. The loss is
    ``batch_size - sum_i cos(x_i, recon_x_i)``, so it is 0 when every
    reconstruction points in the same direction as its input.

    Args:
        recon_x: Reconstructions; first dimension is the batch.
        x: Original inputs, with the same number of elements per sample.
        mu: Unused. Accepted so existing call sites keep working.
        dataset: Unused. Accepted so existing call sites keep working.

    Returns:
        A tensor of shape ``(1,)`` holding the summed loss. It supports
        ``.backward()`` and ``.item()`` when ``recon_x`` carries gradients.
    """
    batchSize = x.size()[0]
    if batchSize == 0:
        # Nothing to compare; avoids an ambiguous reshape on empty input.
        return x.new_zeros(1)

    # One vectorised call replaces the per-sample Python loop.
    # dim=1 and eps match the nn.CosineSimilarity settings used previously.
    cosSim = F.cosine_similarity(x.reshape(batchSize, -1),
                                 recon_x.reshape(batchSize, -1),
                                 dim=1, eps=1e-8)

    # keepdim keeps the (1,)-shaped result that callers have always received.
    return batchSize - cosSim.sum(dim=0, keepdim=True)
        

def train(epoch):
    """Run one optimisation pass over ``trainLoader``.

    Puts the model in training mode, performs one optimizer step per batch,
    prints the per-sample loss every ``log_interval`` batches, and finally
    prints the average per-sample loss for the epoch.

    Args:
        epoch: Epoch number; used only in the printed messages.
    """
    model.train()
    trainLoss = 0
    # Count the samples that actually went through the model: with
    # drop_last=True the loader yields fewer samples than len(dataset), so
    # dividing by the dataset size would understate the average loss.
    samplesSeen = 0

    for batch_idx, data in enumerate(trainLoader):
        ###DENOISING AUTOENCODER
        #data = data.float().to(device)
        #noise = torch.bernoulli((torch.rand_like(data))).to(device)
        #noisyData = data+noise
        #optimizer.zero_grad()
        #reconBatch, mu = model(noisyData)
        ###

        ###NORMAL AUTOENCODER
        data = data.float().to(device)
        optimizer.zero_grad()
        reconBatch, mu = model(data)
        ###

        loss = loss_function(reconBatch, data, mu, trainLoader.dataset)
        loss.backward()
        batchLoss = loss.item()
        trainLoss += batchLoss
        samplesSeen += len(data)
        optimizer.step()
        if(batch_idx % log_interval == 0):
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(trainLoader.dataset),
                100. * batch_idx / len(trainLoader),
                batchLoss / len(data)))

    # max(..., 1) keeps the report well-defined if the loader yielded nothing.
    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, trainLoss / max(samplesSeen, 1)))

def test(epoch):
    """Evaluate the model on ``testLoader`` and print the average loss.

    Puts the model in evaluation mode and disables gradient tracking.

    Args:
        epoch: Unused; kept for symmetry with ``train`` and existing callers.
    """
    model.eval()
    testLoss = 0
    # Average over the samples actually evaluated; the loader may yield fewer
    # than len(dataset) when it drops an incomplete final batch.
    samplesSeen = 0
    with torch.no_grad():
        for data in testLoader:
            ###DENOISING AUTOENCODER
            #data = data.float().to(device)
            #noise = torch.bernoulli((torch.rand_like(data))).to(device)
            #noisyData = data+noise
            #reconBatch, mu = model(noisyData)
            ###

            ###NORMAL AUTOENCODER
            data = data.float().to(device)
            reconBatch, mu = model(data)

            # Pass the real dataset, mirroring train(), instead of allocating
            # a placeholder tensor on every batch.
            testLoss += loss_function(reconBatch, data, mu, testLoader.dataset).item()
            samplesSeen += len(data)

    # max(..., 1) keeps the report well-defined if the loader yielded nothing.
    testLoss /= max(samplesSeen, 1)

    print('====> Test set loss: {:.4f}'.format(testLoss))

In [15]:
"""
#LOAD MODEL
pathToModel = 'model/YamahaPianoCompetition2002Transposedby60_10Epochs.model'

try:
    #LOAD TRAINED MODEL INTO GPU
    if(torch.cuda.is_available()):
        model = torch.load(pathToModel)
        
    #LOAD MODEL TRAINED ON GPU INTO CPU
    else:
        model = torch.load(pathToModel, map_location=lambda storage, loc: storage)
    print("\n--------model restored--------\n")
except:
    print("\n--------no saved model found--------\n")
"""
print('')




In [16]:
# One training pass followed by one evaluation pass per epoch; epochs are
# numbered from 1 so the printed messages start at "Epoch: 1".
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)

tensor([0.0045], grad_fn=<DivBackward1>)
tensor([0.0054], grad_fn=<DivBackward1>)
tensor([0.0039], grad_fn=<DivBackward1>)
tensor([-0.0127], grad_fn=<DivBackward1>)
tensor([0.0094], grad_fn=<DivBackward1>)
tensor([-0.0035], grad_fn=<DivBackward1>)
tensor([-0.0136], grad_fn=<DivBackward1>)
tensor([0.0005], grad_fn=<DivBackward1>)
tensor([0.0164], grad_fn=<DivBackward1>)
tensor([0.0029], grad_fn=<DivBackward1>)
tensor([0.0203], grad_fn=<DivBackward1>)
tensor([-0.0020], grad_fn=<DivBackward1>)
tensor([-0.0066], grad_fn=<DivBackward1>)
tensor([0.0043], grad_fn=<DivBackward1>)
tensor([-0.0204], grad_fn=<DivBackward1>)
tensor([-0.0105], grad_fn=<DivBackward1>)
tensor([0.0077], grad_fn=<DivBackward1>)
tensor([0.0203], grad_fn=<DivBackward1>)
tensor([0.0077], grad_fn=<DivBackward1>)
tensor([-0.0064], grad_fn=<DivBackward1>)
tensor([-0.0013], grad_fn=<DivBackward1>)
tensor([0.0073], grad_fn=<DivBackward1>)
tensor([-0.0087], grad_fn=<DivBackward1>)
tensor([-0.0103], grad_fn=<DivBackward1>)
tenso

tensor([0.0263], grad_fn=<DivBackward1>)
tensor([-0.0971], grad_fn=<DivBackward1>)
tensor([-0.1399], grad_fn=<DivBackward1>)
tensor([-0.1200], grad_fn=<DivBackward1>)
tensor([-0.0429], grad_fn=<DivBackward1>)
tensor([-0.1249], grad_fn=<DivBackward1>)
tensor([-0.1558], grad_fn=<DivBackward1>)
tensor([-0.2104], grad_fn=<DivBackward1>)
tensor([-0.1415], grad_fn=<DivBackward1>)
tensor([-0.1698], grad_fn=<DivBackward1>)
tensor([-0.2422], grad_fn=<DivBackward1>)
tensor([-0.0655], grad_fn=<DivBackward1>)
tensor([-0.2098], grad_fn=<DivBackward1>)
tensor([-0.0277], grad_fn=<DivBackward1>)
tensor([-0.2228], grad_fn=<DivBackward1>)
tensor([-0.1727], grad_fn=<DivBackward1>)
tensor([-0.0854], grad_fn=<DivBackward1>)
tensor([-0.0983], grad_fn=<DivBackward1>)
tensor([-0.1673], grad_fn=<DivBackward1>)
tensor([-0.1896], grad_fn=<DivBackward1>)
tensor([-0.1459], grad_fn=<DivBackward1>)
tensor([-0.1123], grad_fn=<DivBackward1>)
tensor([-0.1417], grad_fn=<DivBackward1>)
tensor([-0.1361], grad_fn=<DivBackw

tensor([-0.2794], grad_fn=<DivBackward1>)


KeyboardInterrupt: 

In [None]:
#torch.save(model,'/media/EXTHD/niciData/models/DougMcKenzie14up14down.model')

In [None]:
# Show the weight shape of the encoder's first convolution.
firstConv = model.encode1[0]
print(firstConv.weight.shape)

In [None]:
# Print arrays in full (no "..." summarisation), rounded to two decimals and
# without scientific notation.
np.set_printoptions(threshold=np.inf, suppress=True, precision=2)


In [None]:

###PLAY WHOLE SONG IN BARS
# Every forward pass below has batch size 1. BatchNorm1d rejects that in
# training mode, which is the state the model is left in when training is
# interrupted, so switch to evaluation mode explicitly.
model.eval()
with torch.no_grad():

    sampleNp1 = getSlicedPianorollMatrixNp("../DougMcKenzieFiles_noDrums/samples/KingandI.mid")
    for sampleNp in sampleNp1:

        # Skip slices that contain no activity at all.
        if(np.any(sampleNp)):
            # Crop the second axis to the range the model was trained on.
            # NOTE(review): assumes a 128-wide pitch axis, so 128 - 36 - 32 = 60
            # columns remain (= reducedPitch) — confirm against the helper.
            sampleNp = sampleNp[:,36:-32]

            # Move the sample to the model's device before adding noise.
            # Previously the sample stayed on the CPU while the noise was
            # moved to `device`, so the addition failed on CUDA.
            sample = torch.from_numpy(sampleNp).float().to(device)
            # NOTE(review): the model is fed a noisy copy, while the "INPUT"
            # played back below is the clean slice — confirm this is intended.
            noise = torch.bernoulli(torch.rand_like(sample))
            sample = sample + noise
            pred, embed = model(sample.reshape(1,1,length,reducedPitch))
            prediction = pred.squeeze(0).squeeze(0).cpu().numpy()

            #NORMALIZE PREDICTIONS
            # Guard against a zero peak, which would fill the array with NaN/inf.
            peak = np.abs(np.max(prediction))
            if peak > 0:
                prediction /= peak

            #CHECK MIDI ACTIVATIONS IN PREDICTION TO INCLUDE RESTS
            prediction[prediction < 0.3] = 0

            ###MONOPHONIC OUTPUT MATRIX, POLYPHONIC POSSIBLE WITH ACTIVATION THRESHOLD###
            samplePlay = debinarizeMidi(sampleNp, prediction=False)
            samplePlay = addCuttedOctaves(samplePlay)
            prediction = debinarizeMidi(prediction, prediction=True)
            prediction = addCuttedOctaves(prediction)

            print("INPUT")
            pianorollMatrixToTempMidi(samplePlay,show=True,showPlayer=True,autoplay=True)
            print("RECONSTRUCTION")
            pianorollMatrixToTempMidi(prediction,show=True,showPlayer=True,autoplay=True)
            print("\n\n")


print('')