In [5]:
import dataloaders.mp3_dataloader
import librosa
import torch

In [2]:
# Dataset configuration: location of the audio files and how clips are produced.
sample_path = 'data/'
sample_rate = 16000        # passed to the dataset as SR and reused as the playback rate
sample_length = 1024*10    # presumably the clip length in samples; also the model's input_size -- TODO confirm

# `import dataloaders.mp3_dataloader` binds only the top-level name `dataloaders`,
# so a bare `mp3_dataloader` raises NameError on a fresh kernel. Reach the class
# through its fully-qualified path instead.
dataset = dataloaders.mp3_dataloader.MP3Dataset(sample_path, sample_length=sample_length, SR=sample_rate)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True)


In [3]:
import IPython.display as ipd

# Pull a single batch from the loader, then render an inline audio player
# for the first element of that batch.
batch_iterator = iter(dataloader)
sample = next(batch_iterator)
first_clip = sample[0].numpy()
ipd.Audio(first_clip, rate=sample_rate)


In [4]:
import models.autoencoder
import torch.optim as optim
import torch.nn as nn

# `import models.autoencoder` binds only the top-level name `models`; a bare
# `autoencoder` is undefined on a fresh kernel, so use the fully-qualified path.
model = models.autoencoder.VAE_Audio(input_size=sample_length, latent_size=128)
def vae_loss(x, x_hat, mu, logvar):
    """Return the VAE training objective as a scalar tensor.

    The result is the sum of two terms:
      * the squared error between ``x_hat`` and ``x``, summed over every element;
      * ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``, the KL-divergence term.

    Both terms are summed (not averaged), so the value grows with the number
    of elements in the inputs.
    """
    reconstruction = nn.functional.mse_loss(x_hat, x, reduction='sum')
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_elements)
    return reconstruction + kl_divergence

optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): a previous run of this cell logged the step-0 loss rising from
# ~6.0e6 (epoch 0) to ~4.8e8 (epoch 1), i.e. training was diverging. The loss is
# summed over the whole batch; consider normalising it, lowering the learning
# rate, or clipping gradients -- TODO investigate before a long run.

# Put the model in training mode explicitly so this cell behaves the same when
# it is re-run after the evaluation cell has called model.eval().
model.train()

for epoch in range(10):
    for i, sample in enumerate(dataloader):
        optimizer.zero_grad()
        x = sample
        x_hat, mu, logvar = model(x)
        loss = vae_loss(x, x_hat, mu, logvar)
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print(f'epoch {epoch} step {i} loss {loss.item()}')
            # Only a cell's final top-level expression is auto-rendered; a bare
            # ipd.Audio(...) inside a loop is built and discarded. Call display()
            # so the reconstructed clip actually shows up as a player.
            ipd.display(ipd.Audio(x_hat[0].detach().numpy(), rate=sample_rate))
    # Checkpoint the weights once per epoch (overwrites the previous file).
    torch.save(model.state_dict(), 'model.pth')

epoch 0 step 0 loss 5969549.5
epoch 1 step 0 loss 477903680.0


KeyboardInterrupt: 

In [None]:
## Test and play results
# Switch to evaluation mode, reconstruct one batch, and play the first
# reconstructed clip inline.
model.eval()
sample = next(iter(dataloader))
x = sample
# No gradients are needed for playback; disabling autograd avoids building and
# holding a computation graph for the whole batch.
with torch.no_grad():
    x_hat, mu, logvar = model(x)
# Kept as the final top-level expression so the notebook renders the player.
ipd.Audio(x_hat[0].numpy(), rate=sample_rate)