# Set up model and hyperparameters

In [1]:
from models.WaveNetVAE.WaveVae import WaveNetVAE
from models.WaveNetVAE.WVData import WVDataset
import torch
from torch.utils.data import DataLoader
import warnings
warnings.filterwarnings("ignore")

"""
Hyperparameters
"""

learning_rate = 0.000001
batchsize = 8
device='cuda'
input_size = (40, 112)
upsamples = [2, 2, 2, 2, 2, 2, 2, 2]
zsize = 32

WaveVAE = WaveNetVAE(input_size,
                     num_hiddens = 768,
                     upsamples = upsamples,
                     zsize = zsize,
                    out_channels = 256)

WaveVAE.to(device)

VAEDataset = WVDataset(audio_path = "../ConvDenoiser_Old/clips",
                       length = 4096,
                       skip_size = 4096 // 2,
                       sample_rate = 24000,
                       max_files = 200,
                       hop_length = 128)

val_VAEDataset = WVDataset(audio_path = "../ConvDenoiser_Old/testdata",
                       length = 4096,
                       skip_size = 4096 // 2,
                       sample_rate = 24000,
                       max_files = 200,
                       hop_length = 128)

VAEDataloader = DataLoader(VAEDataset,
                           batch_size = batchsize,
                           shuffle = True)

val_VAEDataloader = DataLoader(val_VAEDataset,
                           batch_size = batchsize,
                           shuffle = False)

OSError: [WinError 127] The specified procedure could not be found. Error loading "C:\Users\woute\.conda\envs\2Diff\lib\site-packages\torch\lib\torch_cuda_cpp.dll" or one of its dependencies.

# Test tensor

In [12]:
import torchaudio

loss_fn = torch.nn.MSELoss()
onehot, mfcc, target = next(iter(VAEDataloader))
onehot = onehot.to(device)
mfcc = mfcc.to(device)
target = target.to(device)

print("Trying tensors with sizes:")
print("Onehot size: ", onehot.size(), "| MFCC size: ", mfcc.size(), "| Target size: ", target.size())

output, _, _ = WaveVAE(onehot, mfcc, True)
print("Tensors passed through model succesfully", output.size())

reconstruction_loss = loss_fn(output[:, :, -1], target)
print("Reconstruction loss =", reconstruction_loss.item())

# onehot = torch.nn.functional.sigmoid(onehot[0, :, -1])
# output = torch.nn.functional.sigmoid(output[0, :, -1])
# onehotmax = torch.argmax(onehot)
# outputmax = torch.argmax(output)

print(onehot[0, :, -1], output[0, :, -1], target[0])




Trying tensors with sizes:
Onehot size:  torch.Size([8, 1, 4096]) | MFCC size:  torch.Size([8, 40, 33]) | Target size:  torch.Size([8])
Tensors passed through model succesfully torch.Size([8, 1, 4096])
Reconstruction loss = 0.021551592275500298
tensor([-0.0264], device='cuda:0') tensor([0.0110], device='cuda:0', grad_fn=<SelectBackward0>) tensor(-0.0238, device='cuda:0')


# Start training

In [2]:
from models.WaveNetVAE.train import train
from torch.utils.tensorboard import SummaryWriter
import warnings
warnings.filterwarnings("ignore")
writer = SummaryWriter()

train(WaveVAE, VAEDataloader, val_VAEDataloader, 
      writer = writer, 
      learning_rate = learning_rate,
      epoch_amount = 100,
      logs_per_epoch = 5,
      kl_anneal = 0.01,
      max_kl = 0.5,
      device = device)


Training. Epoch: 0. Loss for step 1: n.v.t.:   0%|          | 0/1051 [00:00<?, ?it/s]

RuntimeError: Given groups=1, weight of size [768, 256, 1], expected input[8, 1, 4096] to have 256 channels, but got 1 channels instead

# Generate Audio

In [None]:
val_VAEDataset = WVDataset(audio_path = "../ConvDenoiser_Old/testdata",
                       length = 4096,
                       skip_size = 1,
                       sample_rate = 24000,
                       max_files = 1,
                       hop_length = 128)

val_VAEDataloader = DataLoader(val_VAEDataset,
                           batch_size = 1,
                           shuffle = False)

generated_audio = WaveVAE.inference(val_VAEDataloader)