# Creating a model and training it for generating and clustering music

In [1]:
#TODO: Create models: VAE, WAE, U-NET
#Save model weights
#Evaluate in this file?

In [2]:
#libraries
import deeplay as dl
import deeptrack as dt
import torch
import os
from PIL import Image, ImageOps
import torchvision.transforms as T
import matplotlib.pyplot as plt
import numpy as np



In [3]:
#Preprocess data
data_dir = os.path.expanduser("./trainImagesSmall")

#Load image files using ImageFolder
#trainFiles = dt.sources.ImageFolder(root=data_dir)
trainFilesSmall = dt.sources.ImageFolder(root=data_dir)

print(f"Number of train images: {len(trainFilesSmall)}")

Number of train images: 10000


In [4]:
#Create image pipeline
class CropWidth:
    def __init__(self, target_width):
        self.target_width = target_width

    def __call__(self, x: torch.Tensor):
        return x[..., :self.target_width]
    
image_pip = (dt.LoadImage(trainFilesSmall.path) >> dt.NormalizeMinMax()
             >> dt.MoveAxis(2, 0) >> dt.pytorch.ToTensor(dtype=torch.float) >> CropWidth(512
             ))

In [5]:
img_tensor = image_pip(trainFilesSmall.path[0])
print(f"The size of each image is: {img_tensor.shape}")

The size of each image is: torch.Size([1, 512, 512])


In [6]:
#VAE
from deeplay import AdamW #Specifically importing AdamW so we can easily change it to something else like SGDM //BD
import torch

#Even with very small beta the KL loss seems to decrease while the rec loss stays steady, suggesting the network is not able to learn the latent space //BD

from variationalAutoEncoder import VariationalAutoEncoder

vae = VariationalAutoEncoder(input_size=(512,512),
    latent_dim=50, channels=[32, 64, 128],
    reconstruction_loss=torch.nn.MSELoss(), beta=1, optimizer=AdamW(lr=1e-3)
).create()


vae.encoder.blocks[0].layer.kernel_size = (5,5)
vae.encoder.blocks[1].layer.kernel_size = (5,5)
vae.encoder.blocks[2].layer.kernel_size = (5,5)
vae.encoder.blocks[3].layer.kernel_size = (5,5)
vae.decoder.blocks[0].layer.kernel_size = (5,5)
vae.decoder.blocks[1].layer.kernel_size = (5,5)
vae.decoder.blocks[2].layer.kernel_size = (5,5)
vae.decoder.blocks[3].layer.kernel_size = (5,5)


print(vae)


VariationalAutoEncoder(
  (encoder): ConvolutionalEncoder2d(
    (blocks): LayerList(
      (0): Conv2dBlock(
        (layer): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (1): Conv2dBlock(
        (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (layer): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (2): Conv2dBlock(
        (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (layer): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (3): Conv2dBlock(
        (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (layer): Conv2d(128, 128, kernel_size=(5, 5), stride=(1, 1), padding=(1, 1))
        (activation): Identity()
      )
    )
    (postprocess): Flatten(start_dim=1, end_dim=-1)
 

In [7]:
import dataset #Imported modified dataset code
#train_dataset = dataset.Dataset(image_pip & image_pip, inputs=trainFiles)
train_dataset = dt.pytorch.Dataset(image_pip & image_pip, inputs = trainFilesSmall)
train_loader = dl.DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=15)

In [None]:
vae_trainer = dl.Trainer(max_epochs=20, accelerator="auto", accumulate_grad_batches=8)
vae_trainer.fit(vae, train_loader)

/home/rasmus/.local/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name                | Type                   | Params | Mode 
-----------------------------------------------------------------------
0 | encoder             | ConvolutionalEncoder2d | 240 K  | train
1 | fc_mu               | Linear                 | 26.2 M | train
2 | fc_var              | Linear                 | 26.2 M | train
3 | fc_dec              | Linear                 | 26.7 M | train
4 | decoder             | ConvolutionalDecoder2d | 326 K  | train
5 | reconstruction_loss | MSELoss                | 0      | train
6 | train_metrics       | MetricCollection       | 0      | train
7 | val_metrics         | MetricCollection       | 0      | train
8 | test_metrics        | MetricCollection       | 0      | train
9 | optimizer           | AdamW                  | 0      | train
--------------

Epoch 0:  50%|█████     | 314/625 [00:42<00:42,  7.34it/s, v_num=304, train_rec_loss_step=0.917, train_KL_penalty_step=2.25e+4, train_total_loss_step=2.25e+4]

### Plotting the training progress and saving the model

In [None]:
import torch
vae_trainer.print()
torch.save(vae.state_dict(), "vae.pth")  #Saving the model.
#vae.load_state_dict(torch.load("vae.pth"))  // Use this to load the weights into a new model //BD

In [None]:
#WAE, let us just focus on the VAE for now, later we can use this code

wae = dl.WassersteinAutoEncoder(
    channels=[32, 64, 128], latent_dim=20,
    reconstruction_loss=torch.nn.MSELoss(reduction="mean"),
).create()

#print(wae)

In [None]:
#Example on how to generate music, just random noice now but with proper training and clustering we could try to sample things adjacent to a genre //BD
"""
--------------------------
THIS IS OUTDATED DON'T USE
--------------------------
"""
import torch
from Image2Sound import Image2Sound, SaveAudio
from torchvision.utils import save_image

vae.eval()  
n_samples = 1 
latent_dim = 20  
#Had to add a scalar to make the volume higher, think because of the normalization in the pipeline //BD
z = 255*torch.randn(n_samples, latent_dim).to(next(vae.parameters()).device)
print(z.shape)

with torch.no_grad():
    generated_image = vae.decode(z)
    #TODO: Change the Image2Sound so we don't have to define conf again here when generating, should be a simple enough fix. Alternatively just make a file holding the class
    #with a proper init //BD
    class conf:
        sampling_rate = 44100
        duration = 30
        hop_length = 694
        fmin = 20
        fmax = sampling_rate // 2
        n_mels = 128
        n_fft = n_mels * 20
        samples = sampling_rate * duration
    save_image(generated_image, 'generated_sample.jpg')
    audio = Image2Sound('generated_sample.jpg', conf)
    SaveAudio(audio,os.getcwd(),"testing.mp3")
