# Creating a model and training it for generating and clustering music

In [1]:
# TODO: Create models: VAE, WAE, U-Net
# TODO: Save model weights
# TODO: Decide whether evaluation should happen in this file

In [2]:
#libraries
import deeplay as dl
import deeptrack as dt
import torch
import os
from PIL import Image, ImageOps
import torchvision.transforms as T
import matplotlib.pyplot as plt
import numpy as np


    pip install deeptrack==1.7

For more details, refer to the DeepTrack documentation.


In [3]:
# Preprocess data: point at the directory that holds the training images.
data_dir = os.path.expanduser("./trainImages")

# Index the image files found under data_dir with DeepTrack's ImageFolder source.
trainFiles = dt.sources.ImageFolder(root=data_dir)

# Report how many files were indexed, as a quick check that the folder was found.
n_train_images = len(trainFiles)
print(f"Number of train images: {n_train_images}")

Number of train images: 24981


In [4]:
#Create image pipeline
class CropWidth:
    """Callable pipeline step that trims a tensor along its last axis.

    Parameters
    ----------
    target_width : int
        Number of leading entries to keep along the last dimension.
    """

    def __init__(self, target_width):
        self.target_width = target_width

    def __call__(self, x: torch.Tensor):
        """Return ``x`` restricted to its first ``target_width`` columns.

        The input is assumed to have shape [C, H, W]. Only the last axis is
        sliced; every leading axis is kept in full. Ordinary slice semantics
        apply, so an input narrower than ``target_width`` keeps its width.
        """
        keep = slice(None, self.target_width)
        return x[..., keep]
    
# Image pipeline, applied stage by stage from top to bottom:
#   load the image file -> min-max normalise -> move axis 2 to position 0
#   -> convert to a float torch tensor -> keep the first 644 columns.
image_pip = (
    dt.LoadImage(trainFiles.path)
    >> dt.NormalizeMinMax()
    >> dt.MoveAxis(2, 0)
    >> dt.pytorch.ToTensor(dtype=torch.float)
    >> CropWidth(644)
)

In [5]:
# Run the pipeline on the first training file to check the resulting tensor shape.
first_image_path = trainFiles.path[0]
img_tensor = image_pip(first_image_path)
print(f"The size of each image is: {img_tensor.shape}")

The size of each image is: torch.Size([1, 256, 644])


In [6]:
# VAE
# Release cached GPU memory that is no longer in use before building the model.
torch.cuda.empty_cache()

# Don't use reduction="sum" for the reconstruction loss: the gradients become
# huge and the memory issues get worse. //BD
vae = dl.VariationalAutoEncoder(
    # (height, width) of the training tensors. image_pip crops the width to
    # 644 and the printed image shape is [1, 256, 644], so the model is sized
    # to the same value instead of the earlier, mismatched 647.
    input_size=(256, 644),
    latent_dim=20,
    channels=[32, 64],
    reconstruction_loss=torch.nn.BCELoss(reduction="mean"),
    beta=1,
    optimizer=dl.AdamW(lr=0.001),
).create()

print(vae)

VariationalAutoEncoder(
  (encoder): ConvolutionalEncoder2d(
    (blocks): LayerList(
      (0): Conv2dBlock(
        (layer): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (1): Conv2dBlock(
        (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (layer): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (2): Conv2dBlock(
        (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (layer): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (activation): Identity()
      )
    )
    (postprocess): Flatten(start_dim=1, end_dim=-1)
  )
  (fc_mu): Linear(in_features=659456, out_features=20, bias=True)
  (fc_var): Linear(in_features=659456, out_features=20, bias=True)
  (fc_dec): Linear(in_features=20, out_features=659456, bias=True)
  (decoder): ConvolutionalDecoder2d

In [7]:
# Open question from L-Thor: Giovanni passed `image_pip & image_pip` here, and
# it is not clear why. BD's guess: one copy is used for the mean and one for
# the variance.
# NOTE(review): more likely the dataset yields (input, target) pairs and the
# autoencoder is trained to reconstruct its own input, so the same pipeline is
# supplied for both -- confirm against the DeepTrack / deeplay documentation.
train_dataset = dt.pytorch.Dataset(
    image_pip & image_pip,
    inputs=trainFiles,
)
train_loader = dl.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)

In [8]:
# NOTE: Training works without errors, but it is VERY slow and my computer
# semi-freezes while it runs. Perhaps something is wrong, or my computer is
# trash, or maybe we just have to thug it out. // L-Thor
# Memory issues: memory does not seem to be properly reused after each batch.
# Larger batches use less memory, which makes no sense, so something is
# definitely wrong. //BD
vae_trainer = dl.Trainer(
    max_epochs=25,
    accelerator="auto",
)
vae_trainer.fit(vae, train_loader)

/home/rasmus/.local/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
2025-04-29 18:12:57.932320: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-29 18:12:57.940625: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745943177.951133   23528 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745943177.954263   23528 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin 

Epoch 24: 100%|██████████| 781/781 [01:47<00:00,  7.23it/s, v_num=50, train_rec_loss_step=0.618, train_KL_step=44.70, train_total_loss_step=45.30, train_rec_loss_epoch=0.604, train_KL_epoch=23.50, train_total_loss_epoch=24.10]       


### Plotting the training progress and saving the model

In [9]:
# Plot the loss curves recorded during training.
# (`vae_trainer.print()` with no arguments only emits an empty line, so it
# never showed the training progress this section is about.)
vae_trainer.history.plot()

# Save only the learned weights (the state dict), not the whole model object.
torch.save(vae.state_dict(), "vae.pth")
# To restore the weights into an identically configured model, use:
# vae.load_state_dict(torch.load("vae.pth"))  //BD






In [None]:
# WAE: kept for later use -- the focus is on the VAE for now.
# NOTE(review): no input_size is passed here, unlike the VAE construction;
# confirm the default suits the 256x644 training images before training this.
wae = dl.WassersteinAutoEncoder(
    latent_dim=20,
    channels=[32, 64, 128],
    reconstruction_loss=torch.nn.MSELoss(reduction="mean"),
).create()

# print(wae)

In [None]:
# Example of how to generate music. It is just random noise for now, but with
# proper training and clustering we could try to sample latent points adjacent
# to a genre. //BD

import torch
from Image2Sound import Image2Sound, SaveAudio
from torchvision.utils import save_image


# TODO: Change Image2Sound so that conf does not have to be redefined here when
# generating; should be a simple enough fix. Alternatively, make a file holding
# the class with a proper init. //BD
class conf:
    sampling_rate = 44100
    duration = 30
    hop_length = 694
    fmin = 20
    fmax = sampling_rate // 2
    n_mels = 128
    n_fft = n_mels * 20
    samples = sampling_rate * duration


# Switch the model to evaluation mode before sampling.
vae.eval()
n_samples = 1
latent_dim = 20  # same value as the latent_dim the VAE was built with

# Had to add a scalar to make the volume higher; probably because of the
# normalization in the pipeline. //BD
# NOTE(review): multiplying z by 255 moves the samples far away from a
# standard normal draw -- confirm this is intended rather than rescaling the
# decoded image.
device = next(vae.parameters()).device
z = 255 * torch.randn(n_samples, latent_dim).to(device)
print(z.shape)

with torch.no_grad():
    # Decode the latent sample to an image, write it to disk, then convert the
    # saved image to audio and store it in the current working directory.
    generated_image = vae.decode(z)
    save_image(generated_image, 'generated_sample.jpg')
    audio = Image2Sound('generated_sample.jpg', conf)
    SaveAudio(audio, os.getcwd(), "testing.mp3")
