# An Improved VAE

## Table of Contents
1. [Data and Model Definitions](Data-and-Model-Definitions)
2. [Audio Analysis](#Audio-Analysis)

In [39]:
import librosa, os #audio processing and file system parsing
import librosa.display
import numpy as np #math library
import tensorflow as tf #for model building
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Lambda, Reshape, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt # for visualization
import pandas as pd #for data analysis / prep
import IPython.display as ipd #for sound output
from scipy.spatial.distance import cosine

## Data and Model Definitions

In this section we define our core data processing techniques and VAE model structure.

In [19]:
# Define hop_length (adjust as needed)
HOP_LENGTH = 512
TIME_FRAMES = 1290

def load_data(audio_dir: str, n_mels: int=128, time_frames: int=216):
    """Processes a directory of audio files into a DataFrame which contains the ground truth audio, ground truth spectrogram, and normalization parameters min_max for each song."""
    audio_files = os.listdir(audio_dir)
    audio_data = {'filename': [], 'gt_audio': [], 'gt_spectrogram': [], 'min_max': []}

    for file in audio_files:
        try:
            original_audio, sampling_rate = librosa.load(os.path.join(audio_dir, file), sr=None)  # Preserve original sampling rate

            mel_spectrogram = librosa.feature.melspectrogram(y=original_audio, sr=sampling_rate, n_mels=n_mels, hop_length=HOP_LENGTH)
            S_dB = librosa.power_to_db(mel_spectrogram, ref=np.max)  # Normalize based on max power
            
            # Save min/max values for proper denormalization later
            S_dB_min = np.min(S_dB)
            S_dB_max = np.max(S_dB)
            
            # Normalize to [0,1]
            S_dB = (S_dB - S_dB_min) / (S_dB_max - S_dB_min)

            # **Truncate or pad spectrogram to $time frames**
            if S_dB.shape[1] < TIME_FRAMES:
                pad_width = TIME_FRAMES - S_dB.shape[1]
                S_dB = np.pad(S_dB, pad_width=((0, 0), (0, pad_width)), mode='constant')
            elif S_dB.shape[1] > TIME_FRAMES:
                S_dB = S_dB[:, :TIME_FRAMES]
            
            # Store data
            audio_data['filename'].append(file)
            audio_data['gt_audio'].append(original_audio)
            audio_data['gt_spectrogram'].append(S_dB)
            audio_data['min_max'].append((S_dB_min, S_dB_max))  # Save for denormalization
        
        except Exception as e:
            print(f"Skipping file {file} due to error: {e}")
    
    return pd.DataFrame(audio_data)

def reconstruct_audio(spectrogram, sr=22050, n_mels=128, min_max=None):
    """Reconstructs audio from a normalized Mel spectrogram. Utilizes parameters from normalization to reverse process!"""
    S_dB_min, S_dB_max = min_max # Retrieve stored min/max values
    S_dB = spectrogram * int(S_dB_max - S_dB_min) + S_dB_min  # Reverse normalization
    S = librosa.db_to_power(S_dB) # Convert back to power scale
    reconstructed_audio = librosa.feature.inverse.mel_to_audio(S, sr=sr, hop_length=HOP_LENGTH) # Convert Mel spectrogram back to waveform using the correct hop_leng
    return reconstructed_audio

In [3]:
jazz_df = load_data("../data/kaggle/genres_original/jazz/")
jazz_df

  original_audio, sampling_rate = librosa.load(os.path.join(audio_dir, file), sr=None)  # Preserve original sampling rate
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipping file jazz.00054.wav due to error: 


Unnamed: 0,filename,gt_audio,gt_spectrogram,min_max
0,jazz.00080.wav,"[-0.010284424, -0.018707275, -0.014312744, -0....","[[0.6116525, 0.6273083, 0.59515595, 0.59347874...","(-62.807693, 0.0)"
1,jazz.00098.wav,"[-0.031066895, -0.05078125, -0.04537964, -0.04...","[[0.6259705, 0.5301552, 0.46367303, 0.4521164,...","(-80.0, 0.0)"
2,jazz.00001.wav,"[0.0024108887, 0.005493164, 0.008666992, 0.011...","[[0.59155864, 0.52710944, 0.507942, 0.5584143,...","(-80.0, 0.0)"
3,jazz.00069.wav,"[-0.029541016, -0.07070923, -0.10971069, -0.13...","[[0.65641576, 0.57051414, 0.40731898, 0.422603...","(-76.149796, 0.0)"
4,jazz.00050.wav,"[-0.13595581, -0.09390259, -0.06210327, -0.216...","[[0.842084, 0.87873775, 0.8421996, 0.8063969, ...","(-72.79858, 0.0)"
...,...,...,...,...
94,jazz.00008.wav,"[0.056854248, 0.075653076, 0.040283203, 0.0259...","[[0.54276, 0.5168449, 0.50455403, 0.6284218, 0...","(-80.0, 0.0)"
95,jazz.00013.wav,"[0.02557373, 0.043823242, 0.036010742, 0.02526...","[[0.6091228, 0.63428605, 0.6647167, 0.7280141,...","(-80.0, 0.0)"
96,jazz.00095.wav,"[-0.062194824, -0.09188843, -0.072052, -0.0799...","[[0.56795484, 0.53531694, 0.5111699, 0.5515669...","(-80.0, 0.0)"
97,jazz.00097.wav,"[0.02279663, 0.03564453, 0.029205322, 0.029632...","[[0.475236, 0.4526332, 0.4655529, 0.47875708, ...","(-80.0, 0.0)"


In [4]:
#let's try an architecture modeled off of this article (https://yuehan-z.medium.com/how-to-train-your-ml-models-music-generation-try-it-out-d4c0ab01c9f4)
n_mels, time_frames = 128, TIME_FRAMES #audio params
latent_dim = 16 #dimensionality of VAE

# Encoder
encoder_input = Input(shape=(n_mels, time_frames))
encoder_flatten = Flatten()(encoder_input)
z_mean = Dense(latent_dim)(encoder_flatten)
z_log_var = Dense(latent_dim)(encoder_flatten)

def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
encoder = Model(encoder_input, [z_mean, z_log_var, z], name="encoder")

#Decoder
decoder_input = Input(shape=(latent_dim,))
decoder_dense = Dense(n_mels * time_frames, activation="sigmoid")(decoder_input)
decoder_reshaped = Reshape((n_mels, time_frames))(decoder_dense)
decoder = Model(decoder_input, decoder_reshaped, name="decoder")

#Loss
def vae_loss_function(args):
    y_true, y_pred, z_mean, z_log_var = args
    reconstruction_loss = K.mean(K.square(y_true - y_pred))
    kl_loss = -0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
    return reconstruction_loss + kl_loss

#Full VAE
# Create the VAE
vae_input = Input(shape=(n_mels, time_frames))
vae_target = Input(shape=(n_mels, time_frames))
encoder_output = encoder(vae_input)
decoder_output = decoder(encoder_output[2])

# Add the loss function to the model using a Lambda layer
loss = Lambda(vae_loss_function, output_shape=(1,), name='loss')([vae_target, decoder_output, encoder_output[0], encoder_output[1]])
vae = Model(inputs=[vae_input, vae_target], outputs=[decoder_output, loss])

2025-03-14 18:59:51.923710: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [5]:
#TRAIN VAE
len(set(jazz_df['gt_spectrogram'].apply(lambda spec: spec.shape))) == 1 #all specs the same length?
jazz_songs = np.array(jazz_df['gt_spectrogram'].tolist()) #transform song shapes

#hyperparams
batch_size = 10
epochs = 100

# Compile the VAE
vae.compile(optimizer=Adam(learning_rate=0.001), loss=['mse', None])

# Train the VAE
history = vae.fit([jazz_songs,jazz_songs], [jazz_songs,np.zeros_like(jazz_songs)],
                  epochs=epochs,
                  batch_size=batch_size)

Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 69ms/step - loss: 0.0616
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0243 
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - loss: 0.0185 
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0200 
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - loss: 0.0185 
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - loss: 0.0170 
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step - loss: 0.0175
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - loss: 0.0187 
Epoch 9/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 80ms/step - loss: 0.0161 
Epoch 10/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/s

## Model Analysis

In [6]:
#get min and max averages from normalizations
mi,ma = zip(*jazz_df['min_max'])
avg_minmax = np.mean(mi),np.mean(ma)

In [7]:
pred_spectrograms, _ = vae.predict([jazz_songs, jazz_songs])
pred_spectrograms.shape

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step 


(99, 128, 1290)

In [8]:
jazz_df['gt_spectrogram']

0     [[0.6116525, 0.6273083, 0.59515595, 0.59347874...
1     [[0.6259705, 0.5301552, 0.46367303, 0.4521164,...
2     [[0.59155864, 0.52710944, 0.507942, 0.5584143,...
3     [[0.65641576, 0.57051414, 0.40731898, 0.422603...
4     [[0.842084, 0.87873775, 0.8421996, 0.8063969, ...
                            ...                        
94    [[0.54276, 0.5168449, 0.50455403, 0.6284218, 0...
95    [[0.6091228, 0.63428605, 0.6647167, 0.7280141,...
96    [[0.56795484, 0.53531694, 0.5111699, 0.5515669...
97    [[0.475236, 0.4526332, 0.4655529, 0.47875708, ...
98    [[0.5353414, 0.5135221, 0.44609776, 0.43149728...
Name: gt_spectrogram, Length: 99, dtype: object

In [27]:
pred_spectrograms, _ = vae.predict([jazz_songs, jazz_songs])

def play_prediction(jazz_df: pd.DataFrame, pred_spectrograms: list, sample_n: int=0):
    """Convenience function to play a single generated song."""
    pred_audio = reconstruct_audio(pred_spectrograms[sample_n], 22050, 128, jazz_df.iloc[sample_n]['min_max'])
    return jazz_df.iloc[sample_n],ipd.Audio(pred_audio, rate=22050)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


In [35]:
real, pred = compare_song_audios(jazz_df, 5)

In [36]:
real

In [37]:
pred

In [46]:
def get_cosine_error(gt_spectrograms, pred_spectrograms)
    return cosine(gt_spectrograms.flatten(), pred_spectrograms.flatten())

cosine_error = get_cosine_error(jazz_df.['gt_spectrogram'], pred_spectrograms)

#TODO
#order by cosine error
#do cosine analysis, audio analysis, spectrogram analysis
#clean up other nbs
#try more data and 2d convolutions

ValueError: Input vector should be 1-D.

In [None]:
#make this into play_index_N function
#measure by cosine similarity, show best and words preds!!

In [None]:
break

In [None]:
sample_n = 5
original_song = audio_data['original_audio'][sample_n]
pos = ipd.Audio(original_song, rate=22050)
transformed_song = reconstruct_audio(audio_data['spectrogram'][sample_n], 22050, 128, audio_data['min_max'][sample_n])
pts = ipd.Audio(transformed_song, rate=22050) #this sounds HORRIBLE! it is probably the fact that we didn't unnormalize?
print(f"original size: {original_song.shape}\ttransformed size: {transformed_song.shape}")

In [None]:
pos

In [None]:
pts

In [None]:
#now transform original audio
#now play both

In [None]:
#let's try our conversion from the first notebook!