In [1]:
import librosa, os #audio processing and file system parsing
import librosa.display
import numpy as np #math library
import tensorflow as tf #for model building
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Lambda, Reshape, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt # for visualization
import pandas as pd #for data analysis / prep
import IPython.display as ipd #for sound output

2025-03-14 16:26:05.892878: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 16:26:05.926368: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-14 16:26:06.248411: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-14 16:26:06.452158: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741983966.644438   10231 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741983966.69

In [9]:
def load_data(audio_dir: str, n_mels: int=128, time_frames: int=216):
    """
    Loads every audio file in a directory and converts it to a normalized Mel spectrogram.

    Each file is decoded with librosa, converted to a dB-scaled Mel spectrogram,
    padded or truncated along its second axis to exactly `time_frames` columns,
    and min-max scaled to [0, 1].

    Args:
        audio_dir: Directory holding the audio files (a trailing separator is optional).
        n_mels: Number of Mel bands requested from librosa.
        time_frames: Number of columns every returned spectrogram is forced to.

    Returns:
        Dict with the keys 'original_audio' (list of decoded waveforms),
        'spectrogram' (list of normalized spectrograms, aligned with
        'original_audio') and 'transformed_audio' (never filled by this
        function; kept so callers that read the key keep working).

    Files that cannot be loaded or converted are reported on stdout and skipped.
    """
    audio_data = {'original_audio': [], 'spectrogram': [], 'transformed_audio': []}

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # indices reproducible between runs and machines.
    for file in sorted(os.listdir(audio_dir)):
        try:
            # os.path.join works whether or not audio_dir ends with a separator
            # (plain string concatenation silently skipped every file otherwise).
            original_audio, sampling_rate = librosa.load(os.path.join(audio_dir, file))

            mel_spectrogram = librosa.feature.melspectrogram(y=original_audio, sr=sampling_rate, n_mels=n_mels)
            S_dB = librosa.power_to_db(mel_spectrogram)

            n_frames = S_dB.shape[1]
            if n_frames < time_frames:
                # Too short: pad the end of the time axis with zeros
                # NOTE(review): the padding value is 0 dB, not the clip's noise floor - confirm this is intended
                S_dB = np.pad(S_dB, pad_width=((0, 0), (0, time_frames - n_frames)), mode='constant')
            elif n_frames > time_frames:
                # Too long: drop the extra trailing frames
                S_dB = S_dB[:, :time_frames]

            # Min-max normalize; a constant spectrogram has no range to scale by,
            # so it maps to all zeros instead of dividing by zero (NaN).
            lowest = np.min(S_dB)
            value_range = np.max(S_dB) - lowest
            if value_range > 0:
                final_spectrogram = (S_dB - lowest) / value_range
            else:
                final_spectrogram = np.zeros_like(S_dB, dtype=float)

            # Save data (appended together so both lists stay index-aligned)
            audio_data['original_audio'].append(original_audio)
            audio_data['spectrogram'].append(final_spectrogram)

        except Exception as e:
            print(f"Skipping file {file} due to error: {e}")

    return audio_data

def reconstruct_audio(spectrogram, sr=22050, n_mels=128, min_max=None):
    """
    Reconstructs audio from a normalized Mel spectrogram.

    Args:
        spectrogram: Mel spectrogram previously min-max scaled to [0, 1].
        sr: Sampling rate forwarded to librosa's Mel inversion.
        n_mels: Not used by the reconstruction; kept so existing positional
            callers keep working.
        min_max: Optional (min_dB, max_dB) pair the spectrogram had before it
            was normalized. When omitted, (-80.0, 0.0) is assumed: 80 dB is the
            default dynamic range (top_db) of librosa.power_to_db. With an
            assumed range the absolute level of the output is not faithful, so
            pass the real range whenever it is known.

    Returns:
        The waveform produced by librosa.feature.inverse.mel_to_audio.
    """
    # The original dB range cannot be recovered from the normalized array itself
    # (its own min/max are just ~0 and ~1, which would make the rescaling a
    # no-op), so it has to be supplied by the caller or assumed.
    if min_max is None:
        min_max = (-80.0, 0.0)
    S_dB_min, S_dB_max = min_max

    # Reverse normalization
    S_dB = np.asarray(spectrogram) * (S_dB_max - S_dB_min) + S_dB_min

    # Convert back to power scale
    S = librosa.db_to_power(S_dB)

    # Convert Mel spectrogram back to waveform
    reconstructed_audio = librosa.feature.inverse.mel_to_audio(S, sr=sr)

    return reconstructed_audio

In [35]:
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import os

# Hop length handed to both librosa.feature.melspectrogram (in load_data) and
# librosa.feature.inverse.mel_to_audio (in reconstruct_audio). Sharing one
# constant keeps the forward and inverse transforms on the same setting.
# Adjust as needed.
HOP_LENGTH = 512

def load_data(audio_dir: str, n_mels: int=128, time_frames: int=216, top_db: float=80.0):
    """Loads every audio file in a directory as a Mel spectrogram scaled to [0, 1].

    Each file is decoded at its native sampling rate, converted to a Mel
    spectrogram in dB relative to its own peak, and scaled to [0, 1] against
    the fixed window [-top_db, 0] dB. Because the window is identical for every
    file, the single 'min_max' entry denormalizes any returned spectrogram
    (storing one per-file range under that key would only ever keep the range
    of the last file loaded).

    Args:
        audio_dir: Directory holding the audio files.
        n_mels: Number of Mel bands requested from librosa.
        time_frames: Not used by this version (no padding or truncation is
            applied); kept for signature compatibility.
        top_db: Dynamic range in dB kept below each file's peak. Must be > 0.

    Returns:
        Dict with 'original_audio' (list of waveforms), 'spectrogram' (list of
        normalized spectrograms, index-aligned with 'original_audio') and
        'min_max', the (min_dB, max_dB) pair to hand to reconstruct_audio.

    Raises:
        ValueError: If top_db is None or not positive.

    Files that cannot be loaded or converted are reported on stdout and skipped.
    """
    if top_db is None or top_db <= 0:
        raise ValueError("top_db must be a positive number of decibels")

    # With ref=np.max the loudest bin sits at 0 dB and power_to_db clips
    # everything more than top_db below it, so this window bounds every file.
    S_dB_min, S_dB_max = -float(top_db), 0.0
    audio_data = {'original_audio': [], 'spectrogram': [], 'min_max': (S_dB_min, S_dB_max)}

    # os.listdir order is arbitrary; sorting keeps sample indices reproducible.
    for file in sorted(os.listdir(audio_dir)):
        try:
            original_audio, sampling_rate = librosa.load(os.path.join(audio_dir, file), sr=None)  # Preserve original sampling rate

            mel_spectrogram = librosa.feature.melspectrogram(y=original_audio, sr=sampling_rate, n_mels=n_mels, hop_length=HOP_LENGTH)
            S_dB = librosa.power_to_db(mel_spectrogram, ref=np.max, top_db=top_db)

            # Normalize to [0,1]; the fixed, non-zero range also rules out the
            # division by zero a per-file min/max hits on a constant spectrogram.
            S_dB = np.clip((S_dB - S_dB_min) / (S_dB_max - S_dB_min), 0.0, 1.0)

            # Store data (appended together so both lists stay index-aligned)
            audio_data['original_audio'].append(original_audio)
            audio_data['spectrogram'].append(S_dB)

        except Exception as e:
            print(f"Skipping file {file} due to error: {e}")

    return audio_data

def reconstruct_audio(spectrogram, sr=22050, n_mels=128, min_max=None):
    """Reconstructs audio from a normalized Mel spectrogram.

    Args:
        spectrogram: Mel spectrogram previously scaled to [0, 1].
        sr: Sampling rate forwarded to librosa's Mel inversion.
        n_mels: Not used by the reconstruction; kept so existing positional
            callers keep working.
        min_max: (min_dB, max_dB) pair used when the spectrogram was
            normalized. When omitted, (-80.0, 0.0) is assumed, which is the
            range librosa.power_to_db(ref=np.max) produces with its default
            top_db of 80 dB.

    Returns:
        The waveform produced by librosa.feature.inverse.mel_to_audio.
    """
    # Fall back to a sensible range instead of failing with an opaque
    # "cannot unpack NoneType" error when the caller relies on the default.
    if min_max is None:
        min_max = (-80.0, 0.0)
    S_dB_min, S_dB_max = min_max

    # Reverse normalization
    S_dB = spectrogram * (S_dB_max - S_dB_min) + S_dB_min

    # Convert back to power scale
    S = librosa.db_to_power(S_dB)

    # Convert Mel spectrogram back to waveform using the same hop length that
    # the forward transform used, so the frame spacing matches
    reconstructed_audio = librosa.feature.inverse.mel_to_audio(S, sr=sr, hop_length=HOP_LENGTH)

    return reconstructed_audio

In [36]:
# Build the waveform / spectrogram collection from the jazz clips
jazz_dir = "../data/kaggle/genres_original/jazz/"
audio_data = load_data(jazz_dir)

  original_audio, sampling_rate = librosa.load(os.path.join(audio_dir, file), sr=None)  # Preserve original sampling rate


Skipping file jazz.00054.wav due to error: 


In [37]:
# Compare the first clip with its round trip through the normalized Mel spectrogram.
playback_rate = 22050  # one rate for the reconstruction and for both players
first_spectrogram = audio_data['spectrogram'][0]

# Player for the untouched waveform (shown by a later cell)
original_song = audio_data['original_audio'][0]
pos = ipd.Audio(original_song, rate=playback_rate)

# Player for the waveform rebuilt from the spectrogram (shown by a later cell).
# Author's note: the reconstruction sounded very poor when first tried; the
# suspected cause was that the spectrogram had not been de-normalized.
transformed_song = reconstruct_audio(first_spectrogram, playback_rate, 128, audio_data['min_max'])
pts = ipd.Audio(transformed_song, rate=playback_rate)

print(f"original size: {original_song.shape}\ttransformed size: {transformed_song.shape}")

original size: (661794,)	transformed size: (661504,)


In [38]:
# Show the audio player built from the original waveform
pos

In [39]:
# Show the audio player built from the reconstructed waveform
pts

In [None]:
# TODO: now transform the original audio
# TODO: then play both versions

In [None]:
# TODO: let's try our conversion from the first notebook!