# Creating a model and training it for generating and clustering music

### Downloading the dataset

In [None]:
# Disabled download step: the code below is wrapped in a string literal, so it
# is never executed. Remove the surrounding triple quotes to fetch the dataset
# with kagglehub and print the local path it was downloaded to.
"""import kagglehub

# Download latest version
path = kagglehub.dataset_download("imsparsh/fma-free-music-archive-small-medium")

print("Path to dataset files:", path)"""

### Functions needed to transform the .mp3 sound files into .jpg images for the training set

In [4]:
import numpy as np
import librosa
import librosa.display
import random
from PIL import Image
import gc
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile as wav
from numpy.lib import stride_tricks
import os
# Reading the audio file and applying some transformations (trimming, padding...) to "clean" the sound file

def read_audio(conf, pathname, trim_long_data):
    """Load an audio file and bring it towards a uniform length.

    The file is decoded with ``librosa.load`` at ``conf.sampling_rate``;
    leading and trailing silence is removed with ``librosa.effects.trim``
    (library default threshold).

    Args:
        conf: Settings object providing ``sampling_rate`` and ``samples``.
        pathname: Path of the audio file to load.
        trim_long_data: When true, a signal longer than ``conf.samples`` is
            cut to its first ``conf.samples`` samples. When false, a longer
            signal is returned at its full length.

    Returns:
        The 1-D waveform. Signals not longer than ``conf.samples`` are
        zero-padded on both sides to exactly ``conf.samples`` samples.
    """
    waveform, _ = librosa.load(pathname, sr=conf.sampling_rate)

    # An empty signal makes the trim call fail, so only trim non-empty input.
    if len(waveform) > 0:
        waveform, _ = librosa.effects.trim(waveform)

    target = conf.samples
    current = len(waveform)

    if current > target:
        # Long enough: optionally keep only the leading `target` samples.
        return waveform[:target] if trim_long_data else waveform

    # Too short (or exact): centre the signal, any odd sample goes to the right.
    left = (target - current) // 2
    right = target - current - left
    return np.pad(waveform, (left, right), 'constant')

# Generate the mel-spectrogram from the audio signal using the librosa library

def audio_to_melspectrogram(conf, audio):
    """Convert a waveform into a mel-spectrogram expressed in decibels.

    Args:
        conf: Settings object providing ``sampling_rate``, ``n_mels``,
            ``hop_length``, ``n_fft``, ``fmin`` and ``fmax``.
        audio: The audio time series to analyse.

    Returns:
        The dB-scaled mel-spectrogram as a ``float32`` array.
    """
    mel_kwargs = dict(
        sr=conf.sampling_rate,
        n_mels=conf.n_mels,
        hop_length=conf.hop_length,
        n_fft=conf.n_fft,
        fmin=conf.fmin,
        fmax=conf.fmax,
    )
    power = librosa.feature.melspectrogram(y=audio, **mel_kwargs)
    # Power spectrogram -> decibel scale, stored as 32-bit floats.
    return librosa.power_to_db(power).astype(np.float32)

# Combining the two previous functions: read an audio file and return its mel-spectrogram

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    """Read an audio file and return its mel-spectrogram.

    Chains ``read_audio`` and ``audio_to_melspectrogram``.

    Args:
        conf: Settings object forwarded to both helpers.
        pathname: Path of the audio file to read.
        trim_long_data: Forwarded to ``read_audio``.
        debug_display: Accepted for compatibility; currently not used.

    Returns:
        The mel-spectrogram produced by ``audio_to_melspectrogram``.
    """
    return audio_to_melspectrogram(
        conf, read_audio(conf, pathname, trim_long_data)
    )

# A set of settings that you can adapt to fit your audio files (sampling rate, clip duration, FFT window size, number of mel bands...)

class conf:
    """Preprocessing settings shared by the audio-to-spectrogram helpers."""

    # --- Audio loading ---
    sampling_rate = 44100   # rate the audio is resampled to when loaded
    duration = 30           # clip length; with sampling_rate it fixes `samples`
    samples = duration * sampling_rate  # target number of samples per clip

    # --- Mel-spectrogram ---
    n_mels = 128            # number of mel bands
    n_fft = 20 * n_mels     # FFT size passed to the mel-spectrogram
    hop_length = 694        # hop between successive frames, in samples
    fmin = 20               # lowest frequency of the mel filter bank
    fmax = sampling_rate // 2  # highest frequency: half the sampling rate



def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Turn a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated along a new last axis, standardized, clipped to
    ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: Array-like single-channel data.
        mean: Mean used for standardization; computed from the data if None.
        std: Standard deviation used for standardization; computed from the
            data if None.
        norm_max: Upper clipping bound (in standardized units); defaults to
            the maximum of the standardized data.
        norm_min: Lower clipping bound (in standardized units); defaults to
            the minimum of the standardized data.
        eps: Small constant guarding against division by zero and used as
            the threshold below which the data is considered flat.

    Returns:
        A ``uint8`` array with one extra trailing axis of length 3. If the
        standardized data (or the requested normalization range) is flat,
        an all-zero array is returned.
    """
    # Stack X as [X, X, X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Explicit `is None` checks so that a legitimate value of 0
    # (e.g. mean=0 or norm_min=0) is honoured instead of being replaced.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    Xstd = (X - mean) / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    span = norm_max - norm_min
    # Also require a non-degenerate target range to avoid dividing by zero
    # when the caller passes norm_max == norm_min.
    if (_max - _min) > eps and abs(span) > eps:
        # Scale to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / span
        return V.astype(np.uint8)

    # Flat input: just zeros
    return np.zeros_like(Xstd, dtype=np.uint8)

def rename_file(img_name):
    """Return the ``.jpg`` file name that corresponds to an audio file path.

    Only the final path component is kept and its extension is replaced by
    ``.jpg``; this works for any directory depth and any extension length.

    Args:
        img_name: Path of the source audio file (e.g. ``root/sub/track.mp3``).

    Returns:
        The bare image file name (e.g. ``track.jpg``), without directories.
    """
    # basename/splitext instead of a fixed split index and a fixed 4-char cut,
    # so files directly under the root or in deeper folders are handled too.
    stem, _ext = os.path.splitext(os.path.basename(img_name))
    return stem + ".jpg"

def save_image_from_sound(img_path, out_dir='trainImages'):
    """Convert one audio file to a grayscale mel-spectrogram image on disk.

    The spectrogram is min-max normalized to ``[0, 255]`` and written as an
    8-bit single-channel image named after the audio file (see
    ``rename_file``).

    Args:
        img_path: Path of the source audio file (despite the name, this is
            the sound file, not an image).
        out_dir: Directory the image is written to. It is created if it does
            not exist yet. Defaults to ``'trainImages'``.
    """
    filename = rename_file(img_path)
    x = read_as_melspectrogram(conf, img_path, trim_long_data=False, debug_display=True)

    # Min-max normalize; the small constant avoids dividing by zero for a
    # perfectly flat spectrogram.
    x_min, x_max = x.min(), x.max()
    x_norm = (x - x_min) / (x_max - x_min + 1e-6)
    x_img = (x_norm * 255).astype(np.uint8)
    img = Image.fromarray(x_img, mode='L')

    # Make sure the destination exists; otherwise the save fails and the
    # file is silently never produced when the caller ignores errors.
    os.makedirs(out_dir, exist_ok=True)
    img.save(os.path.join(out_dir, filename))

    # Free the spectrogram eagerly: this runs in a loop over many songs.
    del x
    gc.collect()

### Convert each song in the dataset to an image and save it

In [5]:
import os

# Walk the music folder and convert every .mp3 that has no image yet.
root_dir = 'test_music_folder'
output_dir = 'trainImages'  # directory save_image_from_sound writes to by default

for dirpath, dirnames, filenames in os.walk(root_dir):
    for fn in filenames:
        if not fn.endswith('.mp3'):
            continue

        full_path = os.path.join(dirpath, fn)
        try:
            # Look for the image where it is actually saved, so songs that
            # were already converted are skipped on a re-run.
            target = os.path.join(output_dir, rename_file(full_path))
            if os.path.isfile(target):
                continue
            save_image_from_sound(full_path)
        except Exception as exc:
            # Best effort: report the problematic file and carry on, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(f"Skipping {full_path}: {exc}")