# Creating a model and training it for generating and clustering music

### Downloading the dataset

In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("imsparsh/fma-free-music-archive-small-medium")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/imsparsh/fma-free-music-archive-small-medium?dataset_version_number=1...


100%|██████████| 29.8G/29.8G [46:47<00:00, 11.4MB/s]  

Extracting files...





Path to dataset files: /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1


### Functions needed to transform the .mp3 soundfiles to .jpg images for the training set

In [2]:
import numpy as np
import librosa
import librosa.display
import random
from PIL import Image
import gc
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile as wav
from numpy.lib import stride_tricks
import os
# Reading the audio file and applying some transformations (trimming, padding...) to "clean" the sound file

def read_audio(conf, pathname, trim_long_data):
    y, sr = librosa.load(pathname, sr=conf.sampling_rate)
    # trim silence
    if 0 < len(y): # workaround: 0 length causes error
        y, _ = librosa.effects.trim(y) # trim, top_db=default(60)
    # make it unified length to conf.samples
    if len(y) > conf.samples: # long enough
        if trim_long_data:
            y = y[0:0+conf.samples]
    else: # pad blank
        padding = conf.samples - len(y)    # add padding at both ends
        offset = padding // 2
        y = np.pad(y, (offset, conf.samples - len(y) - offset), 'constant')
    return y

# Thanks to the librosa library, generating the mel-spectogram from the audio file

def audio_to_melspectrogram(conf, audio):
    spectrogram = librosa.feature.melspectrogram(y=audio, 
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

# Adding both previous function together

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    x = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, x)
    return mels

# A set of settings that you can adapt to fit your audio files (frequency, average duration, number of Fourier transforms...)

class conf:
    # Preprocessing settings
    sampling_rate = 44100
    duration = 30
    hop_length = 694
    fmin = 20
    fmax = sampling_rate // 2
    n_mels = 128
    n_fft = n_mels * 20
    samples = sampling_rate * duration



def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = norm_max or _max
    norm_min = norm_min or _min
    if (_max - _min) > eps:
        # Scale to [0, 255]
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def rename_file(img_name):
    img_name = img_name.split("/")[-1]
    img_name = img_name[:-4]
    img_name += ".jpg"
    return img_name

def save_image_from_sound(img_path):
    filename = rename_file(img_path)
    x = read_as_melspectrogram(conf, img_path, trim_long_data=False, debug_display=True)
    #x_color = mono_to_color(x)
    x_min, x_max = x.min(), x.max()
    x_norm = (x - x_min) / (x_max - x_min + 1e-6)
    x_img = (x_norm * 255).astype(np.uint8)
    img = Image.fromarray(x_img, mode='L')
    img.save(os.path.join('trainImages', filename))
    del x
    gc.collect()

### Convert each song in the dataset to an image and save it

In [3]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_file(full_path):
    try:
        exists = os.path.isfile(rename_file(full_path))
        if not exists:
            save_image_from_sound(full_path)
    except Exception as e:
        print(f"Error processing {full_path}: {e}")
        return  # Continue processing other files even if one fails

def ParalellConvertImage():
    root_dir = '/home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium' #Change to your path
    futures = []
    total_files = sum([len(files) for _, _, files in os.walk(root_dir) if any(file.endswith('.mp3') for file in files)])
    with ThreadPoolExecutor() as executor:
        for dirpath, dirnames, filenames in os.walk(root_dir):
            for fn in filenames:
                if fn.endswith('.mp3'):
                    full_path = os.path.join(dirpath, fn)
                    futures.append(executor.submit(process_file, full_path))
                    
        for idx, future in enumerate(as_completed(futures)):
            if idx%500 == 0:
                print(f"Progress: {idx + 1}/{total_files} files processed")

ParalellConvertImage()

Progress: 1/25000 files processed
Progress: 501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/065/065753.mp3: 
Progress: 1001/25000 files processed
Progress: 1501/25000 files processed
Progress: 2001/25000 files processed
Progress: 2501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 3001/25000 files processed
Progress: 3501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


Progress: 4001/25000 files processed
Progress: 4501/25000 files processed
Progress: 5001/25000 files processed
Progress: 5501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/108/108925.mp3: 
Progress: 6001/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 6501/25000 files processed
Progress: 7001/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 7501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3264) too large for available bit count (3224)


Progress: 8001/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/105/105247.mp3: 
Progress: 8501/25000 files processed
Progress: 9001/25000 files processed
Progress: 9501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3328) too large for available bit count (3240)


Progress: 10001/25000 files processed
Progress: 10501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/133/133297.mp3: 
Progress: 11001/25000 files processed
Progress: 11501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/099/099134.mp3: 
Progress: 12001/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 12501/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/143/143992.mp3: 
Progress: 13001/25000 files processed
Progress: 13501/25000 files processed
Progress: 14001/25000 files processed
Progress: 14501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/001/001486.mp3: 
Progress: 15001/25000 files processed
Progress: 15501/25000 files processed
Progress: 16001/25000 files processed
Progress: 16501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/005/005574.mp3: 
Progress: 17001/25000 files processed
Progress: 17501/25000 files processed
Progress: 18001/25000 files processed
Progress: 18501/25000 files processed
Progress: 19001/25000 files processed
Progress: 19501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/127/127336.mp3: 


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 20001/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


Progress: 20501/25000 files processed
Progress: 21001/25000 files processed
Progress: 21501/25000 files processed
Progress: 22001/25000 files processed


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error:

Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/098/098571.mp3: 


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/098/098559.mp3: 


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/098/098560.mp3: 


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/098/098558.mp3: 


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 187493.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 106439.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


Progress: 22501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/080/080391.mp3: 
Progress: 23001/25000 files processed
Progress: 23501/25000 files processed


  y, sr = librosa.load(pathname, sr=conf.sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /home/rasmus/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1/fma_medium/fma_medium/126/126981.mp3: 
Progress: 24001/25000 files processed


[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


Progress: 24501/25000 files processed
