## EXPLORING mfcc_features

In [1]:
import sys
!{sys.executable} -m pip install librosa

Defaulting to user installation because normal site-packages is not writeable
Collecting librosa
  Obtaining dependency information for librosa from https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl.metadata
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Obtaining dependency information for audioread>=2.1.9 from https://files.pythonhosted.org/packages/57/8d/30aa32745af16af0a9a650115fbe81bde7c610ed5c21b381fca0196f3a7f/audioread-3.0.1-py3-none-any.whl.metadata
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Obtaining dependency information for numba>=0.51.0 from https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata
  Downloading numba-0.61.2-cp311-cp

In [3]:
!pip install torch torchvision torchaudio

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/0e/6b/87fcddd34df9f53880fa1f0c23af7b6b96c935856473faf3914323588c40/torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/09/42/6908bff012a1dcc4fc515e52339652d7f488e208986542765c02ea775c2f/torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/ab/20/1873a49df9f1778c241543eaca14d613d657b9f9351c254952114251cb86/torchaudio-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata
  Downloading torch

In [13]:
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split, DataLoader
import torchvision
from sklearn.model_selection import train_test_split
import os
import librosa
import numpy as np
import pandas as pd

In [2]:
data = np.load("../features/mfcc_features.npz")
print(data.files) 

['X', 'y']


In [3]:
X = data['X']  
y = data['y']  
print("Original shape:", X.shape)
# print("Original shape:", y.shape)

Original shape: (7994, 40)


### have total of 7994 samples and 40 features 
### Next extracting MFCC features (from .mp3 to .npy file)

In [22]:
SOURCE_DIR = '../musicData/fma_small/'
DEST_DIR = '../features/lstm_mfcc'

# number of audio samples per second, to resample file to a smaller size and processing time while retaining enough frequency
SAMPLE_RATE = 22050

# MFCC Coefficients(Mel-Frequency Cepstral Coefficients) to extract per time frame
N_MFCC = 20
# The number of time frames per audio clip you want.
MAX_LEN = 130

os.makedirs(DEST_DIR, exist_ok=True)

def extract_mfcc(file_path):
    try:
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)

        if mfcc.shape[1] < MAX_LEN:
            mfcc = np.pad(mfcc, ((0, 0), (0, MAX_LEN - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :MAX_LEN]

        return mfcc.T  

    except Exception as e:
        print(f"Failed: {file_path} → {e}")
        return None

def process_all_files():
    count = 0
    print(f"Walking through: {SOURCE_DIR}")
    for root, _, files in os.walk(SOURCE_DIR):
        for fname in files:
            if fname.lower().endswith(".mp3"):  # case-insensitive
                full_path = os.path.join(root, fname)
                mfcc = extract_mfcc(full_path)
                if mfcc is not None:
                    out_name = os.path.splitext(fname)[0] + ".npy"
                    save_path = os.path.join(DEST_DIR, out_name)
                    np.save(save_path, mfcc)
                    print(f" Saved: {save_path}")
                    count += 1
    print(f"\n Done. Total processed: {count}")


process_all_files()

Walking through: ../musicData/fma_small/
 Saved: ../features/lstm_mfcc/073100.npy
 Saved: ../features/lstm_mfcc/073519.npy
 Saved: ../features/lstm_mfcc/073572.npy
 Saved: ../features/lstm_mfcc/073772.npy
 Saved: ../features/lstm_mfcc/073468.npy
 Saved: ../features/lstm_mfcc/073495.npy
 Saved: ../features/lstm_mfcc/073335.npy
 Saved: ../features/lstm_mfcc/073820.npy
 Saved: ../features/lstm_mfcc/073658.npy
 Saved: ../features/lstm_mfcc/073776.npy
 Saved: ../features/lstm_mfcc/073465.npy
 Saved: ../features/lstm_mfcc/073797.npy
 Saved: ../features/lstm_mfcc/073371.npy
 Saved: ../features/lstm_mfcc/073821.npy
 Saved: ../features/lstm_mfcc/073560.npy
 Saved: ../features/lstm_mfcc/073486.npy
 Saved: ../features/lstm_mfcc/073372.npy
 Saved: ../features/lstm_mfcc/073125.npy
 Saved: ../features/lstm_mfcc/073921.npy
 Saved: ../features/lstm_mfcc/073306.npy
 Saved: ../features/lstm_mfcc/073775.npy
 Saved: ../features/lstm_mfcc/073585.npy
 Saved: ../features/lstm_mfcc/073773.npy
 Saved: ../featu

  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed: ../musicData/fma_small/133/133297.mp3 → 
 Saved: ../features/lstm_mfcc/133781.npy
 Saved: ../features/lstm_mfcc/133102.npy
 Saved: ../features/lstm_mfcc/133442.npy
 Saved: ../features/lstm_mfcc/108967.npy
 Saved: ../features/lstm_mfcc/108230.npy
 Saved: ../features/lstm_mfcc/108473.npy
 Saved: ../features/lstm_mfcc/108500.npy
 Saved: ../features/lstm_mfcc/108490.npy
 Saved: ../features/lstm_mfcc/108460.npy
 Saved: ../features/lstm_mfcc/108428.npy
 Saved: ../features/lstm_mfcc/108060.npy
 Saved: ../features/lstm_mfcc/108489.npy
 Saved: ../features/lstm_mfcc/108015.npy
 Saved: ../features/lstm_mfcc/108495.npy
 Saved: ../features/lstm_mfcc/108878.npy
 Saved: ../features/lstm_mfcc/108426.npy
 Saved: ../features/lstm_mfcc/108957.npy
 Saved: ../features/lstm_mfcc/108425.npy
 Saved: ../features/lstm_mfcc/108488.npy
 Saved: ../features/lstm_mfcc/108415.npy
 Saved: ../features/lstm_mfcc/108528.npy
 Saved: ../features/lstm_mfcc/108837.npy
 Saved: ../features/lstm_mfcc/108302.npy
 Saved: 

  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed: ../musicData/fma_small/108/108925.mp3 → 
 Saved: ../features/lstm_mfcc/108503.npy
 Saved: ../features/lstm_mfcc/108318.npy
 Saved: ../features/lstm_mfcc/108059.npy
 Saved: ../features/lstm_mfcc/108906.npy
 Saved: ../features/lstm_mfcc/108532.npy
 Saved: ../features/lstm_mfcc/108464.npy
 Saved: ../features/lstm_mfcc/108839.npy
 Saved: ../features/lstm_mfcc/108342.npy
 Saved: ../features/lstm_mfcc/108022.npy
 Saved: ../features/lstm_mfcc/108303.npy
 Saved: ../features/lstm_mfcc/108845.npy
 Saved: ../features/lstm_mfcc/108307.npy
 Saved: ../features/lstm_mfcc/108499.npy
 Saved: ../features/lstm_mfcc/108471.npy
 Saved: ../features/lstm_mfcc/108531.npy
 Saved: ../features/lstm_mfcc/108341.npy
 Saved: ../features/lstm_mfcc/108836.npy
 Saved: ../features/lstm_mfcc/108840.npy
 Saved: ../features/lstm_mfcc/108745.npy
 Saved: ../features/lstm_mfcc/108496.npy
 Saved: ../features/lstm_mfcc/108477.npy
 Saved: ../features/lstm_mfcc/108308.npy
 Saved: ../features/lstm_mfcc/108525.npy
 Saved: 

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


 Saved: ../features/lstm_mfcc/011298.npy
 Saved: ../features/lstm_mfcc/011769.npy
 Saved: ../features/lstm_mfcc/011922.npy
 Saved: ../features/lstm_mfcc/011937.npy
 Saved: ../features/lstm_mfcc/011679.npy
 Saved: ../features/lstm_mfcc/011333.npy
 Saved: ../features/lstm_mfcc/011942.npy
 Saved: ../features/lstm_mfcc/011763.npy
 Saved: ../features/lstm_mfcc/011674.npy
 Saved: ../features/lstm_mfcc/011682.npy
 Saved: ../features/lstm_mfcc/011918.npy
 Saved: ../features/lstm_mfcc/011795.npy
 Saved: ../features/lstm_mfcc/011334.npy
 Saved: ../features/lstm_mfcc/011781.npy
 Saved: ../features/lstm_mfcc/011818.npy
 Saved: ../features/lstm_mfcc/011683.npy
 Saved: ../features/lstm_mfcc/011059.npy
 Saved: ../features/lstm_mfcc/143056.npy
 Saved: ../features/lstm_mfcc/143989.npy
 Saved: ../features/lstm_mfcc/143098.npy
 Saved: ../features/lstm_mfcc/143239.npy
 Saved: ../features/lstm_mfcc/143220.npy
 Saved: ../features/lstm_mfcc/143304.npy
 Saved: ../features/lstm_mfcc/143221.npy
 Saved: ../featu

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


 Saved: ../features/lstm_mfcc/021657.npy
 Saved: ../features/lstm_mfcc/021774.npy
 Saved: ../features/lstm_mfcc/021860.npy
 Saved: ../features/lstm_mfcc/021401.npy
 Saved: ../features/lstm_mfcc/021400.npy
 Saved: ../features/lstm_mfcc/021403.npy
 Saved: ../features/lstm_mfcc/021997.npy
 Saved: ../features/lstm_mfcc/021587.npy
 Saved: ../features/lstm_mfcc/021402.npy
 Saved: ../features/lstm_mfcc/021676.npy
 Saved: ../features/lstm_mfcc/021565.npy
 Saved: ../features/lstm_mfcc/019415.npy
 Saved: ../features/lstm_mfcc/019423.npy
 Saved: ../features/lstm_mfcc/019890.npy
 Saved: ../features/lstm_mfcc/019179.npy
 Saved: ../features/lstm_mfcc/019758.npy
 Saved: ../features/lstm_mfcc/019760.npy
 Saved: ../features/lstm_mfcc/019459.npy
 Saved: ../features/lstm_mfcc/019420.npy
 Saved: ../features/lstm_mfcc/019708.npy
 Saved: ../features/lstm_mfcc/019442.npy
 Saved: ../features/lstm_mfcc/019729.npy
 Saved: ../features/lstm_mfcc/019074.npy
 Saved: ../features/lstm_mfcc/019192.npy
 Saved: ../featu

  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


 Saved: ../features/lstm_mfcc/099437.npy
 Saved: ../features/lstm_mfcc/099707.npy
 Saved: ../features/lstm_mfcc/099261.npy
 Saved: ../features/lstm_mfcc/099442.npy
 Saved: ../features/lstm_mfcc/099392.npy
 Saved: ../features/lstm_mfcc/099374.npy
 Saved: ../features/lstm_mfcc/099260.npy
 Saved: ../features/lstm_mfcc/099389.npy
 Saved: ../features/lstm_mfcc/099041.npy
 Saved: ../features/lstm_mfcc/099390.npy
 Saved: ../features/lstm_mfcc/099393.npy
 Saved: ../features/lstm_mfcc/099411.npy
 Saved: ../features/lstm_mfcc/099439.npy
 Saved: ../features/lstm_mfcc/099135.npy
 Saved: ../features/lstm_mfcc/099703.npy
 Saved: ../features/lstm_mfcc/099274.npy
 Saved: ../features/lstm_mfcc/099394.npy
 Saved: ../features/lstm_mfcc/099436.npy
 Saved: ../features/lstm_mfcc/099369.npy
 Saved: ../features/lstm_mfcc/099371.npy
 Saved: ../features/lstm_mfcc/099438.npy
 Saved: ../features/lstm_mfcc/099372.npy
 Saved: ../features/lstm_mfcc/099704.npy
 Saved: ../features/lstm_mfcc/099419.npy
 Saved: ../featu

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


 Saved: ../features/lstm_mfcc/029245.npy
 Saved: ../features/lstm_mfcc/029465.npy
 Saved: ../features/lstm_mfcc/029243.npy
 Saved: ../features/lstm_mfcc/029350.npy
 Saved: ../features/lstm_mfcc/029602.npy
 Saved: ../features/lstm_mfcc/029255.npy
 Saved: ../features/lstm_mfcc/029673.npy
 Saved: ../features/lstm_mfcc/029039.npy
 Saved: ../features/lstm_mfcc/029807.npy
 Saved: ../features/lstm_mfcc/029746.npy
 Saved: ../features/lstm_mfcc/029480.npy
 Saved: ../features/lstm_mfcc/029042.npy
 Saved: ../features/lstm_mfcc/029745.npy
 Saved: ../features/lstm_mfcc/029043.npy
 Saved: ../features/lstm_mfcc/029742.npy
 Saved: ../features/lstm_mfcc/144477.npy
 Saved: ../features/lstm_mfcc/144733.npy
 Saved: ../features/lstm_mfcc/144552.npy
 Saved: ../features/lstm_mfcc/144424.npy
 Saved: ../features/lstm_mfcc/144170.npy
 Saved: ../features/lstm_mfcc/144544.npy
 Saved: ../features/lstm_mfcc/144179.npy
 Saved: ../features/lstm_mfcc/144467.npy
 Saved: ../features/lstm_mfcc/144492.npy
 Saved: ../featu

[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3328) too large for available bit count (3240)


 Saved: ../features/lstm_mfcc/054666.npy
 Saved: ../features/lstm_mfcc/054568.npy


[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3360) too large for available bit count (3240)


 Saved: ../features/lstm_mfcc/054463.npy
 Saved: ../features/lstm_mfcc/054436.npy
 Saved: ../features/lstm_mfcc/054155.npy
 Saved: ../features/lstm_mfcc/054236.npy
 Saved: ../features/lstm_mfcc/054464.npy
 Saved: ../features/lstm_mfcc/054365.npy
 Saved: ../features/lstm_mfcc/054874.npy
 Saved: ../features/lstm_mfcc/054160.npy
 Saved: ../features/lstm_mfcc/054475.npy
 Saved: ../features/lstm_mfcc/054482.npy
 Saved: ../features/lstm_mfcc/054438.npy
 Saved: ../features/lstm_mfcc/054433.npy
 Saved: ../features/lstm_mfcc/054665.npy
 Saved: ../features/lstm_mfcc/054468.npy
 Saved: ../features/lstm_mfcc/054496.npy
 Saved: ../features/lstm_mfcc/054466.npy
 Saved: ../features/lstm_mfcc/054625.npy
 Saved: ../features/lstm_mfcc/054467.npy
 Saved: ../features/lstm_mfcc/054624.npy
 Saved: ../features/lstm_mfcc/054064.npy
 Saved: ../features/lstm_mfcc/054154.npy
 Saved: ../features/lstm_mfcc/054159.npy
 Saved: ../features/lstm_mfcc/054578.npy
 Saved: ../features/lstm_mfcc/054031.npy
 Saved: ../featu

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


 Saved: ../features/lstm_mfcc/098565.npy
 Saved: ../features/lstm_mfcc/098617.npy
 Saved: ../features/lstm_mfcc/098550.npy
 Saved: ../features/lstm_mfcc/098576.npy
 Saved: ../features/lstm_mfcc/098549.npy
 Saved: ../features/lstm_mfcc/098619.npy
 Saved: ../features/lstm_mfcc/098026.npy
 Saved: ../features/lstm_mfcc/098236.npy
 Saved: ../features/lstm_mfcc/098655.npy
 Saved: ../features/lstm_mfcc/098626.npy
 Saved: ../features/lstm_mfcc/098667.npy
 Saved: ../features/lstm_mfcc/098028.npy
 Saved: ../features/lstm_mfcc/098838.npy
 Saved: ../features/lstm_mfcc/098620.npy
 Saved: ../features/lstm_mfcc/098627.npy
 Saved: ../features/lstm_mfcc/098579.npy
 Saved: ../features/lstm_mfcc/098770.npy
 Saved: ../features/lstm_mfcc/098552.npy
 Saved: ../features/lstm_mfcc/098339.npy
 Saved: ../features/lstm_mfcc/098567.npy


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


 Saved: ../features/lstm_mfcc/098618.npy
 Saved: ../features/lstm_mfcc/098668.npy
 Saved: ../features/lstm_mfcc/098204.npy
 Saved: ../features/lstm_mfcc/098205.npy
 Saved: ../features/lstm_mfcc/098577.npy
 Saved: ../features/lstm_mfcc/098031.npy
 Saved: ../features/lstm_mfcc/098622.npy
 Saved: ../features/lstm_mfcc/098680.npy
 Saved: ../features/lstm_mfcc/098298.npy
 Saved: ../features/lstm_mfcc/098348.npy
 Saved: ../features/lstm_mfcc/098671.npy
 Saved: ../features/lstm_mfcc/098582.npy
 Saved: ../features/lstm_mfcc/098584.npy
 Saved: ../features/lstm_mfcc/098625.npy
 Saved: ../features/lstm_mfcc/098556.npy
 Saved: ../features/lstm_mfcc/098574.npy
 Saved: ../features/lstm_mfcc/098553.npy
 Saved: ../features/lstm_mfcc/098670.npy
 Saved: ../features/lstm_mfcc/098578.npy
 Saved: ../features/lstm_mfcc/098621.npy
 Saved: ../features/lstm_mfcc/098656.npy
 Saved: ../features/lstm_mfcc/098547.npy
 Saved: ../features/lstm_mfcc/098666.npy


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


 Saved: ../features/lstm_mfcc/098569.npy
 Saved: ../features/lstm_mfcc/098300.npy
 Saved: ../features/lstm_mfcc/098580.npy
 Saved: ../features/lstm_mfcc/098624.npy
 Saved: ../features/lstm_mfcc/098681.npy
 Saved: ../features/lstm_mfcc/098228.npy
 Saved: ../features/lstm_mfcc/098669.npy
 Saved: ../features/lstm_mfcc/098238.npy
 Saved: ../features/lstm_mfcc/098301.npy
 Saved: ../features/lstm_mfcc/098229.npy
 Saved: ../features/lstm_mfcc/098573.npy
 Saved: ../features/lstm_mfcc/098227.npy
 Saved: ../features/lstm_mfcc/098299.npy
 Saved: ../features/lstm_mfcc/098297.npy
 Saved: ../features/lstm_mfcc/098557.npy
 Saved: ../features/lstm_mfcc/098554.npy
 Saved: ../features/lstm_mfcc/098583.npy
 Saved: ../features/lstm_mfcc/098551.npy
 Saved: ../features/lstm_mfcc/098613.npy
 Saved: ../features/lstm_mfcc/098077.npy
 Saved: ../features/lstm_mfcc/098349.npy
 Saved: ../features/lstm_mfcc/098235.npy
 Saved: ../features/lstm_mfcc/098701.npy
 Saved: ../features/lstm_mfcc/136331.npy
 Saved: ../featu

In [4]:
genre_df = pd.read_csv("../reports/1_Explore_Metadata/valid_track_genres.csv")

# Get unique genres and assign integer labels
unique_genres = sorted(genre_df['genre_top'].dropna().unique())
genre_to_id = {genre: idx for idx, genre in enumerate(unique_genres)}

# Map each track_id to a label
label_map = {
    row['track_id']: genre_to_id[row['genre_top']]
    for _, row in genre_df.iterrows()
}

print("Genre → ID mapping:")
print(genre_to_id)
print(f"Total labeled tracks: {len(label_map)}")


Genre → ID mapping:
{'Electronic': 0, 'Experimental': 1, 'Folk': 2, 'Hip-Hop': 3, 'Instrumental': 4, 'International': 5, 'Pop': 6, 'Rock': 7}
Total labeled tracks: 8000


In [40]:
import os

npy_files = os.listdir('../features/lstm_mfcc')
print("Sample .npy files:", npy_files[:5])


Sample .npy files: ['145730.npy', '091162.npy', '131436.npy', '055293.npy', '132779.npy']


In [28]:
class FMA_LSTM_Dataset(Dataset):
    def __init__(self, feature_dir, label_map):
        self.feature_dir = feature_dir
        self.label_map = label_map
        self.file_list = [
            f for f in os.listdir(feature_dir)
            if f.endswith(".npy") and int(os.path.splitext(f)[0]) in label_map
        ]
        print(f"Total .npy files with labels found: {len(self.file_list)}")
        print("Example file_list content:", self.file_list[:5])

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        filename = self.file_list[idx]
        track_id = int(os.path.splitext(filename)[0])
        label = self.label_map[track_id]

        mfcc_path = os.path.join(self.feature_dir, filename)
        x = np.load(mfcc_path)  # shape: (130, 20)
        # normalizing to avoid divide by 0
        x = (x - np.mean(x)) / (np.std(x) + 1e-8) 
        
        x_tensor = torch.tensor(x, dtype=torch.float32)
        y_tensor = torch.tensor(label, dtype=torch.long)
        return x_tensor, y_tensor

In [29]:
dataset = FMA_LSTM_Dataset(
    feature_dir='../features/lstm_mfcc',
    label_map=label_map
)

# Sanity check
print(len(dataset))
x, y = dataset[0]
print("MFCC shape:", x.shape)   # (130, 20)
print("Genre class:", y)

Total .npy files with labels found: 7997
Example file_list content: ['145730.npy', '091162.npy', '131436.npy', '055293.npy', '132779.npy']
7997
MFCC shape: torch.Size([130, 20])
Genre class: tensor(3)


In [30]:
## test train split

In [31]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_set, test_set = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32)
x_batch, y_batch = next(iter(train_loader))
# print("Batch shape:", x_batch.shape)  
# print("Genre class labels:", y_batch[:5])

In [32]:
# creating custom NN which will be used
class LSTMClassifier(nn.Module):
    def __init__(self, input_size=20, hidden_size=128, num_layers=3, num_classes=8, dropout=0.3):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout,
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, (hn, _) = self.lstm(x)       
        logits = self.fc(hn[-1])          
        return logits


In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(num_classes=len(set(label_map.values())))
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [34]:
def train(model, loader, criterion, optimizer):
    model.train()
    total_loss, correct = 0, 0
    for x_batch, y_batch in loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == y_batch).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    return avg_loss, accuracy

def evaluate(model, loader, criterion):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            correct += (outputs.argmax(1) == y_batch).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = correct / len(loader.dataset)
    return avg_loss, accuracy

In [35]:
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

epochs = 20
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, test_loader, criterion)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"  Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")


Epoch 1/20
  Train Loss: 1.9650, Accuracy: 0.2229
  Val   Loss: 1.8895, Accuracy: 0.2794
Epoch 2/20
  Train Loss: 1.8937, Accuracy: 0.2776
  Val   Loss: 1.8810, Accuracy: 0.3006
Epoch 3/20
  Train Loss: 1.8507, Accuracy: 0.3001
  Val   Loss: 1.8400, Accuracy: 0.3119
Epoch 4/20
  Train Loss: 1.8247, Accuracy: 0.3109
  Val   Loss: 1.8141, Accuracy: 0.3162
Epoch 5/20
  Train Loss: 1.7858, Accuracy: 0.3270
  Val   Loss: 1.8840, Accuracy: 0.2919
Epoch 6/20
  Train Loss: 1.7873, Accuracy: 0.3253
  Val   Loss: 1.8091, Accuracy: 0.3206
Epoch 7/20
  Train Loss: 1.7798, Accuracy: 0.3345
  Val   Loss: 1.7795, Accuracy: 0.3375
Epoch 8/20
  Train Loss: 1.7239, Accuracy: 0.3577
  Val   Loss: 1.7385, Accuracy: 0.3550
Epoch 9/20
  Train Loss: 1.7426, Accuracy: 0.3467
  Val   Loss: 1.7252, Accuracy: 0.3575
Epoch 10/20
  Train Loss: 1.6977, Accuracy: 0.3636
  Val   Loss: 1.7342, Accuracy: 0.3556
Epoch 11/20
  Train Loss: 1.6709, Accuracy: 0.3803
  Val   Loss: 1.7230, Accuracy: 0.3606
Epoch 12/20
  Train

In [27]:
torch.save(model.state_dict(), '../models/lstm_genre_classifier.pth')