In [None]:

import os, glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa

# Shared analysis settings used by extract_features() below.
SR = 22050               # Target sampling rate passed as `sr` to librosa.load
N_FFT = 2048             # FFT window size (`n_fft`) of the magnitude STFT
HOP = 512                # Hop length (`hop_length`) shared by the frame-based librosa calls
MFCC_N = 13              # Number of MFCC coefficients (`n_mfcc`)
ROLLOFF_PERCENT = 0.85   # `roll_percent` threshold for the spectral roll-off feature

def _agg_stats(x):
    """
    Compute mean and standard deviation for a feature array.
    Handles NaN or Inf values by converting them to 0.
    """
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
    return np.mean(x), np.std(x)

def extract_features(path):
    """
    Extract a fixed-length feature vector from a single audio file.

    The file is loaded as mono audio resampled to SR. One magnitude STFT
    (N_FFT / HOP) is computed and shared by every spectrogram-based feature.
    Each per-frame feature row is summarised by _agg_stats as (mean, std).

    Layout of the returned vector, in order:
      - MFCC: MFCC_N rows x (mean, std)
      - Chroma: one (mean, std) pair per chroma row
      - Spectral contrast: one (mean, std) pair per contrast row
      - Spectral centroid, bandwidth, roll-off, zero-crossing rate, RMS:
        one (mean, std) pair each
      - Tempo estimate from librosa.beat.beat_track: a single value

    With librosa's default chroma and contrast sizes this is expected to be
    75 values (26 + 24 + 14 + 10 + 1).

    Parameters:
        path: path of the audio file, passed to librosa.load.

    Returns:
        1-D np.ndarray of dtype float32 holding the features.
    """
    y, sr = librosa.load(path, sr=SR, mono=True)
    S = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP))

    # --- MFCC ---
    # n_fft is passed explicitly so the MFCC framing stays in step with the
    # STFT used for the other features if N_FFT is ever changed.
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=MFCC_N, n_fft=N_FFT, hop_length=HOP
    )
    mfcc_stats = [val for row in mfcc for val in _agg_stats(row)]

    # --- Chroma ---
    chroma = librosa.feature.chroma_stft(S=S, sr=sr, hop_length=HOP)
    chroma_stats = [val for row in chroma for val in _agg_stats(row)]

    # --- Spectral Contrast ---
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr, hop_length=HOP)
    contrast_stats = [val for row in contrast for val in _agg_stats(row)]

    # --- Centroid, Bandwidth, Rolloff, ZCR, RMS ---
    # Each call returns a single-row matrix; [0] selects that row.
    centroid_stats = _agg_stats(librosa.feature.spectral_centroid(S=S, sr=sr)[0])
    bandwidth_stats = _agg_stats(librosa.feature.spectral_bandwidth(S=S, sr=sr)[0])
    rolloff_stats = _agg_stats(
        librosa.feature.spectral_rolloff(S=S, sr=sr, roll_percent=ROLLOFF_PERCENT)[0]
    )
    zcr_stats = _agg_stats(librosa.feature.zero_crossing_rate(y, hop_length=HOP)[0])
    rms_stats = _agg_stats(librosa.feature.rms(S=S)[0])

    # --- Tempo (BPM) ---
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=HOP)
    # Depending on the librosa version, tempo is a scalar or a 1-element
    # array. Flatten it first so the scalar conversion works for both and
    # does not rely on NumPy's deprecated array-to-float conversion.
    tempo_values = np.asarray(tempo, dtype=float).ravel()
    if tempo_values.size == 0 or np.isnan(tempo_values[0]):
        tempo = 0.0
    else:
        tempo = float(tempo_values[0])

    features = np.array(
        [
            *mfcc_stats,
            *chroma_stats,
            *contrast_stats,
            *centroid_stats,
            *bandwidth_stats,
            *rolloff_stats,
            *zcr_stats,
            *rms_stats,
            tempo,
        ],
        dtype=np.float32,
    )
    return features

# Step 3 - Build dataset (features, y)
# Collect every file path together with its genre label.
# GENRES and DATA_DIR are not defined in this cell; they must already exist.
paths, labels = [], []
for genre in GENRES:
    # Glob on the loop variable `genre` (the original used an undefined `g`).
    files = sorted(glob.glob(os.path.join(DATA_DIR, genre, "*.*")))
    paths.extend(files)                   # file paths for this genre
    labels.extend([genre] * len(files))   # one label per collected file

# Fail early with a readable message instead of an opaque np.vstack error.
if not paths:
    raise FileNotFoundError(
        f"No audio files found under {DATA_DIR!r} for genres {list(GENRES)!r}"
    )

# Extract one feature vector per file, keeping labels aligned with features.
X_features, y_labels = [], []
for path, label in tqdm(zip(paths, labels), total=len(paths), desc="Extracting features"):
    X_features.append(extract_features(path))
    y_labels.append(label)

# Stack into a 2-D matrix (one row per file) and a matching label array.
X_features = np.vstack(X_features)
# Build from the list collected alongside the features, not from `labels`.
y_labels = np.array(y_labels)

print("Feature matrix shape:", X_features.shape)
print("Labels array shape:", y_labels.shape)

# Step 4 - Save dataset
# Make sure the output folder exists, then write both arrays as .npy files.
os.makedirs("data/features", exist_ok=True)

for _target, _array in (
    ("data/features/X_features.npy", X_features),
    ("data/features/y_labels.npy", y_labels),
):
    np.save(_target, _array)

print("Saved data/features/X_features.npy e y_labels.npy")

# Step 5 - Quick sanity check
# Show the leading rows of the feature matrix, indexed by genre label.
df_preview = pd.DataFrame(data=X_features, index=y_labels)
preview_head = df_preview.head()
print(preview_head)

# Number of collected files per genre.
genre_counts = pd.Series(labels).value_counts().to_dict()
print("Genre distribution: ", genre_counts)


Dataset dir: data/GTZAN
