In [2]:
import os, librosa, numpy as np, pandas as pd, warnings
from pathlib import Path


In [3]:

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Build the dataset path from components instead of a backslash-separated
# literal: the backslash form contains an invalid string escape and only
# resolves to a nested folder on Windows.
DATA_DIR = Path('Data') / 'genres_original'  # change if your folder is different
# CSV file that the extraction step writes and the modelling step reads back.
OUT_FEATURES = Path('gtzan_features.csv')

def extract_features_file(path, sr=22050, n_mfcc=13):
    """Summarise one audio file as a flat dictionary of scalar features.

    The file is loaded as mono at ``sr`` Hz and leading/trailing silence is
    trimmed before anything is measured, so ``duration`` refers to the
    trimmed signal.

    Args:
        path: Location of the audio file; stored as ``str(path)`` under
            the ``file_path`` key.
        sr: Target sample rate handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        dict whose keys are, in insertion order: ``file_path``, ``duration``,
        interleaved ``mfcc_mean_<i>`` / ``mfcc_std_<i>`` pairs (1-based),
        ``chroma_mean_<i>`` (1-based, one per chroma row),
        ``spectral_centroid_mean``, ``spectral_bandwidth_mean``,
        ``rolloff_mean`` and ``zcr_mean``. All feature values are Python
        floats.
    """
    signal, sr = librosa.load(path, sr=sr, mono=True)
    # Drop leading/trailing silence before measuring anything.
    signal, _ = librosa.effects.trim(signal)

    features = {
        'file_path': str(path),
        'duration': librosa.get_duration(y=signal, sr=sr),
    }

    # Frame-level descriptors; each is reduced over time below.
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    chroma = librosa.feature.chroma_stft(y=signal, sr=sr)
    centroid = librosa.feature.spectral_centroid(y=signal, sr=sr)
    bandwidth = librosa.feature.spectral_bandwidth(y=signal, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(signal)

    # Per-coefficient mean and standard deviation, interleaved mean/std.
    per_coeff_stats = zip(np.mean(mfcc, axis=1), np.std(mfcc, axis=1))
    for idx, (mean_val, std_val) in enumerate(per_coeff_stats, start=1):
        features[f'mfcc_mean_{idx}'] = float(mean_val)
        features[f'mfcc_std_{idx}'] = float(std_val)

    # Per-row chroma mean.
    features.update(
        (f'chroma_mean_{idx}', float(value))
        for idx, value in enumerate(np.mean(chroma, axis=1), start=1)
    )

    # Whole-track means of the remaining descriptors.
    features['spectral_centroid_mean'] = float(np.mean(centroid))
    features['spectral_bandwidth_mean'] = float(np.mean(bandwidth))
    features['rolloff_mean'] = float(np.mean(rolloff))
    features['zcr_mean'] = float(np.mean(zcr))
    return features

# Walk dataset and extract
# Sort the paths so the row order of the CSV is reproducible; rglob yields
# entries in whatever order the filesystem returns them.
files = sorted(DATA_DIR.rglob('*.wav'))
print('Found', len(files), '.wav files in', DATA_DIR)
rows = []
failed = []
for i, f in enumerate(files, 1):
    try:
        feats = extract_features_file(f)
    except Exception as e:
        # Best-effort extraction: skip the file but log repr(e), because
        # some exceptions carry an empty message and would print nothing.
        failed.append(f)
        print('Failed', f, repr(e))
    else:
        # label from parent folder name
        feats['genre'] = f.parent.name
        rows.append(feats)
    # Progress counts files visited, whether or not extraction succeeded.
    if i % 50 == 0:
        print('Processed', i, '/', len(files))

if failed:
    print('Skipped', len(failed), 'file(s) that could not be processed')
if not rows:
    # Refuse to overwrite an existing feature file with an empty table.
    raise RuntimeError(f'No features extracted from {DATA_DIR}; check that the folder exists and contains .wav files')

df = pd.DataFrame(rows)
print('Feature dataframe shape:', df.shape)
# Save CSV
df.to_csv(OUT_FEATURES, index=False)
print('Saved features to', OUT_FEATURES)
df.head()


Found 1000 .wav files in Data\genres_original
Processed 50 / 1000
Processed 100 / 1000
Processed 150 / 1000
Processed 200 / 1000
Processed 250 / 1000
Processed 300 / 1000
Processed 350 / 1000
Processed 400 / 1000
Processed 450 / 1000
Processed 500 / 1000
Processed 550 / 1000
Failed Data\genres_original\jazz\jazz.00054.wav 
Processed 600 / 1000
Processed 650 / 1000
Processed 700 / 1000
Processed 750 / 1000
Processed 800 / 1000
Processed 850 / 1000
Processed 900 / 1000
Processed 950 / 1000
Processed 1000 / 1000
Feature dataframe shape: (999, 45)
Saved features to gtzan_features.csv


Unnamed: 0,file_path,duration,mfcc_mean_1,mfcc_std_1,mfcc_mean_2,mfcc_std_2,mfcc_mean_3,mfcc_std_3,mfcc_mean_4,mfcc_std_4,...,chroma_mean_8,chroma_mean_9,chroma_mean_10,chroma_mean_11,chroma_mean_12,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zcr_mean,genre
0,Data\genres_original\blues\blues.00000.wav,30.013333,-113.598824,50.688946,121.570671,17.200207,-19.162262,15.348761,42.363937,12.289782,...,0.435684,0.295986,0.31508,0.407008,0.385101,1784.122641,2002.412407,3805.72303,0.083045,blues
1,Data\genres_original\blues\blues.00001.wav,30.013333,-207.523834,88.142525,123.985138,23.662491,8.947019,23.923552,35.867149,16.270117,...,0.560355,0.384179,0.255326,0.284776,0.334094,1530.261767,2038.987608,3550.713616,0.05604,blues
2,Data\genres_original\blues\blues.00002.wav,30.013333,-90.757164,57.601101,140.440872,22.55784,-29.084547,20.29937,31.686693,11.998093,...,0.288404,0.334152,0.401819,0.384389,0.588508,1552.832481,1747.754087,3042.410115,0.076291,blues
3,Data\genres_original\blues\blues.00003.wav,30.013333,-199.575134,74.217697,150.086105,21.361393,5.663404,16.034643,26.855278,12.584162,...,0.251613,0.246271,0.315111,0.316579,0.383199,1070.153418,1596.422564,2184.879029,0.033309,blues
4,Data\genres_original\blues\blues.00004.wav,30.013333,-160.354172,72.104813,126.20948,29.210808,-35.581394,18.27655,22.139256,13.919527,...,0.139121,0.276424,0.324327,0.555124,0.272931,1835.128513,1748.410758,3579.957471,0.101461,blues


In [4]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle



In [5]:

# Read the feature table back from the CSV written by the extraction step.
FEATURES_CSV = 'gtzan_features.csv'
df = pd.read_csv(FEATURES_CSV)
print(f'Loaded features: {df.shape}')

# Split the table into the target (genre, as strings) and the feature
# columns (everything except the file path and the genre itself).
label_column = 'genre'
non_feature_columns = ['file_path', label_column]
y = df[label_column].astype(str).values
X = df.drop(columns=non_feature_columns)


Loaded features: (999, 45)


In [6]:

# Replace any missing feature values with 0.0.
X = X.fillna(value=0.0)

# Turn genre names into integer ids; `label_names` keeps the id -> name lookup.
le = LabelEncoder()
y_enc = le.fit_transform(y)
label_names = le.classes_

# Stratified 80/20 hold-out split with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y_enc)
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)
print(f'Train/test: {X_train.shape} {X_test.shape}')

# Standardise with statistics learned from the training portion only,
# then apply the same transform to the test portion.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



Train/test: (799, 43) (200, 43)


In [None]:
# Random Forest + GridSearchCV
# NOTE(review): both the forest and the grid search request n_jobs=-1; if the
# search is slow or the machine is oversubscribed, consider n_jobs=1 on one of them.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20, 40],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid = GridSearchCV(rfc, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print('Best params:', grid.best_params_)
best = grid.best_estimator_

# Evaluate
# Pin the label set explicitly so the report rows and the confusion-matrix
# shape always line up with `label_names`, even if a class is missing from
# the test split or the predictions.
label_ids = np.arange(len(label_names))
y_pred = best.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)
print('Test accuracy:', acc)
print('Classification report:\n',
      classification_report(y_test, y_pred, labels=label_ids, target_names=label_names))

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred, labels=label_ids)
plt.figure(figsize=(8,6))
plt.imshow(cm, interpolation='nearest', cmap='Blues')
plt.title('Confusion Matrix')
plt.colorbar()
plt.xticks(range(len(label_names)), label_names, rotation=90)
plt.yticks(range(len(label_names)), label_names)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# add text annotations; switch to white text on dark cells for readability
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j], 'd'),
                 ha="center", va="center",
                 color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.show()

# Feature importance
importances = best.feature_importances_
feature_names = X.columns.tolist()
imp_idx = np.argsort(importances)[::-1][:30]  # top 30
plt.figure(figsize=(8,6))
# Reverse so the most important feature is drawn at the top of the chart.
plt.barh(range(len(imp_idx)), importances[imp_idx][::-1])
plt.yticks(range(len(imp_idx)), [feature_names[i] for i in imp_idx][::-1])
plt.title('Top feature importances (Random Forest)')
plt.tight_layout()
plt.show()

# PCA 2D visualization of scaled features colored by genre
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(np.vstack([X_train_scaled, X_test_scaled]))
y_all = np.concatenate([y_train, y_test])
plt.figure(figsize=(8,6))
for lab in np.unique(y_all):
    # Boolean mask keeps the selected coordinates 1-D; indexing with the
    # tuple returned by np.where would add a spurious leading axis.
    mask = y_all == lab
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], label=label_names[lab], s=10)
# Anchor the legend's upper-left corner just outside the axes; 'best' has
# no defined anchor corner when combined with bbox_to_anchor.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.title('PCA of audio features (2D)')
plt.xlabel('PC1'); plt.ylabel('PC2')
plt.tight_layout()
plt.show()

# Save model, scaler, label encoder, and the feature column order the
# model was trained on (needed to build inputs in the same order later).
with open('gtzan_rf_model.pkl', 'wb') as f:
    pickle.dump({'model': best, 'scaler': scaler, 'label_encoder': le,
                 'feature_names': feature_names}, f)
print('Saved model to gtzan_rf_model.pkl')


Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:

# 3) Example waveform and MFCC visualization for a sample file
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd

# Use sample-specific names here: the modelling cells keep the genre labels
# in `y` and the feature table in `df`, and rebinding those names to audio
# data would silently break re-running those cells.
sample_df = pd.read_csv('gtzan_features.csv')
sample = sample_df['file_path'].iloc[0]
sample_audio, sr = librosa.load(sample, sr=22050)
plt.figure(figsize=(10,3))
librosa.display.waveshow(sample_audio, sr=sr)
plt.title('Waveform: ' + sample)
plt.show()

mfcc = librosa.feature.mfcc(y=sample_audio, sr=sr, n_mfcc=13)
plt.figure(figsize=(10,4))
# Pass sr explicitly so the time axis stays correct if the load rate changes.
librosa.display.specshow(mfcc, sr=sr, x_axis='time')
plt.colorbar()
plt.title('MFCC (sample)')
plt.tight_layout()
plt.show()
