Level 3
Task 6: Music Genre Classification

Description:


*   Dataset (Recommended): GTZAN (Kaggle).
*   Classify songs into genres based on extracted audio features.
*   Preprocess features such as MFCCs or use spectrogram images.
*   Train and evaluate a multi-class model using tabular or image data.
*   If image-based, use a CNN model.







Tools & Libraries:


*   Python
*   Librosa (for features)
*   Scikit-learn or Keras


Covered Topics:


*   Audio data
*   CNNs
*   Multi-class classification








Bonus:


*   Try both tabular and image-based approaches and compare results.
*   Use transfer learning on spectrograms.



In [None]:
import csv
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import math
import librosa
from librosa.util import find_files

Data Loading

In [None]:
# Make Google Drive visible inside the Colab VM; the dataset paths used below
# all live under this mount point.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# ===== Collect WAV paths + labels (audio) =====

# Folder that contains one sub-folder per genre.
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"

# Every ".../<genre>/<file>.wav" found below AUDIO_ROOT.
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
if not wav_paths:
    # Without this guard the example print below dies with a bare IndexError,
    # which hides the real cause (Drive not mounted or a wrong path).
    raise FileNotFoundError(
        f"No .wav files found under {AUDIO_ROOT!r}; "
        "check that Drive is mounted and the path is correct."
    )

# The genre is the name of the folder holding the file. Backslashes are
# normalised first so the split also works on Windows-style paths.
labels = [p.replace("\\", "/").split("/")[-2] for p in wav_paths]

print(f"Found {len(wav_paths)} audio files across {len(set(labels))} genres.")
print("Example:", wav_paths[0], "->", labels[0])


Found 1000 audio files across 10 genres.
Example: /content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original/blues/blues.00000.wav -> blues


In [None]:
# Robust, faster MFCC extraction that skips bad files

def extract_mfcc_mean_safe(path, n_mfcc=13, sr=22050, duration=10, hop_length=1024):
    """
    Summarise one audio file as the time-average of its MFCCs.

    Args:
        path: audio file to decode.
        n_mfcc: number of MFCC coefficients, i.e. the length of the result.
        sr: sample rate the audio is resampled to on load.
        duration: number of seconds read from the start of the file.
        hop_length: hop between MFCC frames, in samples.

    Returns:
        A list of ``n_mfcc`` Python floats, or None when anything in the
        load/feature pipeline raises (the failure is printed, not re-raised).
    """
    try:
        signal, rate = librosa.load(
            path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast"
        )
        coeffs = librosa.feature.mfcc(
            y=signal, sr=rate, n_mfcc=n_mfcc, hop_length=hop_length
        )
        # Averaging over axis 1 leaves one value per coefficient.
        return list(map(float, coeffs.mean(axis=1)))
    except Exception as err:
        # Best effort: report the file and move on. Silence this print if it
        # becomes too noisy.
        print(f"[skip] {path} -> {type(err).__name__}: {err}")
        return None

# Extract one MFCC-mean vector per file (None marks a file that failed to load).
X_raw = list(map(extract_mfcc_mean_safe, wav_paths))  # wav_paths from the earlier cell
y_raw = labels  # genres, aligned with wav_paths

# Drop the failed files while keeping features and labels aligned.
kept_pairs = [(vec, genre) for vec, genre in zip(X_raw, y_raw) if vec is not None]
X = [vec for vec, _ in kept_pairs]
y = [genre for _, genre in kept_pairs]
skipped = sum(1 for vec, _ in zip(X_raw, y_raw) if vec is None)
print(f"Built features: kept {len(X)}, skipped {skipped}")

# Genre names -> integer ids, then a stratified 80/20 split.
le = LabelEncoder()
y_enc = le.fit_transform(y)

Xtr, Xte, ytr, yte = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# Standardise using statistics from the training portion only.
scaler = StandardScaler().fit(Xtr)
Xtr = scaler.transform(Xtr)
Xte = scaler.transform(Xte)

# Linear SVM as a quick baseline (an RBF SVC is a possible follow-up).
clf = LinearSVC(random_state=42).fit(Xtr, ytr)
yhat = clf.predict(Xte)

test_accuracy = accuracy_score(yte, yhat)
print("Accuracy:", f"{test_accuracy:.4f}")
print(classification_report(yte, yhat, target_names=list(le.classes_)))


  y, sr = librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[skip] /content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original/jazz/jazz.00054.wav -> NoBackendError: 
Built features: kept 999, skipped 1
Accuracy: 0.4450
              precision    recall  f1-score   support

       blues       0.41      0.65      0.50        20
   classical       0.70      0.80      0.74        20
     country       0.37      0.35      0.36        20
       disco       0.27      0.20      0.23        20
      hiphop       0.16      0.15      0.15        20
        jazz       0.36      0.25      0.29        20
       metal       0.61      0.85      0.71        20
         pop       0.57      0.80      0.67        20
      reggae       0.39      0.35      0.37        20
        rock       0.25      0.05      0.08        20

    accuracy                           0.45       200
   macro avg       0.41      0.44      0.41       200
weighted avg       0.41      0.45      0.41       200



In [None]:
# ===== Safe mel-spectrograms + CNN (skips unreadable files) =====

TARGET_MELS = 128
TARGET_FRAMES = 128

def mel_db_tensor_safe(path, sr=22050, duration=10, n_mels=128, hop_length=512,
                       target_mels=TARGET_MELS, target_frames=TARGET_FRAMES):
    """
    Convert one audio file into a normalised log-mel "image" for the CNN.

    Args:
        path: audio file to decode.
        sr: sample rate the audio is resampled to on load.
        duration: number of seconds read from the start of the file.
        n_mels: number of mel bands computed by librosa.
        hop_length: hop between spectrogram frames, in samples.
        target_mels: output height (mel axis is resized to this).
        target_frames: output width (time axis is cropped or padded to this).

    Returns:
        float32 tensor of shape [target_mels, target_frames, 1] scaled to
        [0, 1], or None if the file cannot be loaded or processed (the reason
        is printed).

    Note:
        With the defaults, 128 frames * 512 hop / 22050 Hz is about 3 s, so
        only the first ~3 s of the loaded audio end up in the tensor.
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        # dB relative to the clip's own peak (1.0 for an all-zero spectrogram),
        # so the loudest bin sits at 0 dB and everything else is negative.
        peak = S.max()
        S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)

        t = tf.convert_to_tensor(S_db, dtype=tf.float32)[:, :, None]  # [mels, T, 1]
        t = tf.image.resize(t, size=(target_mels, tf.shape(t)[1]))
        t = t[:, :target_frames, :]  # crop; a no-op when the clip is short

        t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
        missing = target_frames - int(t.shape[1])
        if missing > 0:
            # Pad with the quietest value, not 0: in this dB scale 0 is the
            # peak, so zero-padding would turn into full-intensity (1.0)
            # columns after the min-max scaling below.
            t = tf.pad(t, [[0, 0], [0, missing], [0, 0]], constant_values=t_min)

        # Per-sample min-max scaling; a constant tensor maps to all zeros.
        t = tf.cond(t_max > t_min, lambda: (t - t_min) / (t_max - t_min), lambda: tf.zeros_like(t))
        return t
    except Exception as e:
        # Best effort: report why the file was skipped instead of hiding it.
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# Build tensors while skipping failures (features and labels stay aligned)
X_tensors, y_clean = [], []
for p, lab in zip(wav_paths, labels):
    t = mel_db_tensor_safe(p)
    if t is not None:
        X_tensors.append(t)
        y_clean.append(lab)
print(f"Kept {len(X_tensors)} files, skipped {len(wav_paths) - len(X_tensors)} unreadable files.")
if not X_tensors:
    # tf.stack([]) would fail with an unrelated-looking error; stop clearly.
    raise RuntimeError("No audio file could be converted to a mel-spectrogram tensor.")

X_all = tf.stack(X_tensors, axis=0)
# simple label map without sklearn (ids follow the sorted genre names)
classes = sorted(set(y_clean))
cls2id = {c: i for i, c in enumerate(classes)}
y_all = tf.constant([cls2id[c] for c in y_clean], dtype=tf.int32)

# Seed Python, NumPy and TensorFlow together. The op-level seed on
# tf.random.shuffle alone does not reproduce the same permutation when this
# cell is re-run in the same kernel, and the model weights were unseeded.
tf.keras.utils.set_random_seed(42)

# random 80/20 split (pure TF; note that it is NOT stratified by genre)
N = tf.shape(X_all)[0]
idx = tf.random.shuffle(tf.range(N), seed=42)
val_size = tf.cast(tf.math.round(0.2 * tf.cast(N, tf.float32)), tf.int32)
idx_val = idx[:val_size]
idx_tr = idx[val_size:]

Xtr, ytr = tf.gather(X_all, idx_tr), tf.gather(y_all, idx_tr)
Xva, yva = tf.gather(X_all, idx_val), tf.gather(y_all, idx_val)

batch_size = 16
# Reshuffle the training examples every epoch so batches are not identical
# from one epoch to the next; the validation set keeps a fixed order.
train_ds = (
    tf.data.Dataset.from_tensor_slices((Xtr, ytr))
    .shuffle(buffer_size=int(Xtr.shape[0]), seed=42, reshuffle_each_iteration=True)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = tf.data.Dataset.from_tensor_slices((Xva, yva)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Three conv stages -> global average pooling -> dense softmax classifier.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TARGET_MELS, TARGET_FRAMES, 1)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(len(classes), activation="softmax"),
])
# Integer labels -> sparse categorical cross-entropy.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
hist = model.fit(train_ds, validation_data=val_ds, epochs=15)
# Accuracy of the final epoch (not necessarily the best epoch).
print("Validation accuracy:", hist.history["val_accuracy"][-1])


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Kept 999 files, skipped 1 unreadable files.
Epoch 1/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 547ms/step - accuracy: 0.1015 - loss: 2.3047 - val_accuracy: 0.0750 - val_loss: 2.3094
Epoch 2/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 533ms/step - accuracy: 0.1190 - loss: 2.2760 - val_accuracy: 0.1450 - val_loss: 2.2738
Epoch 3/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 521ms/step - accuracy: 0.1908 - loss: 2.1247 - val_accuracy: 0.1950 - val_loss: 2.1098
Epoch 4/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 489ms/step - accuracy: 0.2334 - loss: 2.0606 - val_accuracy: 0.2200 - val_loss: 2.0852
Epoch 5/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 521ms/step - accuracy: 0.2563 - loss: 2.0319 - val_accuracy: 0.2500 - val_loss: 2.0697
Epoch 6/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 505ms/step - accuracy: 0.2999 - loss: 1.9710 - val_accuracy: 0.29

In [None]:
# ===== images_original → CNN =====

# Root with one sub-folder of pre-rendered spectrogram images per genre.
IMG_DIR = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/images_original"

# Both subsets are read with the same split fraction, seed, size and batch size.
split_options = dict(validation_split=0.2, seed=42, image_size=(128, 128), batch_size=32)
train_ds = tf.keras.utils.image_dataset_from_directory(
    IMG_DIR, subset="training", **split_options
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    IMG_DIR, subset="validation", **split_options
)

# One output unit per class folder discovered by the loader.
n_genres = len(train_ds.class_names)
keras_layers = tf.keras.layers
cnn_stack = [
    keras_layers.Input(shape=(128, 128, 3)),
    keras_layers.Rescaling(1./255),  # pixel values 0..255 -> 0..1
    keras_layers.Conv2D(32, 3, activation="relu"),
    keras_layers.MaxPooling2D(),
    keras_layers.Conv2D(64, 3, activation="relu"),
    keras_layers.MaxPooling2D(),
    keras_layers.Conv2D(128, 3, activation="relu"),
    keras_layers.GlobalAveragePooling2D(),
    keras_layers.Dense(128, activation="relu"),
    keras_layers.Dense(n_genres, activation="softmax"),
]
model = tf.keras.Sequential(cnn_stack)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(train_ds, validation_data=val_ds, epochs=15)


Found 1001 files belonging to 10 classes.
Using 801 files for training.
Found 1001 files belonging to 10 classes.
Using 200 files for validation.
Epoch 1/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 2s/step - accuracy: 0.1176 - loss: 2.3070 - val_accuracy: 0.1550 - val_loss: 2.2985
Epoch 2/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 1s/step - accuracy: 0.1870 - loss: 2.2708 - val_accuracy: 0.1700 - val_loss: 2.1966
Epoch 3/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - accuracy: 0.1763 - loss: 2.1571 - val_accuracy: 0.1550 - val_loss: 2.0806
Epoch 4/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1s/step - accuracy: 0.1508 - loss: 2.0734 - val_accuracy: 0.1550 - val_loss: 2.0793
Epoch 5/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2s/step - accuracy: 0.2144 - loss: 2.0058 - val_accuracy: 0.2100 - val_loss: 2.0394
Epoch 6/15
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c16d5b5abd0>

In [None]:
# ===== genre classification — MFCCs =====

# 1) Locate every GTZAN clip and read its genre from the enclosing folder name
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"  # one sub-folder per genre
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
labels = []
for wav_path in wav_paths:
    # Normalise separators, then take the second-to-last path component.
    path_parts = wav_path.replace("\\", "/").split("/")
    labels.append(path_parts[-2])

# 2) Robust MFCC extractor (fast + skips bad files)
def mfcc_mean_safe(path, n_mfcc=13, sr=22050, duration=10, hop_length=1024):
    """
    Return the time-averaged MFCC vector of one audio file.

    Args:
        path: audio file to decode.
        n_mfcc: number of MFCC coefficients (length of the returned list).
        sr: sample rate the audio is resampled to on load.
        duration: number of seconds read from the start of the file.
        hop_length: hop between MFCC frames, in samples.

    Returns:
        List of ``n_mfcc`` floats, or None when the file yields no samples or
        any step raises. The reason is printed so that a systematic failure
        (for example a missing audio backend) is not mistaken for a handful of
        corrupt files.
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
        feat = mfcc.mean(axis=1)  # one value per coefficient (ndarray method, no numpy import needed)
        return [float(v) for v in feat]
    except Exception as e:
        # Best effort: skip the file, but say why.
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# 3) Build dataset: one feature vector per readable file, labels kept aligned
X_raw = list(map(mfcc_mean_safe, wav_paths))
usable = [(vec, genre) for vec, genre in zip(X_raw, labels) if vec is not None]
X = [vec for vec, _ in usable]
y = [genre for _, genre in usable]

# 4) Encode genre names as integers, split 80/20 (stratified), scale, train
le = LabelEncoder()
y_enc = le.fit_transform(y)

Xtr, Xte, ytr, yte = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# Scaling statistics come from the training portion only.
scaler = StandardScaler().fit(Xtr)
Xtr = scaler.transform(Xtr)
Xte = scaler.transform(Xte)

# Multinomial logistic regression baseline.
clf = LogisticRegression(max_iter=1000, n_jobs=-1).fit(Xtr, ytr)
yhat = clf.predict(Xte)

test_accuracy = accuracy_score(yte, yhat)
print("Accuracy:", f"{test_accuracy:.4f}")
print(classification_report(yte, yhat, target_names=list(le.classes_)))


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Accuracy: 0.4400
              precision    recall  f1-score   support

       blues       0.35      0.40      0.37        20
   classical       0.88      0.75      0.81        20
     country       0.32      0.35      0.33        20
       disco       0.27      0.20      0.23        20
      hiphop       0.25      0.30      0.27        20
        jazz       0.26      0.30      0.28        20
       metal       0.68      0.75      0.71        20
         pop       0.64      0.80      0.71        20
      reggae       0.42      0.40      0.41        20
        rock       0.30      0.15      0.20        20

    accuracy                           0.44       200
   macro avg       0.44      0.44      0.43       200
weighted avg       0.44      0.44      0.43       200



In [None]:
# ===== genre classification — Mel-spectrograms =====

# Locate the clips; the genre is the name of the folder that holds each file.
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
labels = [wav_path.replace("\\", "/").rsplit("/", 2)[-2] for wav_path in wav_paths]

# Genre name -> integer id, numbered in sorted-name order (no sklearn needed).
classes = sorted(set(labels))
cls2id = dict(zip(classes, range(len(classes))))
y_all = tf.constant(list(map(cls2id.__getitem__, labels)), dtype=tf.int32)

TARGET_MELS, TARGET_FRAMES = 128, 128

def mel_db_tensor_safe(path, sr=22050, duration=10, n_mels=128, hop_length=512,
                       target_mels=TARGET_MELS, target_frames=TARGET_FRAMES):
    """
    Load an audio file and return its log-mel spectrogram as a CNN input.

    The spectrogram is converted to dB relative to its own peak, resized to
    ``target_mels`` rows, cropped or padded to ``target_frames`` columns and
    min-max scaled per sample.

    Returns:
        float32 tensor [target_mels, target_frames, 1] with values in [0, 1],
        or None when loading/processing fails (the reason is printed).
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        # Reference = clip peak (1.0 if the spectrogram is all zeros): the
        # loudest bin becomes 0 dB, all other bins are negative.
        peak = S.max()
        S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)          # [mels,T]
        t = tf.convert_to_tensor(S_db, dtype=tf.float32)[:, :, None]           # [mels,T,1]
        t = tf.image.resize(t, size=(target_mels, tf.shape(t)[1]))             # fix mel bins
        t = t[:, :target_frames, :]                                            # crop long clips

        t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
        missing = target_frames - int(t.shape[1])
        if missing > 0:
            # Short clip: pad with the quietest value. Padding with 0 would
            # insert peak-level (0 dB) columns that scale to 1.0 below.
            t = tf.pad(t, [[0, 0], [0, missing], [0, 0]], constant_values=t_min)

        # min-max normalize per sample (constant input -> zeros)
        t = tf.cond(t_max > t_min, lambda: (t - t_min) / (t_max - t_min), lambda: tf.zeros_like(t))
        return t
    except Exception as e:
        # Keep going, but make the reason for the skip visible.
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# Build tensors (skip bad files); ids are looked up directly from the label
# map instead of round-tripping the label tensor through numpy.
X_tensors, y_clean = [], []
for p, lab in zip(wav_paths, labels):
    t = mel_db_tensor_safe(p)
    if t is not None:
        X_tensors.append(t)
        y_clean.append(cls2id[lab])
if not X_tensors:
    # tf.stack([]) would fail with an unrelated-looking error; stop clearly.
    raise RuntimeError("No audio file could be converted to a mel-spectrogram tensor.")

X_all = tf.stack(X_tensors, axis=0)                           # [N,TARGET_MELS,TARGET_FRAMES,1]
y_all = tf.constant(y_clean, dtype=tf.int32)

# Seed Python, NumPy and TensorFlow together: the op-level seed below does not
# by itself repeat the permutation when the cell is re-run in the same kernel,
# and the weight initialisation was otherwise unseeded.
tf.keras.utils.set_random_seed(42)

# Train/val split (pure TF, random 80/20, not stratified)
N = tf.shape(X_all)[0]
idx = tf.random.shuffle(tf.range(N), seed=42)
val_n = tf.cast(tf.math.round(0.2 * tf.cast(N, tf.float32)), tf.int32)
idx_val, idx_tr = idx[:val_n], idx[val_n:]
Xtr, ytr = tf.gather(X_all, idx_tr), tf.gather(y_all, idx_tr)
Xva, yva = tf.gather(X_all, idx_val), tf.gather(y_all, idx_val)

# Training examples are reshuffled every epoch; validation order stays fixed.
train_ds = (
    tf.data.Dataset.from_tensor_slices((Xtr, ytr))
    .shuffle(buffer_size=int(Xtr.shape[0]), seed=42, reshuffle_each_iteration=True)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds   = tf.data.Dataset.from_tensor_slices((Xva, yva)).batch(16).prefetch(tf.data.AUTOTUNE)

# Small CNN: three conv stages, global average pooling, dense softmax head
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TARGET_MELS, TARGET_FRAMES, 1)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(len(classes), activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(train_ds, validation_data=val_ds, epochs=15, verbose=1)
# Final-epoch validation accuracy (not necessarily the best epoch).
print("Val accuracy:", history.history["val_accuracy"][-1])


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Epoch 1/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 878ms/step - accuracy: 0.0999 - loss: 2.3059 - val_accuracy: 0.0850 - val_loss: 2.3119
Epoch 2/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 452ms/step - accuracy: 0.1135 - loss: 2.2896 - val_accuracy: 0.1300 - val_loss: 2.2789
Epoch 3/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 483ms/step - accuracy: 0.1671 - loss: 2.1985 - val_accuracy: 0.1550 - val_loss: 2.1561
Epoch 4/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 722ms/step - accuracy: 0.2053 - loss: 2.1288 - val_accuracy: 0.1850 - val_loss: 2.1070
Epoch 5/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 904ms/step - accuracy: 0.2391 - loss: 2.0782 - val_accuracy: 0.2300 - val_loss: 2.0533
Epoch 6/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 526ms/step - accuracy: 0.2490 - loss: 2.0309 - val_accuracy: 0.2450 - val_loss: 2.0310
Epoch 7/15
[1m50/50[

In [None]:
# ===== MFCC tabular pipeline =====

# 1) Gather the clips; each label is the name of the file's parent folder
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"  # change if needed


def _parent_folder(file_path):
    """Return the second-to-last component of a '/'- or '\\'-separated path."""
    return file_path.replace("\\", "/").split("/")[-2]


wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
labels = list(map(_parent_folder, wav_paths))

# 2) Robust/fast MFCC extractor (skips unreadable files)
def mfcc_mean_safe(path, n_mfcc=13, sr=22050, duration=10, hop_length=1024):
    """
    Compute a fixed-length tabular feature vector: the mean of each MFCC
    coefficient over time.

    Args:
        path: audio file to decode.
        n_mfcc: number of MFCC coefficients (length of the returned list).
        sr: sample rate the audio is resampled to on load.
        duration: number of seconds read from the start of the file.
        hop_length: hop between MFCC frames, in samples.

    Returns:
        List of ``n_mfcc`` floats, or None if the file yields no samples or
        any step raises. The cause is printed rather than silently discarded,
        so a broken environment cannot masquerade as "unreadable files".
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
        feat = mfcc.mean(axis=1)  # ndarray supports .mean without importing numpy
        return [float(v) for v in feat]
    except Exception as e:
        # Best effort: skip this file but report the exception type and text.
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# 3) Build tabular dataset (files whose features are None are dropped,
#    features and labels stay aligned)
X_raw = [mfcc_mean_safe(p) for p in wav_paths]
X, y = [], []
for xi, yi in zip(X_raw, labels):
    if xi is not None:
        X.append(xi)
        y.append(yi)

# 4) Encode labels, split (stratified 80/20), scale with training statistics
le = LabelEncoder()
y_enc = le.fit_transform(y)

Xtr, Xte, ytr, yte = train_test_split(X, y_enc, test_size=0.2, stratify=y_enc, random_state=42)

scaler = StandardScaler()
Xtr = scaler.fit_transform(Xtr)
Xte = scaler.transform(Xte)

# 5) Train a multi-class model (fast strong baseline).
#    `multi_class` is intentionally not passed: the argument is deprecated in
#    recent scikit-learn releases and "auto" was the default anyway, so
#    leaving it out keeps the same model without the FutureWarning.
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(Xtr, ytr)

# 6) Evaluate on the held-out 20%
yhat = clf.predict(Xte)
print("Accuracy:", f"{accuracy_score(yte, yhat):.4f}")
print(classification_report(yte, yhat, target_names=list(le.classes_)))


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Accuracy: 0.4400
              precision    recall  f1-score   support

       blues       0.35      0.40      0.37        20
   classical       0.88      0.75      0.81        20
     country       0.32      0.35      0.33        20
       disco       0.27      0.20      0.23        20
      hiphop       0.25      0.30      0.27        20
        jazz       0.26      0.30      0.28        20
       metal       0.68      0.75      0.71        20
         pop       0.64      0.80      0.71        20
      reggae       0.42      0.40      0.41        20
        rock       0.30      0.15      0.20        20

    accuracy                           0.44       200
   macro avg       0.44      0.44      0.43       200
weighted avg       0.44      0.44      0.43       200



In [None]:
# ===== Mel-spectrogram image pipeline =====

# 1) Find the clips and name each one after the folder it sits in
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
labels = []
for wav_path in wav_paths:
    labels.append(wav_path.replace("\\", "/").split("/")[-2])

# Integer ids without sklearn: position of each genre in the sorted name list
classes = sorted(set(labels))
cls2id = {}
for class_id, class_name in enumerate(classes):
    cls2id[class_name] = class_id
y_all = tf.constant([cls2id[genre] for genre in labels], dtype=tf.int32)

# 2) Robust mel-spectrogram tensor (skips unreadable files if any)
TARGET_MELS, TARGET_FRAMES = 128, 128
def mel_db_tensor_safe(path, sr=22050, duration=10, n_mels=128, hop_length=512,
                       target_mels=TARGET_MELS, target_frames=TARGET_FRAMES):
    """
    Build the image-like CNN input for one audio file.

    Steps: load -> mel spectrogram -> dB relative to the clip peak -> resize
    the mel axis -> crop/pad the time axis -> per-sample min-max scaling.

    Returns:
        float32 tensor [target_mels, target_frames, 1] in [0, 1], or None if
        anything fails (the reason is printed so skips are not silent).
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        # Peak-referenced dB (reference 1.0 when the spectrogram is all zeros).
        peak = S.max()
        S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)          # [mels,T]
        t = tf.convert_to_tensor(S_db, dtype=tf.float32)[:, :, None]           # [mels,T,1]
        # Resize mel bins, then crop time frames
        t = tf.image.resize(t, size=(target_mels, tf.shape(t)[1]))
        t = t[:, :target_frames, :]

        t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
        missing = target_frames - int(t.shape[1])
        if missing > 0:
            # Pad short clips with the minimum, not with 0: 0 dB is the peak
            # here, so zero-padding would appear as maximum-intensity frames
            # once the tensor is min-max scaled.
            t = tf.pad(t, [[0, 0], [0, missing], [0, 0]], constant_values=t_min)

        # Min-max normalize per sample (constant input -> zeros)
        t = tf.cond(t_max > t_min, lambda: (t - t_min) / (t_max - t_min), lambda: tf.zeros_like(t))
        return t
    except Exception as e:
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# 3) Build tensor dataset (skip failures, keep labels aligned)
X_tensors, y_clean = [], []
for p, lab in zip(wav_paths, labels):
    t = mel_db_tensor_safe(p)
    if t is not None:
        X_tensors.append(t)
        y_clean.append(cls2id[lab])
if not X_tensors:
    # tf.stack([]) would fail with an unrelated-looking error; stop clearly.
    raise RuntimeError("No audio file could be converted to a mel-spectrogram tensor.")

X_all = tf.stack(X_tensors, axis=0)             # [N, TARGET_MELS, TARGET_FRAMES, 1]
y_all = tf.constant(y_clean, dtype=tf.int32)    # [N]

# Seed Python, NumPy and TensorFlow together so the split and the weight
# initialisation repeat on re-run; the op-level seed alone does not do that
# when the cell is executed again in the same kernel.
tf.keras.utils.set_random_seed(42)

# 4) Train/validation split (pure TF). This is a plain random 80/20 split:
#    shuffling does NOT stratify, so per-genre validation counts can differ.
N = tf.shape(X_all)[0]
idx = tf.random.shuffle(tf.range(N), seed=42)
val_n = tf.cast(tf.math.round(0.2 * tf.cast(N, tf.float32)), tf.int32)
idx_val, idx_tr = idx[:val_n], idx[val_n:]
Xtr, ytr = tf.gather(X_all, idx_tr), tf.gather(y_all, idx_tr)
Xva, yva = tf.gather(X_all, idx_val), tf.gather(y_all, idx_val)

# Training data is reshuffled each epoch. The validation set must keep its
# order so that predictions line up with `yva` in the confusion matrix below.
train_ds = (
    tf.data.Dataset.from_tensor_slices((Xtr, ytr))
    .shuffle(buffer_size=int(Xtr.shape[0]), seed=42, reshuffle_each_iteration=True)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds   = tf.data.Dataset.from_tensor_slices((Xva, yva)).batch(16).prefetch(tf.data.AUTOTUNE)

# 5) Small CNN
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TARGET_MELS, TARGET_FRAMES, 1)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(len(classes), activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 6) Train & evaluate (the printed value is the final epoch, not the best one)
history = model.fit(train_ds, validation_data=val_ds, epochs=15, verbose=1)
print("Validation accuracy:", history.history["val_accuracy"][-1])

# Optional: confusion matrix (pure TF). Rows are true genres, columns are
# predicted genres; `yva` is in the same order as the unshuffled val_ds.
y_pred = tf.argmax(model.predict(val_ds, verbose=0), axis=1)
cm = tf.math.confusion_matrix(
    yva,
    y_pred,
    num_classes=len(classes)
)
print("Confusion matrix (val):")
tf.print(cm)


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Epoch 1/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 444ms/step - accuracy: 0.0829 - loss: 2.3057 - val_accuracy: 0.0800 - val_loss: 2.3034
Epoch 2/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 483ms/step - accuracy: 0.1369 - loss: 2.2839 - val_accuracy: 0.2300 - val_loss: 2.1991
Epoch 3/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 434ms/step - accuracy: 0.2371 - loss: 2.1315 - val_accuracy: 0.2550 - val_loss: 2.0813
Epoch 4/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 465ms/step - accuracy: 0.2465 - loss: 2.0335 - val_accuracy: 0.2700 - val_loss: 2.0359
Epoch 5/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 428ms/step - accuracy: 0.2290 - loss: 2.0001 - val_accuracy: 0.2350 - val_loss: 1.9812
Epoch 6/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 475ms/step - accuracy: 0.2676 - loss: 1.9557 - val_accuracy: 0.3650 - val_loss: 1.9098
Epoch 7/15
[1m50/50[

In [None]:
# ===== Mel-spectrograms → CNN =====

# -----------------------
# 1) Locate the WAV files and label each with its parent folder name
# -----------------------
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"  # root folder with one sub-folder per genre
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
normalised_paths = (wav_path.replace("\\", "/") for wav_path in wav_paths)
labels = [unix_path.split("/")[-2] for unix_path in normalised_paths]

# Class name -> id in sorted-name order (no sklearn)
classes = sorted(set(labels))
cls2id = dict((class_name, class_id) for class_id, class_name in enumerate(classes))
label_ids = [cls2id[genre] for genre in labels]
y_all = tf.constant(label_ids, dtype=tf.int32)

# -----------------------
# 2) WAV -> mel-spectrogram tensor (robust; skips unreadable files)
# -----------------------
TARGET_MELS, TARGET_FRAMES = 128, 128  # final "image" size HxW
SR = 22050                              # fixed sample rate
DUR = 10                                # seconds to load (speed knob)
HOP = 512

def mel_db_tensor_safe(path,
                       sr=SR, duration=DUR, n_mels=128, hop_length=HOP,
                       target_mels=TARGET_MELS, target_frames=TARGET_FRAMES):
    """Return a [target_mels, target_frames, 1] tensor in [0, 1], or None on failure.

    The spectrogram is expressed in dB relative to the clip's own peak, the
    mel axis is resized, the time axis is cropped or padded, and the result is
    min-max scaled per sample. Failures are printed rather than swallowed.

    With the defaults, 128 frames * 512 hop / 22050 Hz is about 3 s, so only
    the first ~3 s of the DUR seconds loaded reach the tensor.
    """
    try:
        y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            print(f"[skip] {path} -> decoded to an empty signal")
            return None
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        # Peak-referenced dB; fall back to 1.0 for an all-zero spectrogram.
        peak = S.max()
        S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)  # [mels, T]

        # to tensor [mels, T, 1]
        t = tf.convert_to_tensor(S_db, dtype=tf.float32)
        t = tf.expand_dims(t, -1)

        # resize mel bins (H) to target, then crop time (W) to at most target
        t = tf.image.resize(t, size=(target_mels, tf.shape(t)[1]))
        t = t[:, :target_frames, :]

        t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
        missing = target_frames - int(t.shape[1])
        if missing > 0:
            # Pad short clips with the quietest value. Zero-padding is wrong
            # in this scale: 0 dB is the peak, so the padding would become
            # 1.0 after the normalisation below.
            t = tf.pad(t, [[0, 0], [0, missing], [0, 0]], constant_values=t_min)

        # per-sample min-max normalization to [0,1] (constant input -> zeros)
        return tf.cond(t_max > t_min, lambda: (t - t_min) / (t_max - t_min),
                       lambda: tf.zeros_like(t))
    except Exception as e:
        print(f"[skip] {path} -> {type(e).__name__}: {e}")
        return None

# Build tensors, skipping bad files. Ids come straight from the label map;
# unstacking the label tensor into one tensor per file is unnecessary.
X_tensors, y_clean = [], []
for p, lab in zip(wav_paths, labels):
    t = mel_db_tensor_safe(p)
    if t is not None:
        X_tensors.append(t)
        y_clean.append(cls2id[lab])
if not X_tensors:
    # tf.stack([]) would fail with an unrelated-looking error; stop clearly.
    raise RuntimeError("No audio file could be converted to a mel-spectrogram tensor.")

X_all = tf.stack(X_tensors, axis=0)                 # [N, TARGET_MELS, TARGET_FRAMES, 1]
y_all = tf.constant(y_clean, dtype=tf.int32)        # [N]
print("Kept:", X_all.shape[0], "samples")

# Seed Python, NumPy and TensorFlow together so the split and the initial
# weights repeat on re-run (the op-level seed alone does not guarantee that
# when the cell is executed again in the same kernel).
tf.keras.utils.set_random_seed(42)

# -----------------------
# 3) Train/val split (pure TF, random 80/20, not stratified)
# -----------------------
N = tf.shape(X_all)[0]
idx = tf.random.shuffle(tf.range(N), seed=42)
val_n = tf.cast(tf.math.round(0.2 * tf.cast(N, tf.float32)), tf.int32)
idx_val, idx_tr = idx[:val_n], idx[val_n:]

Xtr, ytr = tf.gather(X_all, idx_tr), tf.gather(y_all, idx_tr)
Xva, yva = tf.gather(X_all, idx_val), tf.gather(y_all, idx_val)

BATCH = 16
# Reshuffle training examples each epoch; keep validation order fixed so the
# predictions below line up with `yva`.
train_ds = (
    tf.data.Dataset.from_tensor_slices((Xtr, ytr))
    .shuffle(buffer_size=int(Xtr.shape[0]), seed=42, reshuffle_each_iteration=True)
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds   = tf.data.Dataset.from_tensor_slices((Xva, yva)).batch(BATCH).prefetch(tf.data.AUTOTUNE)

# -----------------------
# 4) CNN model
# -----------------------
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TARGET_MELS, TARGET_FRAMES, 1)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(len(classes), activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# -----------------------
# 5) Train & evaluate
# -----------------------
EPOCHS = 15
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, verbose=1)
# Final-epoch validation accuracy (not necessarily the best epoch).
print("Val accuracy:", history.history["val_accuracy"][-1])

# Confusion matrix (pure TF): rows = true genre, columns = predicted genre.
y_val_true = yva
y_val_pred = tf.argmax(model.predict(val_ds, verbose=0), axis=1)
cm = tf.math.confusion_matrix(y_val_true, y_val_pred, num_classes=len(classes))
print("Confusion matrix (validation):")
tf.print(cm)

# Optional: per-class accuracy = diagonal / row total (i.e. recall per genre);
# divide_no_nan yields 0 for a genre that has no validation samples.
correct_per_class = tf.linalg.diag_part(cm)
counts_per_class  = tf.reduce_sum(cm, axis=1)
per_class_acc = tf.math.divide_no_nan(tf.cast(correct_per_class, tf.float32),
                                      tf.cast(counts_per_class, tf.float32))
for class_name, acc in zip(classes, per_class_acc.numpy()):
    print(f"{class_name:<10s}: {acc:.3f}")


  y, sr = librosa.load(path, sr=sr, mono=True, duration=duration, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Kept: 999 samples


Epoch 1/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 458ms/step - accuracy: 0.1222 - loss: 2.3078 - val_accuracy: 0.1350 - val_loss: 2.3024
Epoch 2/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 443ms/step - accuracy: 0.1491 - loss: 2.2934 - val_accuracy: 0.2400 - val_loss: 2.2556
Epoch 3/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 529ms/step - accuracy: 0.1913 - loss: 2.2211 - val_accuracy: 0.1850 - val_loss: 2.1089
Epoch 4/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 459ms/step - accuracy: 0.2023 - loss: 2.0606 - val_accuracy: 0.2000 - val_loss: 2.0441
Epoch 5/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 452ms/step - accuracy: 0.2308 - loss: 2.0535 - val_accuracy: 0.2350 - val_loss: 1.9994
Epoch 6/15
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 437ms/step - accuracy: 0.2450 - loss: 2.0183 - val_accuracy: 0.2350 - val_loss: 1.9753
Epoch 7/15
[1m50/50[

In [None]:
# ===== Compare Tabular (MFCC) vs Image (Mel-spec) =====

# -----------------------
# 0) Config: every tunable value used by this comparison cell
# -----------------------
AUDIO_ROOT = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"  # folder with one sub-folder per genre
SR = 22050            # sample rate every clip is resampled to
DUR = 10              # seconds of audio loaded per clip (speed knob)
N_MFCC = 13           # number of MFCCs in the tabular feature vector
HOP_MFCC = 1024       # hop length (samples) for the MFCC frames
N_MELS = 128          # mel bands computed for the spectrogram
HOP_MEL = 512         # hop length (samples) for the mel frames
TARGET_MELS = 128     # image height fed to the CNN
TARGET_FRAMES = 128   # image width fed to the CNN
BATCH = 16
EPOCHS = 12
SEED = 42

# -----------------------
# 1) Collect paths + labels (string handling only, no os module)
# -----------------------
wav_paths = find_files(AUDIO_ROOT, ext=["wav"], recurse=True)
labels_raw = []
for wav_path in wav_paths:
    # parent folder name = genre
    labels_raw.append(wav_path.replace("\\", "/").split("/")[-2])

# Integer ids in sorted-name order; the string labels are kept alongside them.
classes = sorted(set(labels_raw))
cls2id = dict(zip(classes, range(len(classes))))
y_ids = [cls2id[genre] for genre in labels_raw]

# -----------------------
# 2) One-pass feature builder: MFCC (tabular) + Mel-spec (image)
#    - robust: skip unreadable files
# -----------------------
def extract_both_safe(path):
    """
    Decode one audio file once and build both feature views from it.

    Args:
        path: filesystem path of the audio file.

    Returns:
        (mfcc_vec, mel_tensor) where
          * mfcc_vec is a list of N_MFCC floats (per-coefficient mean over time),
          * mel_tensor is a float32 tf.Tensor of shape
            [TARGET_MELS, TARGET_FRAMES, 1], min-max scaled to [0, 1].
        (None, None) when the file cannot be decoded or processed, so the
        caller can skip it.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        y, sr = librosa.load(path, sr=SR, mono=True, duration=DUR, res_type="kaiser_fast")
        if y is None or len(y) == 0:
            return None, None

        # MFCC mean features (tabular): one mean per coefficient
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, hop_length=HOP_MFCC)
        mfcc_vec = [float(v) for v in mfcc.mean(axis=1)]

        # Mel-spectrogram in dB relative to its own peak (so values are <= 0);
        # a silent clip has peak 0, in which case 1.0 is used as the reference.
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_MEL)
        peak = S.max()
        S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)   # [mels, T]
        t = tf.convert_to_tensor(S_db, dtype=tf.float32)[:, :, None]    # [mels, T, 1]

        # Resize the mel axis only; the time axis is padded/cropped below.
        n_frames = int(t.shape[1])
        t = tf.image.resize(t, size=(TARGET_MELS, n_frames))

        if n_frames < TARGET_FRAMES:
            # Pad with the quietest value present. Zero-padding would insert
            # 0 dB, which is the *loudest* value on this peak-referenced scale.
            t = tf.pad(
                t,
                [[0, 0], [0, TARGET_FRAMES - n_frames], [0, 0]],
                constant_values=tf.reduce_min(t),
            )
        else:
            t = t[:, :TARGET_FRAMES, :]

        # Per-sample min-max normalisation to [0, 1]; a constant image maps to zeros.
        t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
        if bool(t_max > t_min):
            t = (t - t_min) / (t_max - t_min)
        else:
            t = tf.zeros_like(t)
        return mfcc_vec, t
    except Exception as exc:
        # Best effort: an unreadable file must not abort the whole run, but the
        # reason is surfaced so systematic failures do not go unnoticed.
        warnings.warn(f"Skipping {path}: {type(exc).__name__}: {exc}")
        return None, None

# Run the extractor over every clip. The four lists are only appended to
# together, so features and labels stay index-aligned; failed clips are counted.
X_mfcc_all, X_mel_tensors, y_all_ids, y_all_labels = [], [], [], []
skipped = 0
for p, lab_id, lab_str in zip(wav_paths, y_ids, labels_raw):
    f_tab, f_img = extract_both_safe(p)
    if f_tab is not None and f_img is not None:
        X_mfcc_all.append(f_tab)
        X_mel_tensors.append(f_img)
        y_all_ids.append(lab_id)
        y_all_labels.append(lab_str)
    else:
        skipped += 1

print(f"Prepared samples: {len(X_mfcc_all)}  |  Skipped unreadable: {skipped}")

# Combine the per-clip mel images into one batch tensor: [N, H, W, 1]
X_mel_all = tf.stack(X_mel_tensors)
y_all = tf.constant(y_all_ids, dtype=tf.int32)

# -----------------------
# 3) One stratified split used by BOTH models
# -----------------------
# Splitting sample *indices* (not features) lets the tabular and image models
# be evaluated on exactly the same clips.
idx_all = [*range(len(y_all_ids))]
idx_tr, idx_va, y_tr_ids, y_va_ids = train_test_split(
    idx_all,
    y_all_ids,
    test_size=0.2,
    random_state=SEED,
    stratify=y_all_ids,
)

# Tabular view of the split
Xtr_tab = list(map(X_mfcc_all.__getitem__, idx_tr))
Xva_tab = list(map(X_mfcc_all.__getitem__, idx_va))
ytr_tab, yva_tab = y_tr_ids, y_va_ids

# Image view of the same split
Xtr_img = tf.gather(X_mel_all, tf.constant(idx_tr, dtype=tf.int32))
Xva_img = tf.gather(X_mel_all, tf.constant(idx_va, dtype=tf.int32))
ytr_img = tf.constant(ytr_tab, dtype=tf.int32)
yva_img = tf.constant(yva_tab, dtype=tf.int32)

# -----------------------
# 4) TABULAR model: MFCCs + Logistic Regression
# -----------------------
# Standardise features; statistics come from the training split only so the
# validation split does not leak into the scaler.
scaler = StandardScaler()
Xtr_tab_s = scaler.fit_transform(Xtr_tab)
Xva_tab_s = scaler.transform(Xva_tab)

# `multi_class` is deprecated in recent scikit-learn releases; "auto" was its
# default, so omitting it keeps the same model without the FutureWarning.
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(Xtr_tab_s, ytr_tab)
yhat_tab = clf.predict(Xva_tab_s)

acc_tab = accuracy_score(yva_tab, yhat_tab)
print("\n=== Tabular (MFCC + Logistic Regression) ===")
print("Accuracy:", f"{acc_tab:.4f}")
print(classification_report(yva_tab, yhat_tab, target_names=classes))

# Confusion matrix (tabular) using TensorFlow for symmetry with the image model
cm_tab = tf.math.confusion_matrix(
    tf.constant(yva_tab, dtype=tf.int32),
    tf.constant(yhat_tab, dtype=tf.int32),
    num_classes=len(classes)
)
print("Confusion matrix (tabular):")
# summarize=-1 prints every row/column; the default elides the middle of the
# matrix with "...", hiding most of the per-genre confusions.
tf.print(cm_tab, summarize=-1)

# -----------------------
# 5) IMAGE model: Mel-spectrogram + small CNN
# -----------------------
# Reshuffle the training set every epoch (tf.data's default for .shuffle) so
# the mini-batches are not identical from epoch to epoch. Validation keeps its
# order so predictions line up with the labels below.
train_ds = (
    tf.data.Dataset.from_tensor_slices((Xtr_img, ytr_img))
    .shuffle(int(Xtr_img.shape[0]), seed=SEED)
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    tf.data.Dataset.from_tensor_slices((Xva_img, yva_img))
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TARGET_MELS, TARGET_FRAMES, 1)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(len(classes), activation="softmax"),
])
# Labels are integer ids, hence the sparse variant of the loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, verbose=1)

# Evaluate: the reported figure is the validation accuracy of the final epoch.
val_acc_img = history.history["val_accuracy"][-1]
print("\n=== Image (Mel-spectrogram + CNN) ===")
print("Validation accuracy:", f"{val_acc_img:.4f}")

y_pred_img = tf.argmax(model.predict(val_ds, verbose=0), axis=1)
cm_img = tf.math.confusion_matrix(
    tf.concat([y for _, y in val_ds], axis=0),
    y_pred_img,
    num_classes=len(classes)
)
print("Confusion matrix (image):")
# summarize=-1 prints the whole matrix instead of an elided "..." preview.
tf.print(cm_img, summarize=-1)

# -----------------------
# 6) Side-by-side comparison
# -----------------------
print("\n=== Side-by-side ===")
print(f"Tabular (MFCC + LR)  Acc: {acc_tab:.4f}")
print(f"Image   (Mel + CNN)  Acc: {val_acc_img:.4f}")


  y, sr = librosa.load(path, sr=SR, mono=True, duration=DUR, res_type="kaiser_fast")
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Prepared samples: 999  |  Skipped unreadable: 1





=== Tabular (MFCC + Logistic Regression) ===
Accuracy: 0.4400
              precision    recall  f1-score   support

       blues       0.35      0.40      0.37        20
   classical       0.88      0.75      0.81        20
     country       0.32      0.35      0.33        20
       disco       0.27      0.20      0.23        20
      hiphop       0.25      0.30      0.27        20
        jazz       0.26      0.30      0.28        20
       metal       0.68      0.75      0.71        20
         pop       0.64      0.80      0.71        20
      reggae       0.42      0.40      0.41        20
        rock       0.30      0.15      0.20        20

    accuracy                           0.44       200
   macro avg       0.44      0.44      0.43       200
weighted avg       0.44      0.44      0.43       200

Confusion matrix (tabular):
[[8 0 2 ... 0 2 2]
 [1 15 1 ... 0 0 1]
 [2 1 7 ... 0 1 1]
 ...
 [0 0 0 ... 16 2 0]
 [2 0 2 ... 1 8 0]
 [3 0 4 ... 1 0 3]]
Epoch 1/12
[1m50/50[0m [3

In [None]:
# ===== Compare CSV tabular vs PNG spectrograms =====
# Tabular branch: features_30_sec.csv (or features_3_sec.csv) -> Logistic Regression
# Image branch:   images_original/<genre>/*.png               -> small CNN

# --- Dataset locations (edit to match your Drive layout) ---
CSV_PATH      = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/features_30_sec.csv"     # or "features_3_sec.csv"
IMAGES_ROOT   = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/images_original"          # with 10 genre subfolders

SEED          = 42
TAB_TEST_SIZE = 0.2
IMG_TEST_SIZE = 0.2

# ============================
# A) TABULAR: CSV -> scikit-learn
# ============================
# 1) Only the header row is needed here; the data rows are read later.
with open(CSV_PATH, "r", newline="", encoding="utf-8") as f:
    header = next(csv.reader(f))

# The label column is the first header cell whose trimmed, lower-cased name
# is one of the accepted spellings.
label_candidates = {"label", "genre", "class", "target"}
label_idx = next(
    (pos for pos, name in enumerate(header) if name.strip().lower() in label_candidates),
    None,
)
if label_idx is None:
    raise ValueError(f"Could not find label column in CSV header: {header}")

# 2) Choose feature columns (MFCCs + common spectral stats if present)
def want_feature(colname: str) -> bool:
    """
    Decide whether a CSV column should be used as a model feature.

    Matching is done on the trimmed, lower-cased column name. A column is
    accepted when it is an MFCC statistic (name starts with "mfcc") or one of
    a small set of summary statistics.

    Args:
        colname: raw column name from the CSV header.

    Returns:
        True if the column is a selected feature, False otherwise.
    """
    n = colname.strip().lower()
    selected_stats = {
        "chroma_stft_mean", "spectral_centroid_mean",
        "spectral_bandwidth_mean", "rolloff_mean",
        "rms_mean",
        # Zero-crossing rate: the short alias is kept for compatibility, and
        # the spelled-out name is accepted as well because the GTZAN feature
        # CSVs use it, so the alias alone would silently drop this feature.
        "zcr_mean", "zero_crossing_rate_mean",
    }
    return n.startswith("mfcc") or n in selected_stats

feat_indices = [i for i, c in enumerate(header) if i != label_idx and want_feature(c)]
if not feat_indices:
    raise ValueError("No feature columns detected. Inspect your CSV header and adjust selectors.")

# 3) Load rows
X_tab, y_tab = [], []
skipped_rows = 0
with open(CSV_PATH, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    for row in reader:
        try:
            feats = [float(row[i]) for i in feat_indices]
            label = row[label_idx]
        except (ValueError, IndexError):
            # ValueError: a cell is not numeric; IndexError: the row is too
            # short. Anything else is a programming error and should surface.
            skipped_rows += 1
            continue
        X_tab.append(feats)
        y_tab.append(label)

if skipped_rows:
    print(f"Skipped malformed CSV rows: {skipped_rows}")
if not X_tab:
    raise ValueError(f"No usable rows could be read from {CSV_PATH}.")

# 4) Encode, split, scale, train
le_tab = LabelEncoder()
y_tab_enc = le_tab.fit_transform(y_tab)

Xtr_tab, Xte_tab, ytr_tab, yte_tab = train_test_split(
    X_tab, y_tab_enc, test_size=TAB_TEST_SIZE, stratify=y_tab_enc, random_state=SEED
)
# Scaler statistics come from the training split only (no test leakage).
scaler = StandardScaler()
Xtr_tab_s = scaler.fit_transform(Xtr_tab)
Xte_tab_s = scaler.transform(Xte_tab)

# `multi_class` is deprecated in recent scikit-learn releases; "auto" was its
# default, so omitting it keeps the same model without the FutureWarning.
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(Xtr_tab_s, ytr_tab)
yhat_tab = clf.predict(Xte_tab_s)

acc_tab = accuracy_score(yte_tab, yhat_tab)
print("\n=== TABULAR (CSV + Logistic Regression) ===")
print("Accuracy:", f"{acc_tab:.4f}")
print(classification_report(yte_tab, yhat_tab, target_names=list(le_tab.classes_)))

# ==========================================
# B) IMAGE: PNG spectrograms -> Keras CNN
# ==========================================
# 1) List PNG files and infer labels from parent folders (pure TF)
#    Pattern: images_original/<genre>/*.png
#    The listing is sorted because glob does not promise any ordering, and the
#    seeded train/val split below is only reproducible for a fixed file order.
png_paths = sorted(tf.io.gfile.glob(IMAGES_ROOT + "/*/*.png"))
if not png_paths:
    raise ValueError(f"No PNGs found under {IMAGES_ROOT}. Check path or file extensions.")

# Normalize separators and extract folder name as label
def parent_folder(path: str) -> str:
    """Return the name of the directory that directly contains `path`.

    Backslashes are treated as path separators, so Windows-style paths work
    too. Raises IndexError when the path has no directory component.
    """
    segments = path.replace("\\", "/").rsplit("/", 2)
    return segments[-2]

# Genre label of each PNG = name of its parent folder; ids follow the
# alphabetical order of the genre names.
labels_img_str = list(map(parent_folder, png_paths))
classes_img = sorted(set(labels_img_str))
cls2id_img = dict(zip(classes_img, range(len(classes_img))))
y_img_ids = [cls2id_img[genre] for genre in labels_img_str]

# The image branch is evaluated independently of the CSV branch, so its class
# list is not forced to match the CSV's.

# 2) Stratified train/val split over file positions
idx_all = [*range(len(png_paths))]
idx_tr, idx_va, y_tr_ids, y_va_ids = train_test_split(
    idx_all,
    y_img_ids,
    test_size=IMG_TEST_SIZE,
    random_state=SEED,
    stratify=y_img_ids,
)

paths_tr = list(map(png_paths.__getitem__, idx_tr))
paths_va = list(map(png_paths.__getitem__, idx_va))
ytr_img  = tf.constant(y_tr_ids, dtype=tf.int32)
yva_img  = tf.constant(y_va_ids, dtype=tf.int32)

# Image-branch hyperparameters
IMG_SIZE = (128, 128)
BATCH    = 32
EPOCHS   = 12

# 3) tf.data pipeline: read/decode PNG, resize, normalize to [0,1]
def load_png(path, label):
    """
    Read one PNG and return it as a float32 image scaled to [0, 1].

    Args:
        path: scalar string tensor with the file path.
        label: passed through unchanged so the function can be used in
            `Dataset.map` on (path, label) pairs.

    Returns:
        (image, label) where image is float32 with shape IMG_SIZE + (3,).
    """
    raw = tf.io.read_file(path)
    img = tf.image.decode_png(raw, channels=3)            # uint8, 3 channels
    # Convert BEFORE resizing: convert_image_dtype only rescales when the dtype
    # changes (uint8 -> float32 divides by 255). tf.image.resize already
    # returns float32 in the 0..255 range, so converting afterwards is a no-op
    # and would leave the pixels unnormalised.
    img = tf.image.convert_image_dtype(img, tf.float32)   # [0, 1]
    img = tf.image.resize(img, IMG_SIZE)
    return img, label

# Training pipeline: shuffle the (cheap) path/label pairs before decoding.
ds_tr = tf.data.Dataset.from_tensor_slices((paths_tr, ytr_img)) \
        .shuffle(len(paths_tr), seed=SEED) \
        .map(load_png, num_parallel_calls=tf.data.AUTOTUNE) \
        .batch(BATCH) \
        .prefetch(tf.data.AUTOTUNE)

# Validation pipeline: no shuffle, so predictions stay aligned with labels.
ds_va = tf.data.Dataset.from_tensor_slices((paths_va, yva_img)) \
        .map(load_png, num_parallel_calls=tf.data.AUTOTUNE) \
        .batch(BATCH) \
        .prefetch(tf.data.AUTOTUNE)

# 4) Simple CNN for image classification
num_classes_img = len(classes_img)
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=IMG_SIZE + (3,)),
    tf.keras.layers.Conv2D(32, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation="relu"), tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(num_classes_img, activation="softmax"),
])
# Labels are integer ids, hence the sparse variant of the loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

history = model.fit(ds_tr, validation_data=ds_va, epochs=EPOCHS, verbose=1)
# Reported figure is the validation accuracy of the final epoch.
acc_img = history.history["val_accuracy"][-1]

# Confusion matrix (image branch)
y_pred_img = tf.argmax(model.predict(ds_va, verbose=0), axis=1, output_type=tf.int32)
y_true_img = tf.concat([y for _, y in ds_va], axis=0)
cm_img = tf.math.confusion_matrix(y_true_img, y_pred_img, num_classes=num_classes_img)

print("\n=== IMAGE (PNG spectrograms + CNN) ===")
print("Validation accuracy:", f"{acc_img:.4f}")
print("Confusion matrix (image):")
# summarize=-1 prints the whole matrix; the default elides the middle rows and
# columns with "...".
tf.print(cm_img, summarize=-1)

# =======================
# C) Side-by-side summary
# =======================
print("\n=== Side-by-side ===")
print(f"Tabular (CSV + LR)   Acc: {acc_tab:.4f}  (classes: {len(le_tab.classes_)})")
print(f"Image   (PNG + CNN)  Acc: {acc_img:.4f}  (classes: {num_classes_img})")





=== TABULAR (CSV + Logistic Regression) ===
Accuracy: 0.6700
              precision    recall  f1-score   support

       blues       0.72      0.65      0.68        20
   classical       0.90      0.95      0.93        20
     country       0.55      0.60      0.57        20
       disco       0.60      0.45      0.51        20
      hiphop       0.50      0.60      0.55        20
        jazz       0.71      0.85      0.77        20
       metal       0.94      0.80      0.86        20
         pop       0.83      0.95      0.88        20
      reggae       0.52      0.55      0.54        20
        rock       0.40      0.30      0.34        20

    accuracy                           0.67       200
   macro avg       0.67      0.67      0.66       200
weighted avg       0.67      0.67      0.66       200

Epoch 1/12
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2s/step - accuracy: 0.1167 - loss: 8.9742 - val_accuracy: 0.1940 - val_loss: 2.2191
Epoch 2/12
[1m25/2

In [None]:
# ===== Transfer Learning on Mel-Spectrograms =====

# --- Settings ---
AUDIO_ROOT  = "/content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original"  # <-- set your real path
SR          = 22050        # sample rate requested from librosa.load
DUR         = 10           # seconds of audio loaded per clip
N_MELS      = 128          # mel bands in the spectrogram
HOP_LENGTH  = 512          # hop length of the mel-spectrogram frames
IMG_SIZE    = (224, 224)   # input size used for the pretrained backbone
BATCH       = 16
EPOCHS_HEAD = 5            # epochs with the backbone frozen
EPOCHS_FT   = 10           # epochs of fine-tuning
SEED        = 42

# --- 1) Discover audio files + labels ---
# Three extension filters are tried (None = librosa's default audio
# extensions); the filter that yields the most files wins, earliest on ties.
wav_paths = max(
    (find_files(AUDIO_ROOT, ext=ext_filter, recurse=True)
     for ext_filter in (["wav"], ["WAV"], None)),
    key=len,
)

print(f"Found {len(wav_paths)} audio files under: {AUDIO_ROOT}")
if not wav_paths:
    raise ValueError("No audio files found. Double-check AUDIO_ROOT or your Drive mount.")

# Parent folder name is the genre; ids follow alphabetical genre order.
labels = [audio_path.replace("\\", "/").split("/")[-2] for audio_path in wav_paths]
classes = sorted(set(labels))
cls2id  = dict(zip(classes, range(len(classes))))
y_ids   = [cls2id[genre] for genre in labels]

# -----------------------
# 2) Per-class 80/20 split, with a global fallback
# -----------------------
# Bucket sample positions by class id so every genre is split on its own.
idx_per_class = {cid: [] for cid in range(len(classes))}
for sample_pos, cid in enumerate(y_ids):
    idx_per_class[cid].append(sample_pos)

train_idx, val_idx = [], []
for cid, idxs in idx_per_class.items():
    if not idxs:
        continue
    idxs_tf = tf.random.shuffle(tf.constant(idxs, dtype=tf.int32), seed=SEED)
    n_in_class = tf.shape(idxs_tf)[0]
    # Validation share is floor(20%), capped at n-1 so that training always
    # keeps at least one sample of the class.
    k = tf.minimum(
        tf.cast(tf.math.floor(0.2 * tf.cast(n_in_class, tf.float32)), tf.int32),
        n_in_class - 1,
    )
    shuffled = idxs_tf.numpy().tolist()
    n_val = int(k)
    val_idx.extend(shuffled[:n_val])
    train_idx.extend(shuffled[n_val:])

# If the per-class pass produced no training samples, fall back to a plain
# shuffled 80/20 cut over all files.
if not train_idx:
    all_idx = tf.random.shuffle(tf.range(len(wav_paths)), seed=SEED).numpy().tolist()
    cut = max(1, int(0.8 * len(all_idx)))
    train_idx, val_idx = all_idx[:cut], all_idx[cut:]

# Materialise paths and labels for both subsets.
paths_tr, labels_tr = [wav_paths[j] for j in train_idx], [y_ids[j] for j in train_idx]
paths_va, labels_va = [wav_paths[j] for j in val_idx], [y_ids[j] for j in val_idx]

print(f"Train files: {len(paths_tr)} | Val files: {len(paths_va)}")
if not paths_tr:
    raise ValueError("Empty training set after split — verify AUDIO_ROOT and class folders.")

# -----------------------
# 3) WAV → Mel-spectrogram → 224x224x3
# -----------------------
def _wav_to_melspec_224x224x3(path_bytes):
    """
    Eager helper (run through tf.py_function): audio file -> 3-channel image.

    Args:
        path_bytes: scalar string tensor holding the UTF-8 encoded file path.

    Returns:
        float32 tensor of shape IMG_SIZE + (3,) with values in [0, 255],
        which is the pixel range MobileNetV2's `preprocess_input` expects
        (it maps [0, 255] to [-1, 1]). Files that cannot be decoded, or that
        decode to no samples, yield an all-zero image.
    """
    import warnings  # local import keeps this cell self-contained

    path = path_bytes.numpy().decode("utf-8")
    blank = tf.zeros(IMG_SIZE + (3,), dtype=tf.float32)
    try:
        # NOTE: librosa.load takes no `backend` keyword. Passing one raises
        # TypeError on every call, which the handler below used to swallow,
        # turning the whole dataset into blank images. librosa already falls
        # back to audioread on its own when soundfile cannot read a file.
        y, sr = librosa.load(path, sr=SR, mono=True, duration=DUR, res_type="kaiser_fast")
    except Exception as e:
        # Best effort: keep the pipeline running, but make the failure visible
        # so a systematic problem is not mistaken for a few bad files.
        warnings.warn(f"Unreadable audio, using blank spectrogram: {path} ({type(e).__name__}: {e})")
        return blank

    if y is None or len(y) == 0:
        return blank

    # Mel spectrogram in dB relative to its own peak (1.0 if the clip is silent)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
    peak = S.max()
    S_db = librosa.power_to_db(S, ref=peak if peak != 0 else 1.0)

    # Convert -> tensor [H, W, 1]
    t = tf.convert_to_tensor(S_db, dtype=tf.float32)[:, :, None]

    # Per-sample min-max normalisation to [0, 1]; a constant image maps to zeros
    t_min, t_max = tf.reduce_min(t), tf.reduce_max(t)
    t = tf.cond(t_max > t_min, lambda: (t - t_min) / (t_max - t_min), lambda: tf.zeros_like(t))

    # Resize, replicate to 3 channels, and rescale to the 0..255 pixel range.
    # Feeding [0, 1] into preprocess_input would squash every image into
    # roughly [-1, -0.99], leaving the backbone with an almost constant input.
    t = tf.image.resize(t, IMG_SIZE)
    t = tf.tile(t, [1, 1, 3]) * 255.0
    return t

def tf_wav_to_melspec_224x224x3(path):
    """Graph-compatible wrapper around `_wav_to_melspec_224x224x3`.

    The Python helper is executed via tf.py_function, which drops static shape
    information, so the known output shape is re-attached here; batching and
    the model need it.
    """
    out_shape = IMG_SIZE + (3,)
    mel_img = tf.py_function(func=_wav_to_melspec_224x224x3, inp=[path], Tout=tf.float32)
    mel_img.set_shape(out_shape)
    return mel_img

# -----------------------
# 4) tf.data pipelines (from Python lists; known lengths)
# -----------------------
def make_ds(paths_list, labels_list, training=True):
    """Build a batched, prefetched tf.data pipeline of (image, label) pairs.

    Args:
        paths_list: Python list of audio file paths.
        labels_list: Python list of integer class ids, aligned with paths_list.
        training: when True the samples are shuffled (before decoding) and
            random flip / contrast / zoom augmentation is applied per sample.

    Returns:
        A tf.data.Dataset yielding batches of size BATCH.
    """
    def _decode(p, y):
        # path -> mel-spectrogram image; the label is passed through
        return tf_wav_to_melspec_224x224x3(p), y

    ds = tf.data.Dataset.from_tensor_slices((paths_list, labels_list))
    if training:
        # Buffer spans the whole list; max(1, ...) keeps the size valid for an empty list.
        ds = ds.shuffle(buffer_size=max(1, len(paths_list)), seed=SEED)
    ds = ds.map(_decode, num_parallel_calls=tf.data.AUTOTUNE)

    if training:
        aug = tf.keras.Sequential([
            tf.keras.layers.RandomFlip("horizontal"),  # time-axis flip
            tf.keras.layers.RandomContrast(0.1),
            tf.keras.layers.RandomZoom(0.1),
        ])

        def _augment(x, y):
            return aug(x, training=True), y

        ds = ds.map(_augment, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.batch(BATCH).prefetch(tf.data.AUTOTUNE)

# Build both pipelines; only the training one shuffles and augments.
train_ds = make_ds(paths_tr, labels_tr, training=True)
val_ds = make_ds(paths_va, labels_va, training=False)

# Number of batches needed to cover each split once (never below 1).
steps_per_epoch = max(1, math.ceil(len(paths_tr) / BATCH))
validation_steps = max(1, math.ceil(len(paths_va) / BATCH))

# -----------------------
# 5) Transfer learning model (MobileNetV2)
# -----------------------
# ImageNet-pretrained backbone without its classifier, frozen while the new
# head is trained.
base = MobileNetV2(input_shape=IMG_SIZE + (3,), include_top=False, weights="imagenet")
base.trainable = False

# MobileNetV2's preprocess_input rescales pixel values from [0, 255] to [-1, 1].
preproc = tf.keras.layers.Lambda(preprocess_input)

model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=IMG_SIZE + (3,)),
        preproc,
        base,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(len(classes), activation="softmax"),
    ]
)

# Labels are integer ids, hence the sparse variant of the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

# -----------------------
# 6) Train classifier head
# -----------------------
# The backbone is frozen at this point, so only the new head is updated.
hist_head = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS_HEAD,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    verbose=1
)
print("Val acc after head training:", hist_head.history["val_accuracy"][-1])

# -----------------------
# 7) Fine-tune last blocks
# -----------------------
# Unfreeze from the FIRST layer of block_13 (≈ last third), or fall back to the
# 70% boundary if no such layer exists. Searching forward matters: scanning the
# reversed list finds the last block_13 layer, which would start unfreezing
# only at block_14.
fine_tune_from = next(
    (i for i, layer in enumerate(base.layers) if "block_13" in layer.name),
    None,
)
if fine_tune_from is None:
    fine_tune_from = int(0.7 * len(base.layers))

# Re-enable the backbone as a whole first. It was frozen with
# `base.trainable = False`, and depending on the Keras version a frozen parent
# can hide its children's weights from the optimizer even if the children are
# marked trainable. Then freeze everything below the boundary again.
base.trainable = True
for i, layer in enumerate(base.layers):
    layer.trainable = (i >= fine_tune_from)

# Recompile so the changed trainable flags take effect; the lower learning
# rate protects the pretrained weights from large updates.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

hist_ft = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS_FT,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    verbose=1
)
print("Val acc after fine-tuning:", hist_ft.history["val_accuracy"][-1])

# -----------------------
# 8) Evaluation: confusion matrix & per-class accuracy
# -----------------------
# val_ds is not shuffled, so labels collected here line up with the
# predictions produced by the second pass below.
y_true = tf.concat([y for _, y in val_ds], axis=0)
y_prob = model.predict(val_ds, verbose=0, steps=validation_steps)
y_pred = tf.argmax(y_prob, axis=1, output_type=tf.int32)

# Rows = true class, columns = predicted class.
cm = tf.math.confusion_matrix(y_true, y_pred, num_classes=len(classes))
print("Confusion matrix:")
# summarize=-1 prints the whole matrix; the default elides the middle rows and
# columns with "...", which hides most of the per-genre confusions.
tf.print(cm, summarize=-1)

# Per-class accuracy = diagonal / row total; divide_no_nan yields 0 for a
# class that has no validation samples instead of NaN.
correct = tf.linalg.diag_part(cm)
totals  = tf.reduce_sum(cm, axis=1)
per_class_acc = tf.math.divide_no_nan(tf.cast(correct, tf.float32),
                                      tf.cast(totals, tf.float32))
print("\nPer-class accuracy:")
for i, acc in enumerate(tf.unstack(per_class_acc)):
    print(f"{classes[i]:<10s}: {acc.numpy():.3f}")


Found 1000 audio files under: /content/drive/MyDrive/Elevvo Internship/Task 6/gtzan/Data/genres_original
Train files: 800 | Val files: 200


Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 928ms/step - accuracy: 0.1127 - loss: 2.6617 - val_accuracy: 0.1000 - val_loss: 2.3775
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 860ms/step - accuracy: 0.0876 - loss: 2.4007 - val_accuracy: 0.1000 - val_loss: 2.3214
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 878ms/step - accuracy: 0.0985 - loss: 2.3423 - val_accuracy: 0.1000 - val_loss: 2.3044
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 930ms/step - accuracy: 0.0834 - loss: 2.3034 - val_accuracy: 0.1000 - val_loss: 2.3036
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 883ms/step - accuracy: 0.0982 - loss: 2.3151 - val_accuracy: 0.1000 - val_loss: 2.3058
Val acc after head training: 0.10000000149011612
Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 1s/step - accuracy: 0.1109 - loss: 2.3147 - val_accuracy: 0.1000



Confusion matrix:
[[20 0 0 ... 0 0 0]
 [20 0 0 ... 0 0 0]
 [20 0 0 ... 0 0 0]
 ...
 [20 0 0 ... 0 0 0]
 [20 0 0 ... 0 0 0]
 [20 0 0 ... 0 0 0]]

Per-class accuracy:
blues     : 1.000
classical : 0.000
country   : 0.000
disco     : 0.000
hiphop    : 0.000
jazz      : 0.000
metal     : 0.000
pop       : 0.000
reggae    : 0.000
rock      : 0.000
