In [24]:
# Debug helper: find leakage and verify real generalization
# Installs a pinned scikit-learn plus librosa into the running kernel. The
# leading "!" is an IPython shell escape, so this line only works in a notebook.
# NOTE(review): the output recorded for this cell is a
# "numpy.dtype size changed, may indicate binary incompatibility" ValueError.
# Presumably this install swaps compiled packages underneath a numpy that is
# already loaded; try restarting the runtime after installing, or drop the
# ==1.2.2 pin. TODO confirm which package actually triggers the error.
!pip install -q scikit-learn==1.2.2 librosa

import os, sys
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import NearestNeighbors

# Location of the exported feature table; change if needed.
CSV_PATH = '/content/music_genre_features.csv'
print("Loading:", CSV_PATH)
df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
display(df.head())

# ---- locate the serialized feature column ----
# The first non-target column that contains at least one cell with a comma
# is taken to hold the comma-separated feature vector.
target_col = 'genre'
comma_columns = (
    name
    for name in df.columns
    if name != target_col and df[name].astype(str).str.contains(',').any()
)
feature_col = next(comma_columns, None)
if feature_col is None:
    raise ValueError("No comma-separated feature column found. Check CSV format.")
print("Using feature column:", feature_col)

# ---- expand feature vectors ----
def parse_vector(s):
    """Convert a comma-separated string of numbers into a 1-D float array.

    The argument is passed through ``str()`` first. Tokens that are empty or
    whitespace-only (for example from a trailing or doubled comma) are
    skipped; any other token that ``float()`` cannot parse raises
    ``ValueError``.
    """
    tokens = str(s).split(',')
    numbers = [float(tok) for tok in tokens if tok.strip()]
    return np.array(numbers)

# Parse each row's serialized vector, then stack the per-row arrays into one
# 2-D matrix (np.vstack raises if the rows do not all have the same length).
vectors = df[feature_col].apply(parse_vector)
X = np.vstack(vectors.to_numpy())
print("Feature matrix shape:", X.shape)

# ---- labels ----
# Genre values are cast to str and mapped to integer codes by LabelEncoder.
genre_names = df[target_col].astype(str)
le = LabelEncoder().fit(genre_names)
y_int = le.transform(genre_names)
print("Classes:", list(le.classes_))
print("Class counts:", Counter(y_int))

# ---- 1) Duplicate/sample-leak check ----
# Hash each feature vector (rounded to reduce float noise) and look for
# duplicates. Python's hash() can collide in principle, so a reported group is
# a strong hint rather than proof; inspect the displayed vectors to confirm.
round_dec = 6
row_hashes = [hash(tuple(np.round(row, round_dec))) for row in X]
df['_row_hash'] = row_hashes
# keep=False marks every member of a duplicated group, not just the repeats.
dup_groups = df[df.duplicated('_row_hash', keep=False)].sort_values('_row_hash')
print("\nTotal duplicated feature-hash rows (equal vectors up to rounding):", dup_groups.shape[0])
if not dup_groups.empty:
    print("Showing up to 20 duplicate groups (hash + genres):")
    display(dup_groups[[feature_col, target_col, '_row_hash']].head(20))

    # If duplicates exist across different genres -> strong problem.
    # Every duplicate group is inspected (not only a leading sample of the
    # hashes), so a conflicting group cannot be missed. dropna=False makes a
    # missing genre count as its own distinct value.
    genres_per_hash = (
        dup_groups.groupby('_row_hash', sort=False)[target_col].nunique(dropna=False)
    )
    conflicting_hashes = genres_per_hash[genres_per_hash > 1].index
    if len(conflicting_hashes) > 0:
        # Show one example group; dup_groups is sorted, so this is the
        # conflicting group with the smallest hash.
        h = conflicting_hashes[0]
        print(f"Hash {h} maps to multiple genres -> LEAK! Example:")
        display(dup_groups.loc[dup_groups['_row_hash'] == h, [feature_col, target_col]].head())

# ---- 2) constant features ----
# A column is constant exactly when its minimum equals its maximum. Testing
# std == 0 is unreliable for floats: rounding in the computed mean can leave a
# tiny non-zero std for a constant column (e.g. a column of repeated 0.1), so
# such columns would be missed.
stds = X.std(axis=0)  # kept for inspection; no longer used for detection
const_idx = np.where(X.max(axis=0) == X.min(axis=0))[0]
print("\nConstant feature indices (std==0):", const_idx.tolist())
if len(const_idx) > 0:
    print("Consider removing constant features; they cannot help generalization.")

# ---- 3) Features that strongly identify a single class (suspicious) ----
# Discretize each feature by rounding, then look for a discrete value that is
# frequent and only ever appears with one label. At most one such value is
# recorded per feature (the first one encountered in row order).
suspicious = []
min_support = max(3, int(0.005 * X.shape[0]))  # at least 3 occurrences or 0.5% of data
for j in range(X.shape[1]):
    vals = np.round(X[:, j], 4)   # discretize
    labels_by_value = {}
    support_by_value = Counter()
    # Single pass: collect the label set and the frequency of every discrete
    # value together, instead of rescanning the column once per value.
    for v, lab in zip(vals, y_int):
        labels_by_value.setdefault(v, set()).add(lab)
        support_by_value[v] += 1
    for v, labs in labels_by_value.items():
        support = support_by_value[v]
        if len(labs) == 1 and support >= min_support:
            # Store plain Python scalars so the printed tuples stay readable
            # regardless of how the installed numpy formats its scalar types.
            suspicious.append((j, float(v), support, int(next(iter(labs)))))
            break
# Print suspicious features (index, value, frequency, label)
print("\nSuspicious features that have a discrete value mapping only to one label (index, value, freq, label):")
for t in suspicious[:20]:
    print(t)
if not suspicious:
    print("None found at the chosen discretization/support thresholds.")

# ---- 4) Mutual information ranking (continuous) ----
# Every feature is treated as continuous; the seed makes the estimate repeatable.
print("\nComputing mutual information (this may take a moment)...")
mi = mutual_info_classif(X, y_int, discrete_features=False, random_state=42)
# Feature indices ordered from highest to lowest score.
mi_idx = np.argsort(mi)[::-1]
print("Top features by mutual information (index: score):")
top_features = mi_idx[:10]
for feat, score in zip(top_features, mi[top_features]):
    print(feat, f"{score:.4f}")

# ---- 5) Proper split BEFORE scaling (important) ----
# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(
    X, y_int, test_size=0.2, random_state=42, stratify=y_int
)
X_train_raw, X_test_raw, y_train_int, y_test_int = split_parts
print("\nSplit sizes (raw):", X_train_raw.shape, X_test_raw.shape)

# The scaler learns its statistics from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ---- 6) Simple baseline models to detect "too-good-to-be-true" ----
def _evaluate_baseline(name, model):
    """Fit *model* on the scaled train split and report hold-out metrics.

    Prints the test accuracy and a per-class classification report, then
    returns ``(predictions, accuracy)`` for the test split.
    """
    print(f"\nTraining {name} baseline...")
    model.fit(X_train, y_train_int)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test_int, preds)
    print(f"{name} test accuracy:", acc)
    print(classification_report(y_test_int, preds, target_names=le.classes_))
    return preds, acc


lr = LogisticRegression(max_iter=2000)
pred_lr, acc_lr = _evaluate_baseline("LogisticRegression", lr)

# Seeded like every other randomized step in this script; an unseeded tree
# can report a different accuracy on every run, which makes a debugging
# session hard to compare.
dt = DecisionTreeClassifier(random_state=42)
pred_dt, acc_dt = _evaluate_baseline("DecisionTree", dt)

# ---- 7) Cross-validation (stratified) for more robust estimate ----
# Local import: only this section needs a pipeline.
from sklearn.pipeline import make_pipeline

print("\n5-fold Stratified CV (LogisticRegression):")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Scaling lives inside the pipeline so it is re-fitted on the training part of
# each fold. Feeding pre-scaled data (scaled with statistics that include rows
# later used for validation) would let information leak into held-out folds.
cv_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000))
cv_scores = cross_val_score(cv_model, X, y_int, cv=skf, scoring='accuracy')
print("CV accuracies:", cv_scores)
print("CV mean accuracy:", cv_scores.mean())

# ---- 8) Nearest neighbor check helper (useful to compare an uploaded song) ----
# Single-neighbor index built over the scaled training rows.
nn = NearestNeighbors(n_neighbors=1)
nn.fit(X_train)


def nn_check(vec):
    """Find the closest training sample to a raw (unscaled) feature vector.

    The vector is reshaped to a single row and scaled with the fitted
    ``scaler`` before the lookup. Returns ``(distance, train_index, label)``,
    where ``train_index`` indexes into the training split (``X_train`` /
    ``y_train_int``) and ``label`` is the integer-encoded genre.
    """
    query = scaler.transform(np.array(vec).reshape(1, -1))
    distances, indices = nn.kneighbors(query)
    nearest = indices[0][0]
    return float(distances[0][0]), nearest, y_train_int[nearest]


print("\nNearest-neighbor helper ready: call nn_check(your_vector) after you compute features for an uploaded song.")
print("It will return (distance, train_index, train_genre_int). You can then inspect the train sample and its label.")

# ---- Summary suggestions ----
print("\n=== QUICK SUGGESTIONS BASED ON ABOVE ===")

# Each tip is paired with the finding that triggers it; only triggered tips
# are printed, in this order.
conditional_tips = [
    (len(dup_groups) > 0,
     "- Found duplicate feature vectors. Make sure the same song (or same feature vector) is NOT in both train and test."),
    (len(const_idx) > 0,
     "- Remove constant features (they shouldn't cause 100% but are useless)."),
    (bool(suspicious),
     "- Found features with discrete values tied only to one label: investigate those columns (they may contain encoded label or id)."),
    (any(acc == 1.0 for acc in (acc_lr, acc_dt)),
     "- Baseline models achieve perfect accuracy too -> very likely leakage or dataset trivially separable (check for label encoding inside features)."),
]
for triggered, tip in conditional_tips:
    if triggered:
        print(tip)

# General advice that is printed regardless of the findings above.
general_tips = (
    "- Always split BEFORE fitting scalers or any preprocessing that uses statistics of the whole dataset.",
    "- Check that the feature extraction code you use for uploaded songs is IDENTICAL to how the CSV was created.",
    "- If your dataset contains multiple frames/segments from the same underlying song, use a grouped split (group by song id) so samples from the same song don't leak between train/test.",
)
for tip in general_tips:
    print(tip)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject