# Method 1 Improved: Dense Neural Network with Correct Data Split

**Improvements over original LSTM:**
- **Architecture**: Replaced LSTM with a Dense Neural Network (MLP). LSTM is designed for sequential data, but the CSV features (mean/variance) are tabular summary statistics, not a time sequence. An MLP is more appropriate and effective for this data type.
- **Data Splitting**: Implemented `GroupShuffleSplit` based on Song ID. The original method split randomly, allowing 3-second segments of the *same song* to appear in both Training and Validation sets (Data Leakage). This ensures we evaluate true generalization to new unseen songs.
- **Evaluation**: We evaluate on a strictly held-out test set of songs, using both segment-level accuracy and **Song-Level Majority Voting** (aggregating predictions for a song to get the final label).

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Pin the TensorFlow and NumPy random seeds so repeated runs are comparable
tf.random.set_seed(42)
np.random.seed(42)

print(f'TensorFlow version: {tf.__version__}')

## 1. Load and Prepare Data
We use the 3-second features dataset. We parse the filename to get the Song ID for grouping.

In [None]:
# Read the 3-second segment feature table
df = pd.read_csv('../data/gtzan/features_3_sec.csv')

# Filenames look like 'genre.00000.0.wav'. The first two dot-separated
# fields (e.g. 'blues.00000') identify the song, so every segment cut from
# the same song receives the same song_id and can be grouped later.
df['song_id'] = ['.'.join(fname.split('.')[:2]) for fname in df['filename']]

print(f"Total segments: {len(df)}")
print(f"Total unique songs: {df['song_id'].nunique()}")
print(f"Genres: {df['label'].unique()}")

## 2. Proper Train/Val/Test Split
We split by **Song ID** so all segments of a song stay together. This prevents the model from "memorizing" a song from one segment and recognizing it in the validation set.

In [None]:
# Feature matrix = every column except the identifiers, the clip length
# and the target; the song IDs are kept aside as the grouping key.
X = df.drop(['filename', 'length', 'label', 'song_id'], axis=1)
y = df['label']
groups = df['song_id']

# Genre names -> integer codes -> one-hot rows (for categorical cross-entropy)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)
y_cat = to_categorical(y_encoded, num_classes=num_classes)

# Target: roughly 70% train / 15% val / 15% test, with the proportions
# applied to songs (groups) rather than to individual segments.
# Stage 1: hold out 30% of the songs as a temporary pool.
splitter_outer = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, temp_idx = next(splitter_outer.split(X, y, groups=groups))

X_train = X.iloc[train_idx]
y_train = y_cat[train_idx]
X_temp = X.iloc[temp_idx]
y_temp = y_cat[temp_idx]
groups_temp = groups.iloc[temp_idx]

# Stage 2: cut the held-out pool in half (again by song) into val and test.
splitter_inner = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(splitter_inner.split(X_temp, y_temp, groups=groups_temp))

X_val = X_temp.iloc[val_idx]
y_val = y_temp[val_idx]
X_test = X_temp.iloc[test_idx]
y_test = y_temp[test_idx]
# Song IDs of the test rows, kept for song-level aggregation later
groups_test = groups_temp.iloc[test_idx]

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

## 3. Standardization
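Standardize the features using statistics (mean and standard deviation) computed on the training set only; the same fitted scaler is then applied to the validation and test sets.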

In [None]:
# Fit the scaler on the training rows only, then apply that one fitted
# transform to each split so val/test statistics never influence scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

## 4. Build Dense Network (MLP)
We use a deep dense network with Dropout and Batch Normalization for regularization.

In [None]:
def build_model(input_shape, num_classes, hidden_units=(512, 256, 128, 64),
                dropout_rate=0.3, learning_rate=0.001):
    """Build and compile the dense (MLP) classifier.

    The network is a stack of Dense -> BatchNormalization -> ReLU -> Dropout
    blocks, one per entry in ``hidden_units``, followed by a softmax layer.
    With the default arguments the architecture and optimizer settings are
    the same as the previously hard-coded 512/256/128/64 network.

    Args:
        input_shape: Shape tuple of a single sample, e.g. ``(n_features,)``.
        num_classes: Number of output classes (width of the softmax layer).
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rate: Dropout rate applied at the end of every hidden block.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model that uses categorical cross-entropy
        loss and tracks accuracy.
    """
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` to the first Dense layer; newer Keras releases warn
    # against the latter for Sequential models.
    layers = [tf.keras.Input(shape=input_shape)]

    # One regularized block per hidden width.
    for units in hidden_units:
        layers.extend([
            Dense(units),
            BatchNormalization(),
            Activation('relu'),
            Dropout(dropout_rate),
        ])

    # Output layer: one probability per class.
    layers.append(Dense(num_classes, activation='softmax'))

    model = Sequential(layers)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Instantiate the network for the training feature width and list its layers
model = build_model(input_shape=(X_train.shape[1],), num_classes=num_classes)
model.summary()

## 5. Training

In [None]:
# Both callbacks watch validation loss:
#  - EarlyStopping ends training after 15 epochs without improvement and
#    restores the best weights seen.
#  - ReduceLROnPlateau halves the learning rate after 5 stalled epochs,
#    never going below 1e-6.
callbacks = []
callbacks.append(
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
)
callbacks.append(
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
)

history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=1,
    callbacks=callbacks,
    validation_data=(X_val_scaled, y_val),
)

## 6. Segment-Level Evaluation
Evaluation on 3-second segments.

In [None]:
# Training curves: accuracy (left) and loss (right), train vs. validation
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Val')
plt.title('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Val')
plt.title('Loss')
plt.legend()
plt.show()

# Evaluate on the held-out test set
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nTest Accuracy (on 3-second segments): {test_acc*100:.2f}%")

# Per-segment class indices: argmax over the predicted probabilities and
# over the one-hot ground truth.
y_pred_prob = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred_prob, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print("\nSegment-Level Classification Report:")
# Pass `labels` explicitly. Without it, scikit-learn infers the label set
# from y_true/y_pred and raises ValueError when that set is smaller than
# `target_names` (i.e. a genre is absent from the held-out split).
print(classification_report(
    y_true_classes,
    y_pred_classes,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

## 7. Song-Level Evaluation (Majority Voting)
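We aggregate the predictions for all segments of a song and take the majority class as the song's label. Because the end goal is to classify whole songs rather than isolated 3-second clips, song-level accuracy is the more meaningful metric.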

In [None]:
# Create a DataFrame with true labels, predicted labels, and song IDs
results_df = pd.DataFrame({
    'song_id': groups_test.values,
    'true_label': y_true_classes,
    'pred_label': y_pred_classes
})

# Aggregate by song_id using mode (majority vote)
# If multiple modes, it picks the first one (acceptable)
song_results = results_df.groupby('song_id').agg(lambda x: x.mode()[0])

song_acc = accuracy_score(song_results['true_label'], song_results['pred_label'])
print(f"\nSong-Level Accuracy: {song_acc*100:.2f}%")

print("\nSong-Level Classification Report:")
print(classification_report(song_results['true_label'], song_results['pred_label'], target_names=label_encoder.classes_))