# Method 1: LSTM with Mel Spectrogram Preprocessing

**Uses the same preprocessing as the CNN methods:**
- Extract mel spectrograms directly from audio files
- GroupShuffleSplit by song ID
- Bidirectional LSTM architecture

In [None]:
import numpy as np
import librosa
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (LSTM, Bidirectional, Dense, Dropout, 
                                     BatchNormalization, GlobalAveragePooling1D)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
# Silence all Python warnings for the rest of the session. This keeps cell
# output compact, but it also hides deprecation notices.
warnings.filterwarnings('ignore')

# Seed TensorFlow and NumPy with the same fixed value so repeated runs draw
# the same random numbers from these two generators.
tf.random.set_seed(42)
np.random.seed(42)

# Record the framework version in the notebook output.
print(f'TensorFlow: {tf.__version__}')

## 1. Configuration

In [None]:
# Root folder of the dataset; it is joined with each genre name to find the
# per-genre directory of .wav files.
DATA_PATH = '../data/gtzan/genres_original'

# Audio loading: sampling rate (Hz) and clip length (seconds) passed to
# librosa.load.
SAMPLE_RATE = 22050
DURATION = 30

# Mel spectrogram settings passed to librosa.feature.melspectrogram.
N_MELS = 128
N_FFT = 2048
HOP_LENGTH = 512

# Class names; each one is also the name of a sub-directory of DATA_PATH.
GENRES = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
NUM_CLASSES = len(GENRES)

## 2. Feature Extraction (Same as CNN methods)

In [None]:
def extract_melspec(audio, sr):
    """
    Compute a dB-scaled mel spectrogram laid out for a sequence model.

    Uses the module-level N_MELS, N_FFT and HOP_LENGTH settings (the same
    preprocessing as the CNN methods).

    Args:
        audio: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``audio``.

    Returns:
        Array of shape (time_frames, n_mels). Values are in dB relative to
        the spectrogram's own maximum (``ref=np.max``).
    """
    mel_settings = dict(n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH)
    power_spec = librosa.feature.melspectrogram(y=audio, sr=sr, **mel_settings)

    # librosa yields (n_mels, time); the LSTM consumes (time, n_mels).
    return librosa.power_to_db(power_spec, ref=np.max).T

# Smoke test: run the extractor on DURATION seconds of white noise to confirm
# the output layout before touching the real dataset. The length comes from
# the config constants so it stays in sync with what load_dataset produces.
test_audio = np.random.randn(SAMPLE_RATE * DURATION)
test_features = extract_melspec(test_audio, SAMPLE_RATE)
print(f"Feature shape: {test_features.shape}")  # (time_frames, N_MELS)

## 3. Load Dataset

In [None]:
def load_dataset(data_path, target_length=1291):
    """
    Load GTZAN clips and convert each one to a fixed-size mel spectrogram.

    For every genre in GENRES, reads the ``.wav`` files under
    ``data_path/<genre>`` at SAMPLE_RATE for at most DURATION seconds,
    zero-pads short audio to the full duration, and runs extract_melspec.
    Files that fail to load or process are reported and skipped.

    Args:
        data_path: Directory containing one sub-directory per genre.
        target_length: Number of time frames every spectrogram is padded
            or truncated to.

    Returns:
        Tuple ``(X, y, song_ids)`` of NumPy arrays: spectrograms of shape
        (n_clips, target_length, n_mels), genre names, and
        ``"<genre>.<second filename field>"`` identifiers for group-aware
        splitting.
    """
    X, y, song_ids = [], [], []

    # Loop-invariant: number of samples in a full-length clip.
    target_samples = SAMPLE_RATE * DURATION

    for genre in GENRES:
        genre_path = os.path.join(data_path, genre)
        if not os.path.exists(genre_path):
            # Keep the best-effort behaviour, but make the gap visible: a
            # missing folder silently removes a whole class from the data.
            print(f"Warning: genre directory not found, skipping: {genre_path}")
            continue

        files = sorted(f for f in os.listdir(genre_path) if f.endswith('.wav'))

        for filename in tqdm(files, desc=f"{genre}"):
            # This clip is deliberately excluded.
            # NOTE(review): presumably the known-corrupt GTZAN file - confirm.
            if 'jazz.00054' in filename:
                continue

            filepath = os.path.join(genre_path, filename)
            song_id = f"{genre}.{filename.split('.')[1]}"

            try:
                audio, sr = librosa.load(filepath, sr=SAMPLE_RATE, duration=DURATION)

                # Pad short audio with silence up to the exact duration.
                if len(audio) < target_samples:
                    audio = np.pad(audio, (0, target_samples - len(audio)))

                melspec = extract_melspec(audio, sr)

                # Force a fixed number of time frames.
                n_frames = melspec.shape[0]
                if n_frames < target_length:
                    # extract_melspec scales with ref=np.max, so 0 dB is the
                    # clip's peak; zero-padding would append full-scale
                    # frames. Pad with the clip's floor value instead so the
                    # tail reads as silence.
                    melspec = np.pad(
                        melspec,
                        ((0, target_length - n_frames), (0, 0)),
                        constant_values=melspec.min(),
                    )
                else:
                    melspec = melspec[:target_length, :]

                X.append(melspec)
                y.append(genre)
                song_ids.append(song_id)

            except Exception as e:
                # Best effort: report the bad file and carry on with the rest.
                print(f"Error: {filename}: {e}")

    return np.array(X), np.array(y), np.array(song_ids)

print("Loading dataset...")
X, y, song_ids = load_dataset(DATA_PATH)

# load_dataset skips missing folders and unreadable files, so a wrong
# DATA_PATH would otherwise only surface later as an obscure error in the
# train/test split. Fail here with a message that names the real cause.
if len(X) == 0:
    raise FileNotFoundError(f"No audio clips could be loaded from {DATA_PATH!r}")

print(f"\nDataset shape: {X.shape}")
print(f"Unique songs: {len(np.unique(song_ids))}")

## 4. GroupShuffleSplit by Song ID

In [None]:
# Map genre names to integer ids, then to one-hot rows for the softmax output.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_cat = to_categorical(y_encoded, NUM_CLASSES)

# Stage 1: hold out 20% of the songs as the test set. Grouping by song_ids
# keeps all samples that share a song id on the same side of the split.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, song_ids))

X_train_full = X[train_idx]
y_train_full = y_cat[train_idx]
songs_train = song_ids[train_idx]
X_test = X[test_idx]
y_test = y_cat[test_idx]

# Stage 2: take 12.5% of the remaining songs for validation
# (0.125 * 0.80 = 0.10), giving roughly 70% train / 10% val / 20% test.
splitter_val = GroupShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
train_idx2, val_idx = next(
    splitter_val.split(X_train_full, y_train_full, songs_train)
)

X_train, y_train = X_train_full[train_idx2], y_train_full[train_idx2]
X_val, y_val = X_train_full[val_idx], y_train_full[val_idx]

print(f"Train: {X_train.shape}")
print(f"Val: {X_val.shape}")
print(f"Test: {X_test.shape}")

## 5. Normalization

In [None]:
# Standardize with one global mean/std computed on the training split only,
# so no validation or test statistics leak into preprocessing.
mean = X_train.mean()
std = X_train.std()

# The small epsilon guards against division by zero if std is 0.
X_train = (X_train - mean) / (std + 1e-8)
X_val = (X_val - mean) / (std + 1e-8)
X_test = (X_test - mean) / (std + 1e-8)

# Persist the statistics so inference can apply the same scaling. Create the
# output folder first: np.savez does not create missing directories and would
# otherwise fail only after the expensive dataset load.
os.makedirs('../models', exist_ok=True)
np.savez('../models/lstm_norm_params.npz', mean=mean, std=std)

print(f"LSTM input shape: {X_train.shape}")

## 6. Build LSTM Model

In [None]:
def build_lstm_model(input_shape, num_classes):
    """
    Build and compile the bidirectional LSTM classifier.

    Architecture: two stacked bidirectional LSTM layers (128 and 64 units,
    both returning full sequences), average pooling over the time axis, two
    dense layers (128 and 64 units) with dropout, and a softmax output.

    Args:
        input_shape: ``(time_steps, n_features)`` of one sample, without the
            batch dimension.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001,
        categorical cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        # Explicit input spec. Passing ``input_shape`` to the first layer is
        # deprecated in current Keras; an Input entry is the supported form.
        tf.keras.Input(shape=input_shape),

        # Bidirectional LSTM layers
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.3),

        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.3),

        # Global pooling over time collapses the sequence to one vector
        GlobalAveragePooling1D(),

        # Dense layers
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),

        Dense(64, activation='relu'),
        Dropout(0.3),

        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# One sample is a (time_steps, features) matrix - (1291, 128) with the default
# settings; the batch axis is not part of the model's input shape.
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_lstm_model(input_shape, NUM_CLASSES)
model.summary()

## 7. Training

In [None]:
# Stop once val_loss has not improved for 15 epochs and restore the best
# weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=15, restore_best_weights=True
)
# Halve the learning rate after 5 epochs without val_loss improvement,
# never going below 1e-6.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6
)
callbacks = [early_stopping, lr_on_plateau]

# 100 epochs is an upper bound; early stopping normally ends training sooner.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=100,
    verbose=1,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
)

## 8. Evaluation

In [None]:
# Plot training history: accuracy on the left, loss on the right, each with
# the training and validation curves.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, metric, title in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history[f'val_{metric}'], label='Val')
    ax.set_title(title)
    ax.legend()
plt.tight_layout()
plt.show()

# Evaluate on the held-out test split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_acc*100:.2f}%")

# Convert softmax outputs and one-hot targets back to integer class ids.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Pass the full label set explicitly. Without it, classification_report
# infers the labels from the data and raises ValueError when a class is
# absent from both the test targets and the predictions, because the count
# no longer matches target_names.
report_labels = list(range(len(label_encoder.classes_)))

print("\nClassification Report:")
print(classification_report(
    y_true_classes,
    y_pred_classes,
    labels=report_labels,
    target_names=label_encoder.classes_,
))

## 9. Save Model

In [None]:
# Make sure the output folder exists so this cell also works when run on its
# own; an existing folder is left untouched.
os.makedirs('../models', exist_ok=True)

model.save('../models/lstm_melspec.keras')
print("Model saved to ../models/lstm_melspec.keras")
# The normalization parameters themselves are written by the normalization
# cell; this line only reports where they live.
print("Normalization params saved to ../models/lstm_norm_params.npz")