# CNN + Attention: Spectrogram Images + CSV Features

**Highest Accuracy Approach**

This notebook combines two data sources and fuses them for classification:
1. **PNG Spectrograms** → CNN + CBAM Attention
2. **CSV Features (57 audio features)** → Dense Network
3. **Fusion** → Multi-Head Attention → Classification

**Target Performance:** 90-95% test accuracy (a goal, not a measured result — the actual score is printed in Section 12)

## 1. Imports

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras import layers, Model, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Seed Python's `random`, NumPy and TensorFlow with a single call.
# The previous np.random.seed / tf.random.set_seed pair left Python's own
# `random` module unseeded.
# NOTE(review): some Keras versions appear to derive default layer seeds
# (dropout, initialisers) from Python's `random`; confirm for the installed version.
tf.keras.utils.set_random_seed(42)

# Plot theme used by every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')

# Report the runtime so results can be tied to a TF version / device.
print(f"TensorFlow: {tf.__version__}")
print(f"GPU: {len(tf.config.list_physical_devices('GPU')) > 0}")

## 2. Configuration

In [None]:
# ==================== PATHS ====================
# Dataset root. Set the GTZAN_BASE_PATH environment variable to point at a
# different checkout; the fallback is the original hard-coded location.
BASE_PATH = os.environ.get(
    'GTZAN_BASE_PATH',
    '/Users/narac0503/GIT/GTZAN Dataset Classification/GTZAN-Dataset-Classification/gtzan-classification/data/gtzan',
)
IMAGE_PATH = os.path.join(BASE_PATH, 'images_original')
CSV_PATH = os.path.join(BASE_PATH, 'features_30_sec.csv')

# Fail-soft sanity check: shows immediately whether the paths resolve.
print(f"Images exist: {os.path.exists(IMAGE_PATH)}")
print(f"CSV exists: {os.path.exists(CSV_PATH)}")

# ==================== IMAGE SETTINGS ====================
# Size every spectrogram is resized to before entering the CNN.
TARGET_SIZE = (224, 224)  # Larger size for more detail

# ==================== MODEL ====================
# Genre names double as the sub-directory names under IMAGE_PATH.
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop',
          'jazz', 'metal', 'pop', 'reggae', 'rock']
# Derived from GENRES so the two can never disagree.
NUM_CLASSES = len(GENRES)

# ==================== HYPERPARAMETERS ====================
CNN_FILTERS = [64, 128, 256, 512]  # one conv stage per entry in the image branch
ATTENTION_HEADS = 8                # heads in the fusion MultiHeadAttention layer
DENSE_UNITS = 512                  # width of the first classifier layer
DROPOUT_RATE = 0.5                 # dropout after the first classifier layer
LEARNING_RATE = 0.001              # Adam learning rate
BATCH_SIZE = 32
EPOCHS = 150                       # upper bound; early stopping may end sooner

## 3. CBAM Attention Layer

In [None]:
class CBAM(layers.Layer):
    """Convolutional Block Attention Module.

    Re-weights a feature map in two steps: first per channel (from globally
    average- and max-pooled descriptors passed through a shared two-layer
    MLP), then per spatial position (from a 7x7 convolution over the
    channel-wise mean and max maps). The output has the same shape as the
    input.

    Args:
        reduction: Divisor applied to the channel count to size the hidden
            layer of the channel-attention MLP (at least one unit is kept).
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, reduction=16, **kwargs):
        super().__init__(**kwargs)
        self.reduction = reduction

    def build(self, input_shape):
        """Create the sub-layers once the channel count is known."""
        channels = input_shape[-1]
        # Shared MLP for channel attention (bottleneck -> back to `channels`).
        self.fc1 = layers.Dense(max(channels // self.reduction, 1), activation='relu')
        self.fc2 = layers.Dense(channels)
        # Single-filter 7x7 conv that produces the spatial attention map.
        self.conv_spatial = layers.Conv2D(1, 7, padding='same')

    def call(self, x):
        """Apply channel attention, then spatial attention, to ``x``."""
        # Channel attention: pool over the two middle (spatial) axes.
        avg_pool = tf.reduce_mean(x, axis=[1, 2])
        max_pool = tf.reduce_max(x, axis=[1, 2])

        # Both descriptors go through the same MLP weights.
        avg_out = self.fc2(self.fc1(avg_pool))
        max_out = self.fc2(self.fc1(max_pool))

        channel_attn = tf.nn.sigmoid(avg_out + max_out)
        # Reshape to (-1, 1, 1, C) so the gate broadcasts over the spatial axes.
        channel_attn = tf.reshape(channel_attn, [-1, 1, 1, tf.shape(x)[-1]])
        x = x * channel_attn

        # Spatial attention: summarise across channels, keep a size-1 last axis.
        avg_spatial = tf.reduce_mean(x, axis=-1, keepdims=True)
        max_spatial = tf.reduce_max(x, axis=-1, keepdims=True)
        concat = tf.concat([avg_spatial, max_spatial], axis=-1)
        spatial_attn = tf.nn.sigmoid(self.conv_spatial(concat))

        return x * spatial_attn

    def get_config(self):
        """Return the layer config including ``reduction``.

        The notebook saves the model in ``.keras`` format, which serialises
        each layer through ``get_config``; without this override the
        ``reduction`` argument is not part of the saved config.
        NOTE(review): some tf.keras releases raise NotImplementedError when a
        layer with extra ``__init__`` arguments lacks this method — confirm
        against the installed version.
        """
        config = super().get_config()
        config.update({'reduction': self.reduction})
        return config

print("CBAM defined.")

## 4. Load Images

In [None]:
def load_image(path, target_size=TARGET_SIZE):
    """Load one image as an RGB array scaled to [0, 1].

    Args:
        path: Path of the image file to read.
        target_size: Size passed to ``Image.resize``.

    Returns:
        The resized RGB image as a NumPy array divided by 255.0, or ``None``
        if the file could not be read (the failure is printed so skipped
        files are visible).
    """
    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the NumPy array.
        with Image.open(path) as src:
            rgb = src if src.mode == 'RGB' else src.convert('RGB')
            resized = rgb.resize(target_size)
            return np.array(resized) / 255.0
    except Exception as exc:
        # Best-effort loader: a bad file is skipped, not fatal. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate so a
        # long load can still be interrupted.
        print(f"Skipping unreadable image {path}: {exc}")
        return None

def load_images(image_path):
    """Read every genre's PNG spectrograms from ``image_path``.

    Each genre in ``GENRES`` is expected as a sub-directory; missing
    directories and images that ``load_image`` cannot read are skipped.

    Returns:
        A tuple ``(images, wav_names, genre_labels)``: the stacked image
        array, the CSV-style filename derived for each image, and the genre
        of each image, all in the same order.
    """
    images = []
    wav_names = []
    genre_labels = []

    print("Loading images...\n")
    for genre in GENRES:
        genre_dir = os.path.join(image_path, genre)
        if os.path.exists(genre_dir):
            png_files = sorted(name for name in os.listdir(genre_dir) if name.endswith('.png'))
            print(f"{genre}: {len(png_files)} images")

            for png_name in tqdm(png_files, desc=genre):
                pixels = load_image(os.path.join(genre_dir, png_name))
                if pixels is None:
                    continue

                # Derive the CSV key from the image name,
                # e.g. "pop00005.png" -> "pop.00005.wav".
                stem = png_name.replace('.png', '')
                track_id = stem[len(genre):].zfill(5)

                images.append(pixels)
                wav_names.append(f"{genre}.{track_id}.wav")
                genre_labels.append(genre)

    return np.array(images), wav_names, genre_labels


# Load the whole image set into memory and report what was found.
X_images, filenames, labels = load_images(image_path=IMAGE_PATH)
print(f"\nLoaded {len(X_images)} images")
print(f"Shape: {X_images.shape}")

## 5. Load CSV Features

In [None]:
# Load the per-track feature table.
df = pd.read_csv(CSV_PATH)
print(f"CSV shape: {df.shape}")
print(f"Columns: {list(df.columns[:10])}...")

# Every column except the identifier, clip length and target is a feature.
feature_cols = [c for c in df.columns if c not in ['filename', 'length', 'label']]
print(f"\nNumber of features: {len(feature_cols)}")

# Index the table by filename once so each image is matched with a hash
# lookup instead of scanning the whole 'filename' column per image.
# keep='first' mirrors the previous behaviour of using the first matching row.
features_by_file = (
    df.drop_duplicates(subset='filename', keep='first')
      .set_index('filename')[feature_cols]
)

# The former "try without leading zeros" fallback rebuilt the very same
# zero-padded name (int -> :05d), so it could never find a new match and
# could only fail on a non-numeric id; it has been removed.
is_matched = [fname in features_by_file.index for fname in filenames]
matched_indices = [i for i, found in enumerate(is_matched) if found]
matched_names = [filenames[i] for i in matched_indices]
unmatched_names = [fname for fname, found in zip(filenames, is_matched) if not found]

# Rows come back in image order, keeping X_csv aligned with X_images.
X_csv = features_by_file.loc[matched_names].to_numpy()
print(f"\nMatched {len(X_csv)} samples with CSV features")
if unmatched_names:
    print(f"Dropped {len(unmatched_names)} images without CSV features, e.g. {unmatched_names[:5]}")

# Filter images, labels and filenames to matched only. Filtering `filenames`
# too keeps all three aligned, so re-running this cell stays consistent.
X_images = X_images[matched_indices]
labels = [labels[i] for i in matched_indices]
filenames = matched_names

print(f"Final images: {X_images.shape}")
print(f"Final CSV: {X_csv.shape}")

## 6. Preprocessing

In [None]:
# Map genre names to integer ids, then to one-hot rows for the softmax head.
le = LabelEncoder().fit(labels)
y_encoded = le.transform(labels)
y_onehot = to_categorical(y_encoded, num_classes=NUM_CLASSES)

print(f"Labels: {y_onehot.shape}")

## 7. Train/Val/Test Split

In [None]:
# Split on sample positions so images, CSV features and labels stay aligned.
indices = np.arange(len(X_images))

# First carve off the test set, stratified by genre.
idx_temp, idx_test = train_test_split(
    indices,
    test_size=0.1,
    stratify=y_encoded,
    random_state=42,
)
# Then split the remainder into train and validation, again stratified.
idx_train, idx_val = train_test_split(
    idx_temp,
    test_size=0.111,
    stratify=y_encoded[idx_temp],
    random_state=42,
)

# Apply the same three index sets to every array.
X_img_train, X_img_val, X_img_test = (X_images[part] for part in (idx_train, idx_val, idx_test))
X_csv_train, X_csv_val, X_csv_test = (X_csv[part] for part in (idx_train, idx_val, idx_test))
y_train, y_val, y_test = (y_onehot[part] for part in (idx_train, idx_val, idx_test))

print(f"Train: {len(idx_train)}")
print(f"Val: {len(idx_val)}")
print(f"Test: {len(idx_test)}")

## 8. Normalize CSV Features

In [None]:
# Standardise the CSV features. The scaler is fitted on the training split
# only, then the same statistics are applied to validation and test.
scaler = StandardScaler().fit(X_csv_train)
X_csv_train, X_csv_val, X_csv_test = (
    scaler.transform(part) for part in (X_csv_train, X_csv_val, X_csv_test)
)

print(f"CSV normalized - Mean: {X_csv_train.mean():.4f}, Std: {X_csv_train.std():.4f}")

## 9. Build Dual-Input Model

In [None]:
def build_dual_input_model(image_shape, csv_dim):
    """Build and compile the two-branch genre classifier.

    1. Image branch: one Conv -> BatchNorm -> ReLU -> CBAM -> MaxPool ->
       Dropout stage per entry of ``CNN_FILTERS``, then global average
       pooling and a dense layer.
    2. CSV branch: two Dense -> BatchNorm -> Dropout stages.
    3. Fusion: both branch outputs are brought to the same width, stacked as
       a two-token sequence, passed through multi-head self-attention and
       averaged.
    4. Classifier: two dense layers followed by a softmax over
       ``NUM_CLASSES``.

    Args:
        image_shape: Shape of a single image, without the batch axis.
        csv_dim: Number of CSV features per sample.

    Returns:
        A ``Model`` taking ``[image_input, csv_input]``, compiled with Adam
        and categorical cross-entropy.
    """
    # Width shared by both modality tokens; the attention layer needs the two
    # tokens to have the same last dimension before they can be stacked.
    fusion_dim = 256

    # ==================== IMAGE BRANCH ====================
    img_input = layers.Input(shape=image_shape, name='image_input')
    x = img_input

    for filters in CNN_FILTERS:
        x = layers.Conv2D(filters, 3, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = CBAM()(x)
        x = layers.MaxPooling2D(2)(x)
        x = layers.Dropout(0.25)(x)

    x = layers.GlobalAveragePooling2D()(x)
    img_features = layers.Dense(fusion_dim, activation='relu')(x)
    img_features = layers.Dropout(0.3)(img_features)

    # ==================== CSV BRANCH ====================
    csv_input = layers.Input(shape=(csv_dim,), name='csv_input')

    y = layers.Dense(256, activation='relu')(csv_input)
    y = layers.BatchNormalization()(y)
    y = layers.Dropout(0.3)(y)
    y = layers.Dense(128, activation='relu')(y)
    y = layers.BatchNormalization()(y)
    csv_features = layers.Dropout(0.3)(y)

    # ==================== FUSION WITH ATTENTION ====================
    # Learned linear projection of the 128-wide CSV features up to fusion_dim
    # (not zero padding). It is applied to the flat features directly; the
    # earlier Reshape((1, 128)) -> Reshape((128,)) round-trip was a no-op.
    csv_projected = layers.Dense(fusion_dim)(csv_features)

    # Give each modality a length-1 sequence axis, then stack them into a
    # two-token sequence: [image token, csv token].
    img_token = layers.Reshape((1, fusion_dim))(img_features)
    csv_token = layers.Reshape((1, fusion_dim))(csv_projected)
    combined = layers.Concatenate(axis=1)([img_token, csv_token])

    # Self-attention across the two modality tokens.
    attn = layers.MultiHeadAttention(
        num_heads=ATTENTION_HEADS,
        key_dim=32
    )(combined, combined)

    # Average the two attended tokens into one fused vector.
    attended = layers.GlobalAveragePooling1D()(attn)

    # ==================== CLASSIFICATION ====================
    z = layers.Dense(DENSE_UNITS, activation='relu')(attended)
    z = layers.BatchNormalization()(z)
    z = layers.Dropout(DROPOUT_RATE)(z)

    z = layers.Dense(256, activation='relu')(z)
    z = layers.Dropout(0.3)(z)

    outputs = layers.Dense(NUM_CLASSES, activation='softmax')(z)

    # ==================== COMPILE ====================
    model = Model(inputs=[img_input, csv_input], outputs=outputs)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


# Input shapes are taken from the training arrays (batch axis dropped).
model = build_dual_input_model(X_img_train.shape[1:], X_csv_train.shape[1])
model.summary()

## 10. Training

In [None]:
# Training callbacks: stop early on stalled validation accuracy, halve the
# learning rate on stalled validation loss, and checkpoint the best model.
callbacks = [
    EarlyStopping(
        monitor='val_accuracy',
        patience=25,
        restore_best_weights=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=8,
        min_lr=1e-7,
        verbose=1,
    ),
    ModelCheckpoint(
        filepath='best_dual_input.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    ),
]

print("\nTraining dual-input model...\n")
# Both inputs are passed as a list in the order the model declares them.
history = model.fit(
    x=[X_img_train, X_csv_train],
    y=y_train,
    validation_data=([X_img_val, X_csv_val], y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

## 11. Training History

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

for ax, metric, title in ((ax1, 'accuracy', 'Accuracy'), (ax2, 'loss', 'Loss')):
    # Training curve first, then its validation counterpart.
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history[f'val_{metric}'], label='Val')
    ax.set_title(title, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('dual_input_history.png', dpi=300)
plt.show()

print(f"\nBest Val Accuracy: {max(history.history['val_accuracy']):.4f}")

## 12. Evaluation

In [None]:
# Restore the checkpoint with the best validation accuracy before scoring.
model.load_weights('best_dual_input.keras')

# Score the held-out test split; evaluate returns [loss, accuracy].
test_loss, test_acc = model.evaluate(
    x=[X_img_test, X_csv_test],
    y=y_test,
    verbose=0,
)

print("\n" + "=" * 60)
print(f"TEST ACCURACY: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"TEST LOSS: {test_loss:.4f}")
print("=" * 60)

## 13. Classification Report & Confusion Matrix

In [None]:
# Class probabilities for the test split, reduced to predicted / true class ids.
y_pred = model.predict([X_img_test, X_csv_test], verbose=0)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

# Take the class names from the fitted LabelEncoder so name <-> id pairing is
# exactly the one used to build the labels, and pass the id list explicitly.
# Without `labels=`, a class absent from both y_true and y_pred makes
# classification_report reject `target_names` and shrinks the confusion
# matrix so its rows no longer line up with the tick labels.
class_names = list(le.classes_)
class_ids = list(range(len(class_names)))

print("\nClassification Report:")
print("="*60)
print(classification_report(
    y_true_labels,
    y_pred_labels,
    labels=class_ids,
    target_names=class_names,
    digits=3,
))

# Rows are true classes, columns are predicted classes, both in class_ids order.
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted', fontweight='bold')
plt.ylabel('True', fontweight='bold')
plt.title(f'Dual-Input CNN+Attention (Acc: {test_acc:.2%})', fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('dual_input_cm.png', dpi=300)
plt.show()

## 14. Save

In [None]:
import joblib

# Persist the final model, the training curves and the fitted feature scaler.
model.save('dual_input_final.keras')
# history.history is a dict, so np.save stores it as a pickled object array;
# reading it back requires np.load(..., allow_pickle=True).
np.save(file='dual_input_history.npy', arr=history.history)
joblib.dump(value=scaler, filename='dual_input_scaler.pkl')

# best_dual_input.keras is listed too; it was written by the ModelCheckpoint
# callback during training, not by this cell.
print("Saved:")
print("  ✓ dual_input_final.keras")
print("  ✓ best_dual_input.keras")
print("  ✓ dual_input_scaler.pkl")

## Summary

**Dual-Input Architecture:**
```
PNG Spectrogram → CNN + CBAM → Image Features (256-dim)
                                                        ↘
                                                         → Attention Fusion → Classifier
                                                        ↗
CSV Features → Dense Network → Audio Features (128-dim)
```

**Why This Works Better:**
- Images capture **visual patterns** (texture, structure)
- CSV features capture **statistical audio properties** (MFCC means, spectral features)
- Attention can learn **how much weight to give each modality** for a given track
- Fusion combines complementary information

**Target:** 90-95% test accuracy, which would be a clear improvement over a single-input model — compare against the measured test accuracy in Section 12.