# Imports

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetV2B0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Use GPU for tensorflow

In [4]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU detected and enabled:", gpus)
    except RuntimeError as e:
        print("Error enabling GPU:", e)
else:
    print("No GPU detected. Running on CPU.")

GPU detected and enabled: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Genres

In [3]:
genre_profiles = {
    'blues':     np.array([-1,  0,  2,  2,  1,  0, -1,  0,  1,  1]),
    'classical': np.array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
    'country':   np.array([ 0,  1,  1,  2,  1,  0,  0,  0,  0,  0]),
    'disco':     np.array([ 2,  3,  2,  1,  0,  0,  1,  2,  3,  2]),
    'hiphop':    np.array([ 3,  4,  2,  0, -1, -1,  0,  1,  1,  0]),
    'jazz':      np.array([ 0,  1,  1,  2,  2,  1,  0,  0,  1,  0]),
    'metal':     np.array([ 2,  3,  0, -3, -4, -3,  0,  3,  3,  2]),
    'pop':       np.array([ 0,  1,  2,  2,  1,  0,  1,  1,  2,  2]),
    'reggae':    np.array([ 0,  1,  0,  0, -1, -1,  0,  0,  1,  0]),
    'rock':      np.array([ 1,  1,  0,  0,  1,  1,  0,  0,  1,  1])
}

# Ensure this order matches the model's output order.
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

def weighted_eq_profile(predictions, genre_list=genres, profiles=genre_profiles):
    """
    Given a predictions vector (softmax output over genres), compute a weighted
    equalization profile based on the top three predicted genres.
    
    Parameters:
      predictions: numpy array of shape (10,) with probabilities for each genre.
      genre_list: list of genre names in the order corresponding to predictions.
      profiles: dictionary mapping genre names to their equalization profile (numpy array).
      
    Returns:
      weighted_profile: numpy array of shape (10,) representing the weighted EQ profile.
    """
    # Get indices of the top 3 predicted genres.
    top3_idx = predictions.argsort()[-3:][::-1]
    top3_probs = predictions[top3_idx]
    
    # Normalize the top 3 probabilities so they sum to 1.
    weight_sum = np.sum(top3_probs)
    if weight_sum == 0:
        norm_weights = np.ones_like(top3_probs) / 3
    else:
        norm_weights = top3_probs / weight_sum
    
    # Compute the weighted combination of the equalization profiles.
    weighted_profile = np.zeros_like(profiles[genre_list[0]], dtype=float)
    for i, idx in enumerate(top3_idx):
        genre = genre_list[idx]
        profile = profiles[genre]
        weighted_profile += norm_weights[i] * profile
    
    return weighted_profile

# EfficientNet

In [5]:
def create_efficientnet_model(input_shape=(224, 224, 3), num_classes=10):
    """
    Create EfficientNetV2B0 model for music genre classification.
    
    Parameters:
      input_shape: tuple, shape of input images (height, width, channels)
      num_classes: int, number of genre classes
      
    Returns:
      model: compiled Keras model
    """
    # Create input layer
    inputs = Input(shape=input_shape)
    
    # Load pre-trained EfficientNetV2B0 without top layers
    base_model = EfficientNetV2B0(
        weights='imagenet',
        include_top=False,
        input_tensor=inputs
    )
    
    # Freeze base model layers initially
    base_model.trainable = False
    
    # Add custom top layers
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    
    # Create the model
    model = Model(inputs=inputs, outputs=predictions)
    
    return model, base_model

model_save_path_keras = "./saved_models/CONEqNet_EfficientNetV2.keras"
model_save_path_h5 = "./saved_models/CONEqNet_EfficientNetV2.h5"

if os.path.exists(model_save_path_keras):
    model = tf.keras.models.load_model(model_save_path_keras)
    print(f"Loaded existing model from {model_save_path_keras}")
    base_model = None  # We'll handle fine-tuning separately if needed
elif os.path.exists(model_save_path_h5):
    model = tf.keras.models.load_model(model_save_path_h5)
    print(f"Loaded existing model from {model_save_path_h5}")
    base_model = None
else:
    print("No saved model found, creating a new EfficientNetV2 model...")
    # Input shape for EfficientNetV2 (224x224x3 for RGB-like mel spectrograms)
    input_shape = (224, 224, 3)
    
    model, base_model = create_efficientnet_model(input_shape=input_shape, num_classes=len(genres))
    
    # Compile the model
    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    print("New EfficientNetV2 model successfully compiled.")
    model.summary()

No saved model found, creating a new EfficientNetV2 model...
New EfficientNetV2 model successfully compiled.
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling (Rescaling)          (None, 224, 224, 3)  0           ['input_1[0][0]']                
                                                                                                  
 normalization (Normalization)  (None, 224, 224, 3)  0           ['rescaling[0][0]']              
                                                                                    

# Define Data and Mel Spectrogram extraction

In [6]:
data_dir = './Data'
audio_dir = os.path.normpath(os.path.join(data_dir, 'genres_original'))
csv_path = os.path.normpath(os.path.join(data_dir, 'features_30_sec.csv'))

df_features = pd.read_csv(csv_path)

df_features['filepath'] = df_features.apply(lambda row: os.path.join(audio_dir, row['label'], row['filename']), axis=1)
print("\nAdded filepath to dataframe features.")

genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
label_to_index = {genre: i for i, genre in enumerate(genres)}

def extract_mel_spectrogram(file_path, n_mels=128, hop_length=512, n_fft=2048, target_shape=(224, 224)):
    """
    Extract mel spectrogram from audio file and resize to target shape.
    
    Parameters:
      file_path: str, path to audio file
      n_mels: int, number of mel bands
      hop_length: int, hop length for STFT
      n_fft: int, FFT window size
      target_shape: tuple, target image shape (height, width)
      
    Returns:
      mel_spec: numpy array of shape (target_shape[0], target_shape[1], 3)
    """
    try:
        # Load audio file
        y, sr = librosa.load(file_path, duration=30, sr=22050)
        
        # Extract mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=y, 
            sr=sr, 
            n_mels=n_mels, 
            hop_length=hop_length,
            n_fft=n_fft
        )
        
        # Convert to log scale (dB)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        
        # Normalize to [0, 1] range
        mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
        
        # Resize to target shape
        from scipy.ndimage import zoom
        zoom_factors = (target_shape[0] / mel_spec_norm.shape[0], 
                       target_shape[1] / mel_spec_norm.shape[1])
        mel_spec_resized = zoom(mel_spec_norm, zoom_factors)
        
        # Convert to 3-channel image (RGB-like)
        # Method 1: Stack the same spectrogram 3 times
        mel_spec_3ch = np.stack([mel_spec_resized] * 3, axis=-1)
        
        return mel_spec_3ch
        
    except Exception as e:
        print(f"Error extracting mel spectrogram from {file_path}: {e}")
        # Return a default spectrogram in case of error
        return np.zeros((*target_shape, 3))

def prepare_data_mel(df, target_shape=(224, 224), n_mels=128):
    """
    Prepare mel spectrogram data for training.
    
    Parameters:
      df: pandas DataFrame with file paths and labels
      target_shape: tuple, target image shape
      n_mels: int, number of mel bands
      
    Returns:
      X: numpy array of mel spectrograms
      y: numpy array of labels
    """
    X = []
    y = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing audio files"):
        try:
            mel_spec = extract_mel_spectrogram(
                row['filepath'], 
                n_mels=n_mels, 
                target_shape=target_shape
            )
            
            X.append(mel_spec)
            y.append(label_to_index[row['label']])
            
        except Exception as e:
            print(f"Error processing {row['filepath']}: {e}")
            
    return np.array(X), np.array(y)


Added filepath to dataframe features.


# Train test split

In [7]:
train_df, val_df = train_test_split(df_features, test_size=0.2, stratify=df_features['label'], random_state=42)
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)

# Prepare training and validation data with mel spectrograms
print("Preparing training data...")
X_train, y_train = prepare_data_mel(train_df)
print("Preparing validation data...")
X_val, y_val = prepare_data_mel(val_df)

# One-hot encode the labels
y_train_cat = to_categorical(y_train, num_classes=len(genres))
y_val_cat = to_categorical(y_val, num_classes=len(genres))

print("Data preparation completed!")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train_cat.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val_cat.shape)

Train set shape: (800, 61)
Validation set shape: (200, 61)
Preparing training data...


  y, sr = librosa.load(file_path, duration=30, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Processing audio files:  81%|████████▏ | 650/800 [00:29<00:05, 27.97it/s]

Error extracting mel spectrogram from Data\genres_original\jazz\jazz.00054.wav: 


Processing audio files: 100%|██████████| 800/800 [00:35<00:00, 22.41it/s]


Preparing validation data...


Processing audio files: 100%|██████████| 200/200 [00:07<00:00, 25.33it/s]

Data preparation completed!
X_train shape: (800, 224, 224, 3)
y_train shape: (800, 10)
X_val shape: (200, 224, 224, 3)
y_val shape: (200, 10)





# Data Augmentation

In [8]:
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    zoom_range=0.1,
    fill_mode='nearest'
)

# Fit the generator to the training data
datagen.fit(X_train)

# Train

In [9]:
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=15, 
    restore_best_weights=True,
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.5,
    patience=5,
    verbose=1,
    mode='min',
    min_delta=0.001,
    cooldown=2,
    min_lr=1e-8
)

## Phase 1: Train with frozen model

In [None]:
print("Phase 1: Training with frozen EfficientNetV2 base...")
history_phase1 = model.fit(
    datagen.flow(X_train, y_train_cat, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=20,
    validation_data=(X_val, y_val_cat),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Phase 2: Fine tune with unfrozen base model

In [None]:
# Phase 2: Fine-tune with unfrozen base model (if base_model is available)
if base_model is not None:
    print("\nPhase 2: Fine-tuning with unfrozen EfficientNetV2 base...")
    
    # Unfreeze the base model
    base_model.trainable = True
    
    # Use a lower learning rate for fine-tuning
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Fine-tune for additional epochs
    history_phase2 = model.fit(
        datagen.flow(X_train, y_train_cat, batch_size=16),
        steps_per_epoch=len(X_train) // 16,
        epochs=15,
        validation_data=(X_val, y_val_cat),
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    
    # Combine histories
    history = history_phase1
    for key in history_phase2.history:
        history.history[key].extend(history_phase2.history[key])
else:
    history = history_phase1

# Print final training and validation accuracy
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]
print(f"\nFinal Training Accuracy: {final_train_acc * 100:.2f}%")
print(f"Final Validation Accuracy: {final_val_acc * 100:.2f}%")

# %% [markdown]
# ### Plot Training History

# %%
def plot_training_history(history):
    """Plot training and validation accuracy and loss."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # Plot accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    ax1.grid(True)
    
    # Plot loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    ax2.grid(True)
    
    plt.tight_layout()
    plt.show()

# Plot the training history
plot_training_history(history)

# %% [markdown]
# ### Save model

# %%
# Define the save paths
model_save_path_keras = "./saved_models/CONEqNet_EfficientNetV2.keras"
model_save_path_h5 = "./saved_models/CONEqNet_EfficientNetV2.h5"

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(model_save_path_keras), exist_ok=True)

# Save the model in TensorFlow's recommended format (.keras)
model.save(model_save_path_keras)
print(f"Model saved successfully to {model_save_path_keras}")

# Save the model in HDF5 format (.h5) for compatibility with older Keras versions
model.save(model_save_path_h5)
print(f"Model saved successfully to {model_save_path_h5}")

# %% [markdown]
# ### Model Evaluation and Prediction Examples

# %%
def predict_genre_with_eq(file_path, model, genres=genres):
    """
    Predict genre and return EQ profile for a single audio file.
    
    Parameters:
      file_path: str, path to audio file
      model: trained Keras model
      genres: list of genre names
      
    Returns:
      predicted_genre: str, predicted genre name
      confidence: float, prediction confidence
      eq_profile: numpy array, weighted EQ profile
    """
    # Extract mel spectrogram
    mel_spec = extract_mel_spectrogram(file_path, target_shape=(224, 224))
    mel_spec = np.expand_dims(mel_spec, axis=0)  # Add batch dimension
    
    # Make prediction
    predictions = model.predict(mel_spec, verbose=0)[0]
    
    # Get predicted genre
    predicted_idx = np.argmax(predictions)
    predicted_genre = genres[predicted_idx]
    confidence = predictions[predicted_idx]
    
    # Calculate weighted EQ profile
    eq_profile = weighted_eq_profile(predictions, genres, genre_profiles)
    
    return predicted_genre, confidence, eq_profile, predictions

# Example usage (uncomment to test with a specific file)
# file_path = "path/to/your/audio/file.wav"
# genre, conf, eq, preds = predict_genre_with_eq(file_path, model)
# print(f"Predicted Genre: {genre} (Confidence: {conf:.2f})")
# print(f"EQ Profile: {eq}")

print("Model training and setup completed successfully!")
print("Use the predict_genre_with_eq() function to classify new audio files.")