# ADA Project: Bollywood Songs Genre Classification

## Yash Gawhale (PES1UG22AM915)

## Vinay Palled (PES1UG22AM914)

## K Musadiq Pasha (PES1UG22AM079)

In [None]:
# Import necessary libraries
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Genre folder names and the dataset root (adjust to match your Kaggle dataset paths)
genres = ['bollypop', 'carnatic', 'ghazal', 'semiclassical', 'sufi']
data_dir = '/kaggle/input/indian-music-genre-dataset/genrenew'  # Update with your dataset directory

# Start every genre with an empty list of file paths
audio_files = {}
for genre in genres:
    audio_files[genre] = []

# Walk each genre folder and keep only the .mp3 entries, stored as full paths
for genre in genres:
    genre_path = os.path.join(data_dir, genre)
    for entry in os.listdir(genre_path):
        if entry.endswith('.mp3'):
            audio_files[genre].append(os.path.join(genre_path, entry))

# Report how many files were found for each genre
print("Summary of dataset:")
for genre in audio_files:
    files = audio_files[genre]
    print(f"{genre.capitalize()}: {len(files)} files")

# Draw the waveform and the dB spectrogram of one audio file side by side
def plot_audio_sample(file_path, genre):
    """Plot the waveform and log-frequency spectrogram of a single audio file.

    Args:
        file_path: Path of the audio file to load.
        genre: Genre name, used only in the two subplot titles.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # sr=None asks librosa to keep the sampling rate stored in the file
    signal, rate = librosa.load(file_path, sr=None)

    plt.figure(figsize=(12, 4))

    # Left panel: amplitude over time
    plt.subplot(1, 2, 1)
    librosa.display.waveshow(signal, sr=rate)
    plt.title(f'Waveform - {genre}')
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")

    # Right panel: STFT magnitude in dB relative to its maximum
    plt.subplot(1, 2, 2)
    magnitude = np.abs(librosa.stft(signal))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(spectrogram_db, sr=rate, x_axis='time', y_axis='log')
    plt.colorbar(format="%+2.0f dB")
    plt.title(f'Spectrogram - {genre}')
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (Hz)")

    plt.tight_layout()
    plt.show()

# Show one example per genre: the first collected file of each genre is plotted
for genre in audio_files:
    print(f"Plotting sample audio from genre: {genre}")
    files = audio_files[genre]
    plot_audio_sample(files[0], genre)

# Statistical analysis: measure the duration (in seconds) of every file, grouped by genre
durations = {}
for genre in genres:
    durations[genre] = []

for genre, files in audio_files.items():
    genre_durations = durations[genre]
    for file in files:
        # Decode at the native sampling rate, then derive the length in seconds
        y, sr = librosa.load(file, sr=None)
        seconds = librosa.get_duration(y=y, sr=sr)
        genre_durations.append(seconds)

# Overlay one duration histogram (with KDE curve) per genre on a shared figure
plt.figure(figsize=(10, 6))
for genre in genres:
    sns.histplot(durations[genre], kde=True, label=genre, bins=20)
plt.title("Duration Distribution by Genre")
plt.xlabel("Duration (seconds)")
plt.ylabel("Count")
plt.legend()
plt.show()

# Print mean / standard deviation / max / min of the durations for each genre
print("Summary statistics for durations (seconds):")
for genre, duration_list in durations.items():
    mean_s = np.mean(duration_list)
    std_s = np.std(duration_list)
    max_s = np.max(duration_list)
    min_s = np.min(duration_list)
    print(f"{genre.capitalize()} - Mean: {mean_s:.2f}, Std Dev: {std_s:.2f}, Max: {max_s:.2f}, Min: {min_s:.2f}")


# Audio Data Processing and Visualization

This block organizes the dataset, visualizes sample audio files, and analyzes file durations.

## Key Steps:
1. **Import Libraries**: 
   - `librosa` for audio processing.
   - `matplotlib` and `seaborn` for visualizations.
2. **Dataset Setup**: 
   - Genres include Bollypop, Carnatic, Ghazal, Semiclassical, and Sufi.
   - Audio file paths are organized by genre into a dictionary.
3. **Dataset Summary**:
   - Total files per genre are printed for an overview.
4. **Audio Visualization**:
   - A `plot_audio_sample` function generates waveforms (amplitude vs. time) and spectrograms (frequency vs. time) for sample files.
5. **Duration Analysis**:
   - Calculates and stores durations of audio files by genre.
   - Plots duration distributions and displays summary statistics (mean, std, min, max).

This setup ensures the dataset is well-organized, with initial insights into its structure and characteristics, laying the groundwork for feature extraction and modeling.


In [None]:
# Import necessary libraries
import os
import librosa
import numpy as np
import random

# Preprocessing parameters; used as the default arguments of preprocess_audio and augment_audio
SAMPLE_RATE = 16000  # Target sampling rate in Hz; preprocess_audio resamples to this on load
DURATION = 30  # Target duration for each audio file (in seconds)
AUDIO_LENGTH = SAMPLE_RATE * DURATION  # Number of samples in one clip of the target duration

# Preprocess one audio file: resample, force a fixed length, normalize
def preprocess_audio(file_path, sample_rate=SAMPLE_RATE, audio_length=AUDIO_LENGTH):
    """Load an audio file and return a fixed-length, normalized waveform.

    Args:
        file_path: Path of the audio file to load.
        sample_rate: Sampling rate the audio is resampled to while loading.
        audio_length: Exact number of samples the returned array must have.

    Returns:
        1-D array of ``audio_length`` samples, normalized with
        ``librosa.util.normalize`` (default settings).
    """
    samples, _ = librosa.load(file_path, sr=sample_rate)

    # Negative when the clip is too long, otherwise the amount of zero padding required
    n_missing = audio_length - len(samples)
    if n_missing < 0:
        samples = samples[:audio_length]
    else:
        samples = np.pad(samples, (0, n_missing), mode='constant')

    return librosa.util.normalize(samples)

# Optional: Function for data augmentation
def augment_audio(y, sr=SAMPLE_RATE):
    """Create three randomly perturbed copies of a waveform.

    Args:
        y: 1-D audio signal (as returned by ``preprocess_audio``).
        sr: Sampling rate of ``y``; needed by the pitch shifter.

    Returns:
        List ``[y_pitch, y_stretch, y_noise]``. Every element has the same
        number of samples as ``y``.
    """
    # Random pitch shift of up to two semitones in either direction
    y_pitch = librosa.effects.pitch_shift(y, sr=sr, n_steps=random.uniform(-2, 2))

    # Random time stretch (avoiding extreme stretching)
    y_stretch = librosa.effects.time_stretch(y, rate=random.uniform(0.8, 1.2))
    # Stretching changes the number of samples (roughly len(y) / rate), which
    # would break the fixed-length guarantee of the preprocessed dataset.
    # Trim or zero-pad the result back to the input length.
    y_stretch = librosa.util.fix_length(y_stretch, size=len(y))

    # Additive Gaussian noise with a small standard deviation
    noise = np.random.normal(0, 0.005, len(y))
    y_noise = y + noise

    # Return augmented versions
    return [y_pitch, y_stretch, y_noise]


# Preprocess every file and store it together with its augmented variants, grouped by genre
preprocessed_data = {}
for genre in genres:
    preprocessed_data[genre] = []

for genre, files in audio_files.items():
    print(f"Processing genre: {genre}")
    genre_samples = preprocessed_data[genre]
    for file in files:
        # Fixed-length, normalized waveform for this file
        y = preprocess_audio(file)
        genre_samples.append(y)

        # Followed by its augmented versions, in the order returned by augment_audio
        augmented_samples = augment_audio(y)
        for variant in augmented_samples:
            genre_samples.append(variant)

# Report the sample count per genre and the shape of the first sample
for genre, audio_list in preprocessed_data.items():
    first_shape = audio_list[0].shape
    print(f"{genre.capitalize()}: {len(audio_list)} samples, Sample shape: {first_shape}")

# Save every sample as <genre>/<genre>_<index>.npy so later steps can reload it quickly
output_dir = '/kaggle/working/preprocessed_audio/'
os.makedirs(output_dir, exist_ok=True)
for genre, audio_list in preprocessed_data.items():
    genre_dir = os.path.join(output_dir, genre)
    os.makedirs(genre_dir, exist_ok=True)
    for idx, audio in enumerate(audio_list):
        target_path = os.path.join(genre_dir, f'{genre}_{idx}.npy')
        np.save(target_path, audio)
print("Preprocessed audio data saved.")


# Audio Preprocessing and Augmentation

This block preprocesses audio files to prepare them for consistent analysis and applies optional data augmentation to increase data diversity.

## Key Steps:
1. **Set Parameters**:
   - `SAMPLE_RATE` (16 kHz): Standard sampling rate for consistency.
   - `DURATION` (30 seconds): Target duration for all audio files.
   - `AUDIO_LENGTH`: Total number of samples needed per file.

2. **Preprocessing Function**:
   - Resamples audio to `SAMPLE_RATE`.
   - Trims or pads files to ensure a fixed length of `AUDIO_LENGTH`.
   - Normalizes audio signals for uniform scaling.

3. **Data Augmentation**:
   - **Pitch Shift**: Alters pitch randomly within a small range.
   - **Time Stretch**: Speeds up or slows down audio slightly.
   - **Noise Addition**: Adds small random noise for robustness.

4. **Dataset Preprocessing**:
   - Each file is preprocessed and then augmented with pitch-shifted, time-stretched, and noisy versions, so every source file yields four samples.
   - Results are stored in a dictionary `preprocessed_data` organized by genre.

5. **Validation**:
   - Prints the number of processed samples and shape consistency for each genre.

6. **Optional Save**:
   - Saves preprocessed audio files as `.npy` files for quick access in future analyses.

This process ensures uniformity in audio length and scale, while augmentation enhances model robustness by introducing variability in the dataset.


In [None]:
# Import necessary libraries
import os
import numpy as np
import librosa
import pandas as pd
from tqdm import tqdm

# Root folder of the preprocessed data: one sub-folder of .npy waveforms per genre
output_dir = '/kaggle/working/preprocessed_audio/'
# Genre names; each doubles as the sub-folder name and as the class label
genres = ['bollypop', 'carnatic', 'ghazal', 'semiclassical', 'sufi']

# Parallel lists filled by the extraction loop: features[i] belongs to labels[i]
features = []
labels = []

# Feature extraction function
def extract_features(y, sr):
    """Summarize an audio signal as one flat feature vector.

    Args:
        y: 1-D audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        1-D array with, in order: 13 MFCC means, the chroma-bin means,
        the mean spectral centroid, bandwidth and rolloff, and the
        estimated tempo.
    """
    # Timbre: mean of each of the 13 MFCCs over all frames
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Harmony / pitch class content: mean of each chroma bin over all frames
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    # Spectral shape descriptors, each reduced to a single mean value
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))

    # Rhythm: tempo estimated from the onset strength envelope.
    # librosa 0.10 moved the estimator to librosa.feature.tempo and deprecated
    # librosa.beat.tempo (removed in 1.0); use the new location when it exists
    # and fall back to the old one on earlier releases.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    estimate_tempo = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
    tempo = estimate_tempo(onset_envelope=onset_env, sr=sr)[0]

    # Combine all features into a single array
    feature_vector = np.hstack([
        mfccs_mean,            # 13 MFCC features
        chroma_mean,           # 12 Chroma features
        spectral_centroid,     # Spectral centroid
        spectral_bandwidth,    # Spectral bandwidth
        spectral_rolloff,      # Spectral rolloff
        tempo                  # Tempo
    ])

    return feature_vector

# Turn every saved waveform into a feature vector and remember which genre it came from
for genre in genres:
    genre_dir = os.path.join(output_dir, genre)
    saved_files = os.listdir(genre_dir)
    for file_name in tqdm(saved_files, desc=f"Processing {genre}"):
        # Reload the preprocessed waveform written by the preprocessing step
        waveform = np.load(os.path.join(genre_dir, file_name))
        features.append(extract_features(waveform, sr=SAMPLE_RATE))
        labels.append(genre)

# One row per sample: the feature columns followed by a 'label' column
features_df = pd.DataFrame(features).assign(label=labels)

# Persist the table so the modelling cells can load it without recomputing features
features_csv_path = '/kaggle/working/audio_features.csv'
features_df.to_csv(features_csv_path, index=False)
print(f"Feature extraction complete. Features saved to {features_csv_path}")


# Audio Feature Extraction

This block focuses on extracting meaningful audio features from preprocessed audio files to prepare the dataset for machine learning models.

## Key Steps:
1. **Setup**:
   - Specifies the path to preprocessed audio files.
   - Defines genres and initializes lists for storing extracted features and their corresponding labels.

2. **Feature Extraction Function**:
   - **MFCCs (Mel Frequency Cepstral Coefficients)**: Captures timbral features (13 coefficients).
   - **Chroma Features**: Represents harmonic and pitch content (12 features).
   - **Spectral Features**:
     - Centroid: Indicates brightness of sound.
     - Bandwidth: Measures spread of the spectrum.
     - Rolloff: Frequency below which 85% of the spectral energy lies.
   - **Rhythm Feature (Tempo)**: Captures tempo using onset strength.
   - Combines all extracted features into a single vector.

3. **Feature Extraction for Dataset**:
   - Iterates through each genre and file in the preprocessed directory.
   - Extracts features for each audio file and appends them to the `features` list.
   - Stores the corresponding genre label in the `labels` list.

4. **Dataframe Creation**:
   - Converts the extracted features and labels into a `pandas` DataFrame.
   - Adds a `label` column for genre information.

5. **Save Features**:
   - Saves the feature DataFrame as a `.csv` file for easy loading in subsequent analyses or model training.

## Output:
The extracted features are saved in `/kaggle/working/audio_features.csv`, ready for use in training machine learning models. This process ensures a rich set of audio descriptors are captured for each sample, covering timbral, harmonic, and rhythmic aspects.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# StandardScaler is needed for the scaling step below
from sklearn.preprocessing import StandardScaler

# Load feature data
features_csv_path = '/kaggle/working/audio_features.csv'
data = pd.read_csv(features_csv_path)

# Separate features and labels
X = data.drop(columns=['label']).values  # Feature values
y = data['label'].values                 # Genre labels

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)  # Convert labels to categorical format for classification
num_classes = y_categorical.shape[1]       # Output width follows the labels instead of a hard-coded 5

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Standardize the inputs. The feature columns live on very different scales
# (MFCCs, chroma values, spectral statistics in Hz, tempo in BPM), which makes
# a dense network hard to train on raw values. The scaler is fitted on the
# training rows only so no test-set statistics leak into training.
# X_train / X_test themselves are left unscaled for any later cell that reads them.
dense_scaler = StandardScaler()
X_train_std = dense_scaler.fit_transform(X_train)
X_test_std = dense_scaler.transform(X_test)

# Define the neural network model
model = Sequential([
    Dense(128, input_shape=(X_train_std.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')  # One output unit per genre
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (20% of the training rows are held out for validation)
history = model.fit(X_train_std, y_train, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model on the standardized test rows
test_loss, test_accuracy = model.evaluate(X_test_std, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Save the trained model
model.save('/kaggle/working/audio_genre_classification_model.h5')
print("Model training complete and saved as 'audio_genre_classification_model.h5'")


# Neural Network for Audio Genre Classification

This block builds, trains, and evaluates a neural network to classify audio samples into genres based on extracted features.

## Key Steps:
1. **Load Feature Data**:
   - Reads the feature dataset from the `.csv` file.
   - Separates features (`X`) and genre labels (`y`).

2. **Label Encoding**:
   - Converts genre labels into numerical format using `LabelEncoder`.
   - Applies one-hot encoding (`to_categorical`) for multi-class classification.

3. **Train-Test Split**:
   - Splits the data into training (80%) and testing (20%) sets for model evaluation.

4. **Define Neural Network**:
   - A sequential model with:
     - **Dense Layers**: Fully connected layers with 128, 64, and 32 neurons for feature extraction.
     - **Dropout Layers**: Prevent overfitting by randomly deactivating 30% of neurons.
     - **Output Layer**: 5 neurons (one for each genre) with `softmax` activation for multi-class classification.

5. **Compile the Model**:
   - Optimizer: **Adam** for efficient training.
   - Loss Function: **Categorical Crossentropy** for multi-class classification.
   - Metric: **Accuracy** to track performance.

6. **Train the Model**:
   - Trains for 30 epochs with a batch size of 32.
   - Splits the training data further for validation.

7. **Evaluate the Model**:
   - Tests the model on unseen test data and prints the test accuracy.

8. **Save the Model**:
   - Saves the trained model as `audio_genre_classification_model.h5` for future predictions or deployment.

## Output:
The model's accuracy on the held-out test set is printed, and the trained model is saved for reuse. This step completes the pipeline from feature extraction to genre prediction using a neural network.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from sklearn.preprocessing import StandardScaler

# train_test_split is needed to build this cell's own split
from sklearn.model_selection import train_test_split

# Build the CNN's own train/test split from the raw features.
# Previously X_scaled was computed but never used: the CNN was fitted and
# evaluated on the unscaled 2-D X_train / X_test of the dense-network cell.
# X and y_categorical come from that earlier cell; the same test_size and
# random_state reproduce its row partition.
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42
)

# Standardize features using statistics from the training rows only,
# then add a trailing channel axis so Conv1D receives (samples, features, 1)
scaler = StandardScaler()
X_train_cnn = np.expand_dims(scaler.fit_transform(X_train_cnn), axis=2)
X_test_cnn = np.expand_dims(scaler.transform(X_test_cnn), axis=2)
X_scaled = np.expand_dims(scaler.transform(X), axis=2)  # Full scaled set, kept under its original name

# Define the CNN model
cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Conv1D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Conv1D(256, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(5, activation='softmax')
])

# Compile the CNN model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the CNN model on the scaled, channel-expanded training rows
history = cnn_model.fit(X_train_cnn, y_train_cnn, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the CNN model
test_loss, test_accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Save the trained model
cnn_model.save('/kaggle/working/audio_genre_cnn_model.h5')
print("CNN model training complete and saved as 'audio_genre_cnn_model.h5'")


# Convolutional Neural Network (CNN) for Audio Genre Classification

This block implements a Convolutional Neural Network (CNN) to classify audio samples into genres, leveraging spatial features in the data.

## Key Steps:

1. **Feature Standardization**:
   - Uses `StandardScaler` to standardize the feature values to have a mean of 0 and a standard deviation of 1.
   - Reshapes the standardized data for compatibility with the 1D convolutional layers by adding an additional dimension.

2. **CNN Architecture**:
   - **Conv1D Layers**: Extracts local patterns and relationships in the feature space.
   - **BatchNormalization**: Normalizes intermediate outputs to stabilize training and improve convergence.
   - **MaxPooling1D**: Reduces feature dimensionality and captures dominant patterns.
   - **Dropout Layers**: Prevents overfitting by randomly deactivating neurons during training.
   - **Flatten Layer**: Converts multi-dimensional output into a 1D vector for dense layers.
   - **Dense Layers**: Fully connected layers to combine extracted features for classification.
   - **Output Layer**: Uses `softmax` activation with 5 neurons (one for each genre) for multi-class classification.

3. **Compile the Model**:
   - **Optimizer**: Adam, for efficient weight updates during training.
   - **Loss Function**: Categorical Crossentropy for multi-class classification.
   - **Metric**: Accuracy to monitor classification performance.

4. **Train the Model**:
   - Trains the model for 50 epochs with a batch size of 32.
   - Uses 20% of the training data as a validation set.

5. **Evaluate the Model**:
   - Evaluates the CNN on the test set and prints the test accuracy.

6. **Save the Model**:
   - Saves the trained CNN model as `audio_genre_cnn_model.h5` for future use.

## Output:
The CNN applies convolutional layers to the extracted audio features, and its accuracy on the test set is printed after training. The saved model can be used for deployment or further analysis.


In [None]:
pip install librosa tensorflow


In [None]:
!pip install -q -U keras-tuner


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Load features and labels
features_df = pd.read_csv('/kaggle/working/audio_features.csv')
X = features_df.drop(columns=['label']).values
# Category codes follow the sorted label names, i.e. 0..4 in alphabetical genre order
y = features_df['label'].astype('category').cat.codes.values  # Encode labels

# Split the raw features first. Same test_size / random_state as before, so the
# row partition is unchanged; only the order of "scale" and "split" differs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only. Fitting on the full dataset lets the
# test rows influence the mean/std used for training (data leakage).
scaler = StandardScaler()
X_train = np.expand_dims(scaler.fit_transform(X_train), axis=2)  # (samples, features, 1) for Conv1D
X_test = np.expand_dims(scaler.transform(X_test), axis=2)

# Full scaled dataset, kept because build_model reads X_scaled.shape[1]
X_scaled = np.expand_dims(scaler.transform(X), axis=2)

# One-hot encode the integer labels for categorical cross-entropy
y_train = to_categorical(y_train, num_classes=5)
y_test = to_categorical(y_test, num_classes=5)


# Data Preparation for Model Training

This block handles the installation of required libraries and prepares the dataset for training a deep learning model.

## Key Steps:

1. **Install Dependencies**:
   - Installs `librosa` for audio processing and `tensorflow` for building and training deep learning models.
   - Installs `keras-tuner` (optional) for hyperparameter optimization if needed in future steps.

2. **Load Features and Labels**:
   - Reads the extracted features from `audio_features.csv` into a pandas DataFrame.
   - Separates the feature columns (`X`) from the labels (`y`).
   - Encodes genre labels into numerical categories using pandas `.cat.codes`.

3. **Feature Scaling**:
   - Standardizes features using `StandardScaler` to ensure all feature values are on the same scale (mean = 0, std = 1).
   - Adds an extra dimension to `X_scaled` for compatibility with models like CNNs.

4. **Split Dataset**:
   - Splits the dataset into training (80%) and testing (20%) sets using `train_test_split`.
   - Converts the labels (`y_train` and `y_test`) into a one-hot encoded format with `to_categorical` to support multi-class classification.

## Output:
The dataset is now ready for deep learning:
- **Features (`X_train` and `X_test`)**: Scaled and reshaped.
- **Labels (`y_train` and `y_test`)**: One-hot encoded.
These processed inputs ensure compatibility with TensorFlow/Keras models.


In [None]:
from keras_tuner import RandomSearch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization

def build_model(hp):
    """Build and compile a 1-D CNN whose sizes are drawn from the tuner's search space.

    Args:
        hp: Hyperparameter object passed in by Keras Tuner; values are sampled
            through ``hp.Int`` and ``hp.Float``.

    Returns:
        A compiled ``Sequential`` model with a 5-way softmax output, using the
        Adam optimizer and categorical cross-entropy loss.
    """
    model = Sequential()

    # padding='same' keeps the sequence length through every convolution, so only
    # the pooling layers shorten it. With unpadded convolutions each conv+pool
    # stage more than halves the sequence, and a short feature vector (29 values
    # when produced by this notebook's extract_features) shrinks to length 1
    # before the last Conv1D of the n_layers=3 configuration, which then cannot
    # be built.
    model.add(Conv1D(hp.Int('conv1_filters', min_value=32, max_value=256, step=32),
                     kernel_size=3, padding='same', activation='relu',
                     input_shape=(X_scaled.shape[1], 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(hp.Float('dropout1', 0.2, 0.5, step=0.1)))

    # Additional layers with variable parameters (1 to 3 extra conv blocks)
    for i in range(hp.Int('n_layers', 1, 3)):
        model.add(Conv1D(hp.Int(f'conv{i+2}_filters', min_value=64, max_value=256, step=32),
                         kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(hp.Float(f'dropout{i+2}', 0.2, 0.5, step=0.1)))

    # Classification head
    model.add(Flatten())
    model.add(Dense(hp.Int('dense_units', min_value=64, max_value=512, step=64), activation='relu'))
    model.add(Dropout(hp.Float('dense_dropout', 0.2, 0.5, step=0.1)))
    model.add(Dense(5, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Configure a random search over the build_model search space
tuner_options = {
    'objective': 'val_accuracy',
    'max_trials': 5,
    'executions_per_trial': 2,
    'directory': '/kaggle/working/tuner',
    'project_name': 'indian_song_genre_classification',
}
tuner = RandomSearch(build_model, **tuner_options)

# Each trial trains for 20 epochs and is scored on a 20% validation split of the training data
tuner.search(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Keep the single best model found by the search
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]


In [None]:
# Fit the model returned by the tuner search for 50 more epochs, with 20% of
# X_train used for validation (validation_split=0.2). The returned `history`
# is what the accuracy/loss curves are later plotted from.
history = best_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


# Hyperparameter Tuning for CNN Model

This block uses `Keras Tuner` to optimize the hyperparameters of a Convolutional Neural Network (CNN) for audio genre classification.

## Key Steps:

1. **Model Building Function**:
   - Defines a CNN architecture with tunable hyperparameters:
     - **Convolutional Layers**:
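       - Number of filters varies in steps of 32: from 32 to 256 for the first layer (`conv1_filters`) and from 64 to 256 for the additional layers (`conv2_filters`, etc.).
       - Adds 1 to 3 further convolutional layers after the first one, determined by the `n_layers` parameter.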
     - **Dropout Rate**:
       - Adjustable for each layer between 0.2 and 0.5.
     - **Dense Layer**:
       - Number of neurons (`dense_units`) ranges from 64 to 512 in steps of 64.
       - Includes dropout (`dense_dropout`) for regularization.

2. **Hyperparameter Tuning**:
   - `RandomSearch` is used to explore different combinations of hyperparameters.
   - Key settings:
     - **Objective**: Maximizing validation accuracy (`val_accuracy`).
     - **Trials**: Up to 5 combinations of hyperparameters, with 2 executions per trial for consistency.
     - **Search Space**: Configured through `build_model`.

3. **Tuning Execution**:
   - Conducts training on `X_train` with a validation split of 20%.
   - Searches for the optimal combination of hyperparameters.

4. **Best Model Selection and Training**:
   - Extracts the best-performing model (`best_model`) based on validation accuracy.
   - Continues training the selected model for a further 50 epochs to fully leverage the best hyperparameter configuration.

## Output:
The optimal CNN model is identified and trained with the best hyperparameters for genre classification. This approach ensures a well-optimized architecture for high classification accuracy.


In [None]:
# Score the tuned model on the test split; evaluate returns the loss followed by the accuracy
evaluation = best_model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy:.2f}")

# Persist the model so it can be reloaded without retraining
model_path = '/kaggle/working/best_audio_genre_model.h5'
best_model.save(model_path)
print("Model saved as 'best_audio_genre_model.h5'")


In [None]:
import librosa

# Function for predicting new samples
def predict_genre(audio_path):
    """Predict the genre of one audio file with the tuned CNN.

    Relies on names defined in other cells: ``extract_features``, the fitted
    ``scaler`` and the trained ``best_model``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        Tuple ``(genre_name, scores)`` where ``scores`` is the model's softmax
        output for the file, one value per genre in the order of ``genre_labels``.
    """
    target_sr = 16000
    target_len = target_sr * 30  # 30 seconds, the clip length used for training

    # Preprocess new audio file
    y, sr = librosa.load(audio_path, sr=target_sr)
    # Trim long recordings and zero-pad short ones. Padding alone left full-length
    # songs untrimmed, so their features were computed over the whole track
    # instead of the 30-second window the model was trained on.
    y = y[:target_len]
    y = np.pad(y, (0, target_len - len(y)), mode='constant')
    y = librosa.util.normalize(y)

    # Extract features and scale
    feature_vector = extract_features(y, sr)  # Reuse `extract_features` function from before
    feature_vector = scaler.transform([feature_vector])
    feature_vector = np.expand_dims(feature_vector, axis=2)  # Add the channel axis expected by Conv1D

    # Predict
    prediction = best_model.predict(feature_vector)
    genre_idx = np.argmax(prediction)
    # Alphabetical order, matching the category codes used to encode the training labels
    genre_labels = ['bollypop', 'carnatic', 'ghazal', 'semiclassical', 'sufi']

    return genre_labels[genre_idx], prediction[0]

# Example usage: classify one track from the testdata folder and print the
# predicted genre together with the per-genre scores
predicted_genre, prediction_scores = predict_genre('/kaggle/input/testdata/Dunki_ O Maahi.mp3')
print(f"Predicted Genre: {predicted_genre}, Scores: {prediction_scores}")


# Model Evaluation and Genre Prediction

This block evaluates the trained model on test data and demonstrates its usage for predicting the genre of new audio samples.

## Key Steps:

### 1. Evaluate the Model
- **Test Set Evaluation**:
  - The best model obtained from hyperparameter tuning is evaluated on the test set.
  - Prints the test accuracy, providing an objective measure of the model's performance on unseen data.

### 2. Save the Model
- The trained model is saved as `best_audio_genre_model.h5` for future predictions or deployment.

### 3. Predict New Samples
- **Genre Prediction Function**:
  - Preprocesses the input audio:
    - Resamples it to 16 kHz.
    - Pads or trims it to a fixed length of 30 seconds.
    - Normalizes the audio signal.
  - Extracts features using the previously defined `extract_features` function.
  - Scales the features using the trained `StandardScaler`.
  - Makes predictions using the trained model (`best_model`).
  - Returns the predicted genre and confidence scores for all classes.

### 4. Example Prediction
- Uses a sample audio file (`Dunki_ O Maahi.mp3`) to test the prediction pipeline.
- Outputs the predicted genre and associated confidence scores.

## Output:
- **Model Performance**: Displays the test accuracy on unseen data.
- **Saved Model**: The trained model is stored as `best_audio_genre_model.h5`.
- **Prediction Example**: Demonstrates how to predict the genre of a new audio file, showcasing the model's real-world application capabilities.


In [None]:
# Classify a second track from the testdata folder and print the genre with its per-genre scores
predicted_genre, prediction_scores = predict_genre('/kaggle/input/testdata/Kahan se aaye badra.mp3')
print(f"Predicted Genre: {predicted_genre}, Scores: {prediction_scores}")

In [None]:
# Classify a third track from the testdata folder and print the genre with its per-genre scores
predicted_genre, prediction_scores = predict_genre('/kaggle/input/testdata/kun faya kun.mp3')
print(f"Predicted Genre: {predicted_genre}, Scores: {prediction_scores}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Define genre labels (index i is the name of class i)
genre_labels = ['bollypop', 'carnatic', 'ghazal', 'semiclassical', 'sufi']
# Pin the full label set so the matrix and the report always have one row per
# genre, even if a class happens to be missing from the test split
class_ids = list(range(len(genre_labels)))

# Predict once and reuse the result for both the confusion matrix and the report.
# (This section used to appear twice, repeating the prediction and both outputs.)
y_pred = best_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Most probable class per sample
y_true = np.argmax(y_test, axis=1)          # Undo the one-hot encoding

# Confusion matrix: rows are true genres, columns are predicted genres
cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=genre_labels, yticklabels=genre_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification report
print("Classification Report:")
print(classification_report(y_true, y_pred_classes, labels=class_ids, target_names=genre_labels))


# Plot training & validation accuracy and loss.
# `history` is the object returned by the most recently executed fit() call.
plt.figure(figsize=(12, 4))

# Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')
plt.legend()

# Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Model Loss')
plt.legend()

plt.tight_layout()
plt.show()


# Model Evaluation Metrics and Performance Visualization

This block provides a detailed evaluation of the trained model's performance, using a confusion matrix and classification metrics, and visualizes the training process.

## Key Steps:

### 1. Confusion Matrix
- **Purpose**: Shows the relationship between true labels and predictions, providing insight into how well the model distinguishes between genres.
- **Implementation**:
  - Computes the confusion matrix using `confusion_matrix` from `sklearn`.
  - Visualizes it as a heatmap with `seaborn`.
  - Includes genre labels on the axes for easy interpretation.

### 2. Classification Report
- **Purpose**: Summarizes precision, recall, F1-score, and support for each genre, providing a comprehensive evaluation of model performance.
- **Implementation**:
  - Generates the classification report with `classification_report` from `sklearn`.
  - Targets are labeled with genre names for clarity.

### 3. Training and Validation Metrics Visualization
- **Accuracy**:
  - Plots training and validation accuracy across epochs to monitor how well the model learns over time.
- **Loss**:
  - Plots training and validation loss to assess the model's convergence and detect potential overfitting.

### Output:
1. **Confusion Matrix**:
   - A heatmap showing prediction accuracy for each genre.
   - Off-diagonal cells show the misclassified samples for further analysis.
2. **Classification Report**:
   - Metrics for precision, recall, and F1-score for each genre, along with overall accuracy.
3. **Training Curves**:
   - Accuracy and loss curves to visualize model performance during training and validation phases.

These visualizations and metrics offer a clear picture of the model's strengths and areas for improvement, enabling deeper insights into its performance.
