In [52]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras import layers, models

# Function for extracting features from audio files
def extract_features(audio_file, n_frames=1293):
    """Extract MFCC, spectral-centroid and zero-crossing-rate features from one audio file.

    Parameters
    ----------
    audio_file : str or path-like
        Path of the audio file. It is loaded at its native sampling rate
        (``sr=None``), so the number of frames depends on the file itself.
    n_frames : int, optional
        Width (time axis) the MFCC matrix is padded or truncated to.
        Defaults to 1293, the frame count of almost every clip in the
        training dataset.

    Returns
    -------
    dict or None
        A dict with keys ``'mfccs'`` (13 coefficients x ``n_frames``),
        ``'spectral_centroid'`` and ``'zero_crossing_rate'``. Only the MFCC
        matrix is length-normalized; the other two keep their natural length.
        ``None`` is returned if loading or feature extraction fails, and the
        reason is printed so that skipped files can be diagnosed.
    """
    try:
        y, sr = librosa.load(audio_file, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        # Clips are not all exactly the same length: one file produced an MFCC
        # matrix of shape (13, 1299) while the rest produced (13, 1293). Force
        # a common width so every sample flattens to the same feature size.
        mfccs = librosa.util.fix_length(mfccs, size=n_frames, axis=1)

        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    except Exception as e:
        # Best-effort extraction: callers treat None as "skip this file", but
        # the underlying cause is reported instead of being silently dropped.
        print(f"Feature extraction failed for {audio_file}: {type(e).__name__}: {e}")
        return None

    return {
        'mfccs': mfccs,
        'spectral_centroid': spectral_centroid,
        'zero_crossing_rate': zero_crossing_rate
    }

# Integer class id for each genre folder name; ids follow the order listed here.
genre_to_label = {
    genre: index
    for index, genre in enumerate(
        ('blues', 'classical', 'country', 'disco', 'hiphop',
         'jazz', 'metal', 'pop', 'reggae', 'rock')
    )
}
# Function to walk the dataset folders and extract features from every WAV file
def load_preprocessed_data(dataset_path='your_dataset_path'):
    """Extract features for every ``.wav`` file found under per-genre sub-folders.

    Parameters
    ----------
    dataset_path : str
        Directory whose sub-directories are named after genres. Only
        sub-directories whose name is a key of ``genre_to_label`` are read.

    Returns
    -------
    list of dict
        One ``{'features': ..., 'label': ...}`` entry per successfully
        processed file, where ``features`` is the value returned by
        ``extract_features`` and ``label`` is the integer genre id. Files for
        which extraction fails are skipped with a printed message.
    """
    extracted_features = []

    # os.listdir returns entries in arbitrary order. Sorting makes the sample
    # order, and therefore any seeded train/test split, reproducible.
    for genre_label in sorted(os.listdir(dataset_path)):
        genre_path = os.path.join(dataset_path, genre_label)

        if not os.path.isdir(genre_path):
            continue

        # Ignore folders that do not correspond to a known genre.
        if genre_label not in genre_to_label:
            continue
        label = genre_to_label[genre_label]

        for audio_file in sorted(os.listdir(genre_path)):
            if not audio_file.endswith(".wav"):
                continue

            audio_file_path = os.path.join(genre_path, audio_file)
            features = extract_features(audio_file_path)

            # extract_features signals failure by returning None.
            if features is not None:
                extracted_features.append({'features': features, 'label': label})
            else:
                print(f"Skipping {audio_file_path} due to feature extraction error.")

    return extracted_features

# Function to prepare data for machine learning models
def prepare_data(extracted_features):
    """Turn the list of extracted entries into a feature matrix and a label vector.

    Each entry's ``['features']['mfccs']`` array is flattened to one row of
    ``X``; each entry's ``['label']`` becomes the matching element of ``y``.
    The entry count and both resulting shapes are printed.

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays.
    """
    print(f"Total Number of Entries: {len(extracted_features)}")

    rows = []
    labels = []
    for sample in extracted_features:
        # Flatten the 2-D MFCC matrix into a single feature vector.
        rows.append(sample['features']['mfccs'].ravel())
        labels.append(sample['label'])

    X = np.array(rows)
    y = np.array(labels)

    # Report the shapes of the stacked arrays.
    print(f"X Shape after Concatenation: {X.shape}")
    print(f"y Shape after Concatenation: {y.shape}")
    return X, y

# Function to train Support Vector Machine (SVM) model
def train_svm(X_train, y_train, X_test, y_test):
    """Fit a default ``SVC`` on standardized features and return its test accuracy.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    both splits. Accuracy is the fraction of test predictions equal to
    ``y_test``.
    """
    standardizer = StandardScaler().fit(X_train)
    train_matrix = standardizer.transform(X_train)
    test_matrix = standardizer.transform(X_test)

    classifier = SVC().fit(train_matrix, y_train)

    is_correct = classifier.predict(test_matrix) == y_test
    return np.mean(is_correct)

# Function to train Random Forest model
def train_random_forest(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``RandomForestClassifier`` on standardized features and return its test accuracy.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels used to compute the accuracy.
    random_state : int or None, optional
        Seed passed to the forest so the reported accuracy is repeatable
        between runs. Pass ``None`` for an unseeded forest.

    Returns
    -------
    float
        Fraction of test predictions equal to ``y_test``.
    """
    # The scaler is fitted on the training split only and reused for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Without a seed, bootstrap sampling and feature selection differ on each
    # run, so the accuracy could not be reproduced.
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(X_train_scaled, y_train)

    rf_predictions = rf_model.predict(X_test_scaled)
    rf_accuracy = np.mean(rf_predictions == y_test)

    return rf_accuracy

# Function to train Convolutional Neural Network (CNN) model
def train_cnn(X_train_cnn, y_train, X_test_cnn, y_test, num_classes,
              model_path='/Users/sunnytamang/Documents/Research_Papers/Music_Genre_Classification/models/my_audio_cnn_model.h5',
              epochs=10, batch_size=32):
    """Train a 1-D convolutional network, save it to disk and return its test accuracy.

    Parameters
    ----------
    X_train_cnn, X_test_cnn : array
        Inputs whose second dimension is the sequence length; the network is
        built with ``input_shape=(X_train_cnn.shape[1], 1)``.
    y_train, y_test : array
        Integer class labels (the loss is sparse categorical cross-entropy).
    num_classes : int
        Number of units in the softmax output layer.
    model_path : str, optional
        Where the trained model is saved. Defaults to the original location.
    epochs, batch_size : int, optional
        Passed to ``fit``; defaults match the original values.

    Returns
    -------
    float
        Accuracy reported by ``evaluate`` on ``(X_test_cnn, y_test)``.

    Notes
    -----
    The test split is also passed to ``fit`` as ``validation_data``, so the
    per-epoch validation metrics are computed on the same data as the final
    evaluation.
    """
    # Make sure the destination exists before training, so a missing folder
    # does not throw away a finished training run at save time.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    cnn_model = models.Sequential()
    cnn_model.add(layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
    cnn_model.add(layers.MaxPooling1D(pool_size=2))
    cnn_model.add(layers.Conv1D(64, kernel_size=3, activation='relu'))
    cnn_model.add(layers.MaxPooling1D(pool_size=2))
    cnn_model.add(layers.Conv1D(64, kernel_size=3, activation='relu'))
    cnn_model.add(layers.MaxPooling1D(pool_size=2))
    cnn_model.add(layers.Conv1D(128, kernel_size=3, activation='relu'))
    cnn_model.add(layers.MaxPooling1D(pool_size=2))
    cnn_model.add(layers.Conv1D(256, kernel_size=3, activation='relu'))
    cnn_model.add(layers.MaxPooling1D(pool_size=2))
    cnn_model.add(layers.Flatten())
    cnn_model.add(layers.Dense(256, activation='relu'))
    cnn_model.add(layers.Dense(num_classes, activation='softmax'))
    cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    cnn_model.fit(X_train_cnn, y_train, epochs=epochs, batch_size=batch_size,
                  validation_data=(X_test_cnn, y_test))

    _cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test)

    # Persist the trained network so later cells can reload it for inference.
    cnn_model.save(model_path)

    return cnn_accuracy

# Path to the dataset root: one sub-folder per genre, each containing WAV files
dataset_path = '/Users/sunnytamang/Documents/Research_Papers/Music_Genre_Classification/data_dir/genres_original'

# Walk the dataset and extract features for every file
extracted_features = load_preprocessed_data(dataset_path)

# Prepare data for machine learning models
X, y = prepare_data(extracted_features)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Support Vector Machine (SVM) model
svm_accuracy = train_svm(X_train, y_train, X_test, y_test)
print(f"SVM Accuracy: {svm_accuracy}")

# Train Random Forest model
rf_accuracy = train_random_forest(X_train, y_train, X_test, y_test)
print(f"Random Forest Accuracy: {rf_accuracy}")

# Reshape data for Convolutional Neural Network (CNN): add a trailing channel axis
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Number of output classes. Taken from the label mapping rather than from the
# labels present in y: if a genre folder were missing, counting unique labels
# would give too few output units and the largest label ids would fall outside
# the softmax range.
num_classes = len(genre_to_label)

# Train Convolutional Neural Network (CNN) model
cnn_accuracy = train_cnn(X_train_cnn, y_train, X_test_cnn, y_test, num_classes)
print(f"CNN Accuracy: {cnn_accuracy}")


  y, sr = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipping /Users/sunnytamang/Documents/Research_Papers/Music_Genre_Classification/data_dir/genres_original/jazz/jazz.00054.wav due to feature extraction error.
Total Number of Entries: 999
X Shape after Concatenation: (999, 16809)
y Shape after Concatenation: (999,)
SVM Accuracy: 0.575
Random Forest Accuracy: 0.5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


CNN Accuracy: 0.6299999952316284


In [55]:
from tensorflow.keras.models import load_model

# Restore the CNN from the .h5 file written by the training cell
loaded_model = load_model(
    '/Users/sunnytamang/Documents/Research_Papers/Music_Genre_Classification/models/my_audio_cnn_model.h5'
)

In [109]:
# Run the same feature extractor used for training on a clip from outside the dataset
test_audio_path = '/Users/sunnytamang/Downloads/Rock Party 30 Sec Intro Preview.wav'
test_features = extract_features(test_audio_path)

# extract_features returns None when the file cannot be loaded or processed.
# Stop here with a clear message instead of failing later when None is subscripted.
if test_features is None:
    raise RuntimeError(f"Could not extract features from {test_audio_path}")

In [110]:
# Flatten the MFCC matrix and give it the same layout the CNN was trained on:
# (samples, features, 1), i.e. one sample with a trailing channel axis.
test_features_reshaped = test_features['mfccs'].reshape(1, -1, 1)

# No standardization is applied here: the CNN was trained on the unscaled
# feature matrix. The StandardScaler instances are local to the SVM and
# Random Forest training functions and are not available in this scope.

In [111]:
# Score the prepared clip with the reloaded CNN and keep the best-scoring class index
predictions = loaded_model.predict(test_features_reshaped)
predicted_class = predictions.argmax()

print(f"Predicted Class: {predicted_class}")

Predicted Class: 9


In [112]:
# Build the reverse lookup (label id -> genre name) from genre_to_label
inverse_genre_mapping = dict(zip(genre_to_label.values(), genre_to_label.keys()))

# Predict the class index for the clip, then translate it to a genre name
predictions = loaded_model.predict(test_features_reshaped)
predicted_class = np.argmax(predictions)
if predicted_class in inverse_genre_mapping:
    predicted_genre = inverse_genre_mapping[predicted_class]
else:
    predicted_genre = 'Unknown'

print(f"Predicted Genre: {predicted_genre}")

Predicted Genre: rock
