# Simple Neural Network Music Genre Classification:

This notebook is dedicated to improving the test accuracy of the network presented in:
https://www.kaggle.com/code/aasimahmed04/music-genre-classifier

In [1]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
import tensorflow.keras as keras
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import librosa, IPython
import librosa.display as lplt

from sklearn.metrics import accuracy_score
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from keras import regularizers

import IPython.display as ipd
from IPython.display import Audio, clear_output




The labels are based on the following mapping:

In [2]:
# Genre name -> integer class label. The genres are listed alphabetically, so
# each genre's label is simply its position in the list.
map_ = {
    genre: label
    for label, genre in enumerate(
        (
            "blues",
            "classical",
            "country",
            "disco",
            "hiphop",
            "jazz",
            "metal",
            "pop",
            "reggae",
            "rock",
        )
    )
}

### Loading the architecture:

In [3]:
# Fetch the pretrained VGGish model from TensorFlow Hub; it is used below as
# a fixed feature extractor (called directly on a waveform).
vggish = hub.load(handle="https://tfhub.dev/google/vggish/1")













##### Audio sample length (in seconds)

In [4]:
# Length of each audio sample, in seconds.
duration = 30 
# Number of seconds of audio actually fed to VGGish: extract_vggish_features
# trims/pads every waveform to N * 16000 samples at a 16 kHz sample rate.
# NOTE(review): the original note here read "(N-1 frames)"; the reason for
# dropping one second is not stated in this notebook -- confirm.
N = duration - 1

### Functions to load the audio file and generate the feature space:

In [5]:
# Function to extract VGGish features from audio files
def extract_vggish_features(file_path, n_segments=10):
    """Turn one audio file into `n_segments` averaged VGGish embeddings.

    The file is loaded at 16 kHz, trimmed or zero-padded to exactly `N`
    seconds (`N` is the notebook-level constant), and passed through the
    VGGish model. The resulting embedding frames are split into `n_segments`
    consecutive, equally sized groups and each group is averaged.

    With the notebook's settings (N = 29) VGGish returns 30 frames of 128
    values, so the default `n_segments=10` averages 3 frames per segment,
    exactly as the original hard-coded implementation did.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by `librosa.load`.
    n_segments : int, optional
        Number of feature vectors to produce per file (default 10). Any code
        that repeats the per-file label must use the same count.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_segments, embedding_size).

    Raises
    ------
    ValueError
        If VGGish returns fewer frames than `n_segments`.
    """
    sample_rate = 16000  # VGGish expects 16 kHz audio
    target_len = N * sample_rate

    waveform, _ = librosa.load(file_path, sr=sample_rate)
    # Force every clip to exactly N seconds: cut longer clips, zero-pad shorter ones.
    waveform = waveform[:target_len]
    if len(waveform) < target_len:
        waveform = np.pad(waveform, (0, target_len - len(waveform)))

    # The model is called on the 1-D waveform directly (no batch dimension);
    # it returns one embedding row per frame.
    embeddings = vggish(waveform).numpy()
    print("features shape = ", embeddings.shape)

    frames_per_segment = embeddings.shape[0] // n_segments
    if frames_per_segment == 0:
        raise ValueError(
            f"VGGish returned {embeddings.shape[0]} frames, "
            f"cannot split into {n_segments} segments"
        )

    # Drop any leftover frames that do not fill a whole segment, then average
    # the frames inside each segment.
    usable = embeddings[: n_segments * frames_per_segment]
    return usable.reshape(n_segments, frames_per_segment, -1).mean(axis=1)

# Load and prepare dataset
def load_dataset(data_dir):
    """Extract VGGish features for every audio file under `data_dir`.

    `data_dir` must contain one sub-directory per genre, each holding that
    genre's audio files. Genre folders are processed in sorted (alphabetical)
    order and labelled 0, 1, 2, ... in that order, so labels are reproducible
    across machines; for the ten genre names used in this notebook that
    ordering coincides with the `map_` dictionary. Entries of `data_dir`
    that are not directories are ignored.

    Parameters
    ----------
    data_dir : str
        Root folder of the dataset.

    Returns
    -------
    features : numpy.ndarray
        One entry per audio file, each being the output of
        `extract_vggish_features` for that file.
    labels : numpy.ndarray
        Integer genre label per audio file, aligned with `features`.
    """
    # os.listdir() returns entries in arbitrary order, so sort to get stable
    # label indices, and skip stray files that are not genre folders.
    genres = sorted(
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    # Collect every (path, label) pair first so progress can be reported
    # against the real number of files.
    labelled_files = []
    for label, genre in enumerate(genres):
        genre_dir = os.path.join(data_dir, genre).replace("\\", "/")
        for file in sorted(os.listdir(genre_dir)):
            file_path = os.path.join(genre_dir, file).replace("\\", "/")
            labelled_files.append((file_path, label))

    total = len(labelled_files)
    features = []
    labels = []
    for done, (file_path, label) in enumerate(labelled_files):
        clear_output(wait=True)
        print("Processing: ", round(100 * done / total, 1), "%")
        features.append(extract_vggish_features(file_path))
        labels.append(label)

    features = np.array(features)
    labels = np.array(labels)

    return features, labels

### Path to the audio dataset

In [6]:
# Download the dataset to your PC, then either set the GENRES_DATA_DIR
# environment variable to the folder that contains the genre sub-folders,
# or edit the fallback path below.
data_dir = os.environ.get(
    "GENRES_DATA_DIR",
    r"C:/Users/420/Desktop/NeuralNetworksFinalProject/genres_original",
)

### Load the dataset

In [7]:
# Run every audio file through VGGish (slow). X receives one block of feature
# vectors per file and y the matching integer genre label per file.
X, y = load_dataset(data_dir)

Processing:  99.8 %
features shape =  (30, 128)


In [None]:
# Flatten the per-file blocks into one row per segment:
# (n_files, n_segments, n_features) -> (n_files * n_segments, n_features).
# reshape is used instead of np.concatenate so that re-running this cell is a
# no-op on an already flattened X, rather than collapsing it to 1-D.
X = X.reshape(-1, X.shape[-1])

In [24]:
# Start with an empty container for the expanded (one-per-segment) labels.
y_ = []

In [25]:
# Each file contributes 10 feature rows, so repeat every file-level label 10
# times, keeping the labels in file order. np.repeat builds the array in one
# step from y alone, so this cell can be re-run safely.
y_ = np.repeat(y, 10)

[0 0 0 ... 9 9 9]


In [32]:
# Sanity check: the feature matrix and the label vector should have the same
# number of rows.
print(X.shape, y_.shape, sep="\n")

(9990, 128)
(9990,)


### Save the VGGish features and labels to a .csv file:

Saving the extracted features means the audio does not have to be passed through the VGGish model on every run
(the extraction is very slow).

In [31]:
# Destination file for the cached features and labels
csv_file_path = 'vggish_features_labels_3_sec_extended.csv'

# One column per feature value, plus a single 'label' column
df_features = pd.DataFrame(X)
df_labels = pd.DataFrame({'label': y_})

# Place the label column to the right of the features and write everything
# out without the row index
df = pd.concat((df_features, df_labels), axis="columns")
df.to_csv(csv_file_path, index=False)

print(f"Dataset saved to {csv_file_path}")

Dataset saved to vggish_features_labels_3_sec_extended.csv
