<a href="https://colab.research.google.com/github/VaishnaviBhalodi/MusicGenreClassification/blob/main/MusicGenreClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np #linear algebra
import pandas as pd #data preprocessing
import os
import json
import json
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
#Mount Google Drive at /content/drive; SOURCE_PATH and JSON_PATH both live under this mount point
#NOTE: google.colab is a Colab-specific module, so this cell is presumably only runnable inside Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Root folder of the audio files, passed to preprocess_data as source_path
#(presumably one sub-folder per genre, e.g. .../genres_original/jazz/ - verify on Drive)
SOURCE_PATH = "/content/drive/MyDrive/GTZAN/Data/genres_original/"

In [4]:
#Path of the JSON file that preprocess_data writes and load_data reads;
#it holds the labels and the preprocessed MFCC data
JSON_PATH = "/content/drive/MyDrive/GTZAN/data.json"

In [5]:
#Sampling rate (samples per second)
sr = 22050

#Every track is cut to the same length, 29 seconds (just under 30 seconds),
#so that all files contribute the same number of samples
TOTAL_SAMPLES = sr * 29

#Data augmentation
#Only 999 files are available, which is a small dataset, so each track is
#split into NUM_SLICES pieces: X slices = X times more training samples
NUM_SLICES = 10
SAMPLES_PER_SLICE = TOTAL_SAMPLES // NUM_SLICES

In [6]:
def preprocess_data(source_path, json_path):
  """Extract 13-band MFCCs for every slice of every track and save them as JSON.

  Walks ``source_path``, loads each audio file at the module-level sampling
  rate ``sr``, cuts it into ``NUM_SLICES`` consecutive slices of
  ``SAMPLES_PER_SLICE`` samples and stores the transposed MFCC matrix of each
  slice together with an integer label.

  Labels are the visiting index of the sub-folder minus one. Sub-folders are
  visited in alphabetical order so the ids are reproducible between runs
  (assumes a flat layout of one folder per genre with no nested folders).

  Args:
    source_path: root folder that contains the audio sub-folders.
    json_path: destination file; receives a dict with the keys "labels"
      and "mfcc".

  Returns:
    None. The result is written to ``json_path``.
  """
  #Known corrupted wav file; matched by name so the check also works when
  #the dataset lives under a different source_path
  corrupted_file = 'jazz.00054.wav'

  #Dictionary of labels and preprocessed data
  mydict = {
      "labels" : [],
      "mfcc" : []
  }

  #Browsing through each file, slice it and generate the 13 band mfcc for each slice
  for i, (dirpath, dirnames, filenames) in enumerate(os.walk(source_path)):
    #os.walk visits entries in arbitrary order; sorting dirnames in place
    #forces an alphabetical descent, which keeps label ids stable
    dirnames.sort()

    #i == 0 is source_path itself; files there belong to no sub-folder and
    #would otherwise be labelled -1
    if i == 0:
      continue

    for file in sorted(filenames):
      if file == corrupted_file:
        continue

      #Load at the configured rate so the slice arithmetic below matches the signal
      song, sample_rate = librosa.load(os.path.join(dirpath, file),
                                       sr=sr, duration=TOTAL_SAMPLES / sr)
      for s in range(NUM_SLICES):
        start_sample = SAMPLES_PER_SLICE * s
        end_sample = start_sample + SAMPLES_PER_SLICE
        segment = song[start_sample:end_sample]

        #A track shorter than expected gives a truncated slice whose MFCC
        #has fewer frames; skipping it keeps every stored matrix the same shape
        if len(segment) < SAMPLES_PER_SLICE:
          continue

        mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=13)
        mfcc = mfcc.T
        mydict["labels"].append(i-1)
        mydict["mfcc"].append(mfcc.tolist())

  #Dictionary to JSON File (the with-block closes the file)
  with open(json_path, 'w') as f:
    json.dump(mydict, f)


In [7]:
def load_data(json_path):
  """Read the preprocessed dataset from a JSON file.

  Args:
    json_path: path of a JSON file holding a dict with the keys "mfcc"
      and "labels".

  Returns:
    A tuple ``(X, y)`` of numpy arrays built from the "mfcc" and "labels"
    entries respectively.
  """
  #The with-block closes the file, no explicit close() is needed
  with open(json_path, 'r') as f:
    data = json.load(f)

  #Loading data into numpy arrays for Tensorflow compatibility
  X = np.array(data["mfcc"])
  y = np.array(data["labels"])

  return X, y

In [8]:
def prepare_datasets(inputs, targets, split_size, random_state=None):
  """Split the data into training, validation and test sets.

  The validation set is ``split_size`` of the full data; the test set is
  ``split_size`` of what remains after the validation set is removed.

  Args:
    inputs: array of samples, indexed along the first axis.
    targets: array of labels aligned with ``inputs``.
    split_size: fraction handed to ``train_test_split`` as ``test_size``
      for both splits.
    random_state: optional seed forwarded to both splits so the partition
      is reproducible; ``None`` (default) keeps the previous unseeded behavior.

  Returns:
    ``(inputs_train, inputs_val, inputs_test,
       targets_train, targets_val, targets_test)``; every inputs array gains
    a trailing axis of size 1.
  """
  #NOTE(review): the split is per sample, so slices of one track may end up
  #in different sets - confirm whether that leakage is acceptable
  #Creating a validation set and test set
  inputs_train, inputs_val, targets_train, targets_val = train_test_split(
      inputs, targets, test_size=split_size, random_state=random_state)
  inputs_train, inputs_test, targets_train, targets_test = train_test_split(
      inputs_train, targets_train, test_size=split_size, random_state=random_state)

  #Conv2D layers expect a channel axis, so append one of size 1 to each sample
  inputs_train = inputs_train[..., np.newaxis]
  inputs_val = inputs_val[..., np.newaxis]
  inputs_test = inputs_test[..., np.newaxis]

  return inputs_train, inputs_val, inputs_test, targets_train, targets_val, targets_test

In [9]:
def design_model(input_shape, num_classes=None):
  """Build the CNN used for classification.

  The network stacks three blocks of Conv2D (32 filters) + MaxPooling2D +
  BatchNormalization, followed by Dropout(0.3), Flatten, a 64-unit dense
  layer and a softmax output layer.

  Args:
    input_shape: shape of one input sample, passed to the first Conv2D layer.
    num_classes: number of units in the softmax output. When ``None``
      (default) it is derived from the module-level ``targets`` array, as
      before; pass it explicitly to avoid depending on that global.

  Returns:
    An uncompiled ``tf.keras`` Sequential model.
  """
  if num_classes is None:
    #Backward-compatible fallback: relies on `targets` existing at module level
    num_classes = len(np.unique(targets))

  #Model Architecture
  #Only the first layer of a Sequential model needs input_shape
  model = tf.keras.models.Sequential([

      tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
      tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
      tf.keras.layers.BatchNormalization(),

      tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
      tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
      tf.keras.layers.BatchNormalization(),

      tf.keras.layers.Conv2D(32, (2,2), activation='relu'),
      tf.keras.layers.MaxPooling2D((3,3), strides=(2,2), padding='same'),
      tf.keras.layers.BatchNormalization(),

      tf.keras.layers.Dropout(0.3),

      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(num_classes, activation='softmax')
  ])

  return model

In [10]:
def make_prediction(model, X, y, idx):
  """Print the predicted and the true genre of one sample.

  Args:
    model: object whose ``predict`` method returns one score vector per
      input sample.
    X: indexable collection of input samples.
    y: indexable collection of integer labels aligned with ``X``.
    idx: position of the sample to test.

  Returns:
    None. The result is printed.
  """
  #NOTE(review): this mapping assumes label ids follow the alphabetical
  #order of the genre names - verify against the preprocessing step
  genre_dict = {
      0: "blues",
      1: "classical",
      2: "country",
      3: "disco",
      4: "hiphop",
      5: "jazz",
      6: "metal",
      7: "pop",
      8: "reggae",
      9: "rock",
  }

  #Only one sample is reported, so run inference on a batch of one instead
  #of predicting the whole set
  sample = np.expand_dims(X[idx], axis=0)
  prediction = model.predict(sample)[0]
  genre = np.argmax(prediction)

  print("\n Now testing the model for one audio file\n\nThe model predicts: {}, and ground truth is: {}.\n".format(
      genre_dict[genre], genre_dict[y[idx]]))

In [11]:
def plot_performance(hist):
  """Plot the training and validation curves of a training run.

  Draws two figures: accuracy per epoch first, then loss per epoch. Each
  shows the training curve in red and the validation curve in blue.

  Args:
    hist: object whose ``history`` mapping has the keys 'acc', 'val_acc',
      'loss' and 'val_loss', each holding one value per epoch.
  """
  record = hist.history
  train_acc, valid_acc = record['acc'], record['val_acc']
  train_loss, valid_loss = record['loss'], record['val_loss']

  epoch_axis = range(len(train_acc))

  #One entry per figure: curves, legend labels and title
  panels = (
      (train_acc, valid_acc,
       'Training Accuracy', 'Validation Accuracy',
       'Training and Validation Accuracy'),
      (train_loss, valid_loss,
       'Training Loss', 'Validation Loss',
       'Training and Validation Loss'),
  )

  for position, (train_curve, valid_curve, train_label, valid_label, heading) in enumerate(panels):
    #The first panel uses the current figure; later panels open a new one
    if position > 0:
      plt.figure()
    plt.plot(epoch_axis, train_curve, 'r', label=train_label)
    plt.plot(epoch_axis, valid_curve, 'b', label=valid_label)
    plt.title(heading)
    plt.legend()

  plt.show()

In [12]:
if __name__ == "__main__":
  #Extract the MFCC features from the audio files, then load them back
  preprocess_data(source_path = SOURCE_PATH, json_path = JSON_PATH)
  #`targets` is module-level here; design_model reads it to size its output layer
  inputs, targets = load_data(json_path=JSON_PATH)

  Xtrain, Xval, Xtest, ytrain, yval, ytest = prepare_datasets(inputs, targets, 0.2)

  #Shape of one sample: the two feature axes plus a single channel
  input_shape = (Xtrain.shape[1], Xtrain.shape[2], 1)
  model = design_model(input_shape)

  #Selection of the optimizer, loss type and metric for performance evaluation
  #`learning_rate` is the supported argument name; the old `lr` alias is
  #deprecated and rejected by recent Keras releases
  model.compile(optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics = ['acc'])
  model.summary()

  #Training the model
  history = model.fit(Xtrain, ytrain, validation_data=(Xval, yval),
                      epochs=30,
                      batch_size=32)

  plot_performance(history)

  #Testing the model on never seen before data
  make_prediction(model, Xtest, ytest, 24)

FileNotFoundError: ignored