<a href="https://colab.research.google.com/github/Rohit9403/Audio-Classification-Using-CNNs/blob/master/CNN_audio_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Dataset Preprocessing

In [0]:
!wget http://opihi.cs.uvic.ca/sound/genres.tar.gz

--2020-02-29 17:56:55--  http://opihi.cs.uvic.ca/sound/genres.tar.gz
Resolving opihi.cs.uvic.ca (opihi.cs.uvic.ca)... 142.104.68.135
Connecting to opihi.cs.uvic.ca (opihi.cs.uvic.ca)|142.104.68.135|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1225571541 (1.1G) [application/x-gzip]
Saving to: ‘genres.tar.gz’


2020-02-29 18:17:52 (953 KB/s) - ‘genres.tar.gz’ saved [1225571541/1225571541]



In [0]:
!ls

drive  genres.tar.gz  sample_data


In [0]:
!tar -xf genres.tar.gz

In [0]:
!ls

drive  genres  genres.tar.gz  sample_data


In [0]:
%cd genres

/content/genres


In [0]:
!ls

bextract_single.mf  classical  country	hi.mf	  ja.mf  metal	reggae	ro.mf
bl.mf		    cl.mf      di.mf	hiphop	  jazz	 po.mf	re.mf
blues		    co.mf      disco	input.mf  me.mf  pop	rock


#Function for extracting MFCC

In [0]:
import json
import os
import math
import librosa

DATASET_PATH = "/content/genres"
JSON_PATH = "data_10.json"
SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION


def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extracts MFCCs from music dataset and saves them into a json file along witgh genre labels.
        :param dataset_path (str): Path to dataset
        :param json_path (str): Path to json file used to save MFCCs
        :param num_mfcc (int): Number of coefficients to extract
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param: num_segments (int): Number of segments we want to divide sample tracks into
        :return:
        """

    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # loop through all genre sub-folder
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        # ensure we're processing a genre sub-folder level
        if dirpath is not dataset_path:

            # save genre label (i.e., sub-folder name) in the mapping
            semantic_label = dirpath.split("/")[-1]
            data["mapping"].append(semantic_label)
            print("\nProcessing: {}".format(semantic_label))

            # process all audio files in genre sub-dir
            for f in filenames:

		# load audio file
                file_path = os.path.join(dirpath, f)
                signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

                # process all segments of audio file
                for d in range(num_segments):

                    # calculate start and finish sample for current segment
                    start = samples_per_segment * d
                    finish = start + samples_per_segment

                    # extract mfcc
                    mfcc = librosa.feature.mfcc(signal[start:finish], sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
                    mfcc = mfcc.T

                    # store only mfcc feature with expected number of vectors
                    if len(mfcc) == num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(i-1)
                        print("{}, segment:{}".format(file_path, d+1))
    

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        
        
if __name__ == "__main__":
    save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)
    save_mfcc("/content/drive/My Drive/Colab Data/Two_Feet_-_Go_Fuck_Yourself (online-audio-converter.com).wav",
              "data_1.json")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/genres/blues/blues.00080.wav, segment:10
/content/genres/blues/blues.00022.wav, segment:1
/content/genres/blues/blues.00022.wav, segment:2
/content/genres/blues/blues.00022.wav, segment:3
/content/genres/blues/blues.00022.wav, segment:4
/content/genres/blues/blues.00022.wav, segment:5
/content/genres/blues/blues.00022.wav, segment:6
/content/genres/blues/blues.00022.wav, segment:7
/content/genres/blues/blues.00022.wav, segment:8
/content/genres/blues/blues.00022.wav, segment:9
/content/genres/blues/blues.00022.wav, segment:10
/content/genres/blues/blues.00007.wav, segment:1
/content/genres/blues/blues.00007.wav, segment:2
/content/genres/blues/blues.00007.wav, segment:3
/content/genres/blues/blues.00007.wav, segment:4
/content/genres/blues/blues.00007.wav, segment:5
/content/genres/blues/blues.00007.wav, segment:6
/content/genres/blues/blues.00007.wav, segment:7
/content/genres/blues/blues.00007.wav, segment:8
/c

#Implementing CNN

In [0]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import matplotlib.pyplot as plt

DATA_PATH = "/content/genres/data_10.json"


def load_data(data_path):
    """Loads training dataset from json file.
        :param data_path (str): Path to json file containing data
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
    """

    with open(data_path, "r") as fp:
        data = json.load(fp)

    X = np.array(data["mfcc"])
    y = np.array(data["labels"])
    return X, y





def prepare_datasets(test_size, validation_size):
    """Loads data and splits it into train, validation and test sets.
    :param test_size (float): Value in [0, 1] indicating percentage of data set to allocate to test split
    :param validation_size (float): Value in [0, 1] indicating percentage of train set to allocate to validation split
    :return X_train (ndarray): Input training set
    :return X_validation (ndarray): Input validation set
    :return X_test (ndarray): Input test set
    :return y_train (ndarray): Target training set
    :return y_validation (ndarray): Target validation set
    :return y_test (ndarray): Target test set
    """

    # load data
    X, y = load_data(DATA_PATH)

    # create train, validation and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)

    # add an axis to input sets
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


def build_model(input_shape):
    """Generates CNN model
    :param input_shape (tuple): Shape of input set
    :return model: CNN model
    """

    # build network topology
    model = keras.Sequential()

    # 1st conv layer
    model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # 2nd conv layer
    model.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # 3rd conv layer
    model.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
    model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # flatten output and feed it into dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dropout(0.3))

    # output layer
    model.add(keras.layers.Dense(10, activation='softmax'))

    return model

    
if __name__ == "__main__":

    # get train, validation, test splits
    X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.2)

    # create network
    input_shape = (X_train.shape[1], X_train.shape[2], 1)
    model = build_model(input_shape)

    # compile model
    optimiser = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()
    # train model
    history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=32, epochs=30)

    
    # evaluate model on test set
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
    print('\nTest accuracy:', test_acc)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 128, 11, 32)       320       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 64, 6, 32)         0         
_________________________________________________________________
batch_normalization_12 (Batc (None, 64, 6, 32)         128       
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 62, 4, 32)         9248      
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 31, 2, 32)         0         
_________________________________________________________________
batch_normalization_13 (Batc (None, 31, 2, 32)         128       
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 30, 1, 32)        

In [0]:
X_train.shape

(5997, 130, 13, 1)

#Predicting own DATASET

**Exatracting mfcc from input file**

In [0]:
num_segments=5
num_mfcc=13 
n_fft=2048 
hop_length=512
SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = 22050*30
samples_per_segment = int(SAMPLES_PER_TRACK /5)
num_mfcc_vectors_per_segment = 130
MFCC=[]


  # load audio fil
file_path ="/content/genres/blues/blues.00003.wav"
signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

# process all segments of audio file
for d in range(5):

  # calculate start and finish sample for current segment
  start = samples_per_segment * d
  finish = start + samples_per_segment
  start=int(start/2)
  finish=int(finish/2)
  # extract mfcc
  mfcc = librosa.feature.mfcc(signal[start:finish], sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
  mfcc = mfcc.T

**Function for the prediction**

In [0]:
def predict(model, X):
    """Predict a single sample using the trained model
    :param model: Trained classifier
    :param X: Input data
    :param y (int): Target
    """

    # add a dimension to input data for sample - model.predict() expects a 4d array in this case
    X=X[..., np.newaxis]
    X = X[np.newaxis, ...] # array shape (1, 130, 13, 1)

    # perform prediction
    prediction = model.predict(X)

    # get index with max value
    predicted_index = np.argmax(prediction, axis=1)

    return predicted_index 

if __name__ == "__main__":
  predict(model,mfcc)
print(predicted_index)

[8]
