## Audio Classification

### Problem Statement

We are given a set of audio files (the UrbanSound8K dataset), each containing a short sound clip that belongs to one of several sound categories. The goal is to predict the category of each file using deep learning.

In [None]:
# imports
import matplotlib.pyplot as plt
%matplotlib inline 
import librosa
import librosa.display
import IPython.display as ipd
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 
sr = 22050

In [None]:
# Path of one sample clip, used below for the waveform plot and the audio player.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
filename = r"C:\Users\TEJAS\Desktop\Projects-membership\Audio classification\UrbanSound8K\4201-3-0-0.wav"

In [None]:
# Plot the waveform of the sample clip and embed an audio player for it.
plt.figure(figsize=(14, 5))
# librosa.load returns (samples, sampling_rate). Dedicated names are used so the
# samples are not bound to `data`, which a later cell uses for the metadata table.
waveform, waveform_sr = librosa.load(filename)
# `sr` is keyword-only in current librosa releases; passing it positionally raises
# a TypeError. Use the rate reported by librosa.load so the time axis matches the
# signal that was actually loaded.
librosa.display.waveshow(waveform, sr=waveform_sr)
# Last expression of the cell: renders the playback widget.
ipd.Audio(filename)

In [None]:
# Read the UrbanSound8K metadata CSV into a DataFrame and preview the first rows.
# Later cells read the "fold", "slice_file_name" and "class" columns of this table.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
data = pd.read_csv(r'C:\Users\TEJAS\Desktop\Projects-membership\Audio classification\UrbanSound8K\metadata\UrbanSound8K.csv')
data.head()

In [None]:
# Show (number of rows, number of columns) of the metadata table.
data.shape

## Data Preprocessing

In [None]:
# Load the sample clip with librosa's defaults; librosa.load returns the sample
# array and the sampling rate it used.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
audio_file_path=r"C:\Users\TEJAS\Desktop\Projects-membership\Audio classification\UrbanSound8K\4201-3-0-0.wav"
librosa_audio_data,librosa_sample_rate=librosa.load(audio_file_path)

In [None]:
# Print the raw sample values returned by librosa.load for the sample clip.
print(librosa_audio_data)

In [None]:
### Plot the raw sample values of the clip against their sample index

plt.figure(figsize=(12, 4))
plt.plot(librosa_audio_data)

## Feature Extraction with MFCC


In [None]:
# Compute 40 MFCCs for the sample clip and print the shape of the result
# (presumably (n_mfcc, number_of_frames) -- confirm against the printed output).
mfccs = librosa.feature.mfcc(y=librosa_audio_data, sr=librosa_sample_rate, n_mfcc=40)
print(mfccs.shape)

In [None]:
# Display the MFCC matrix computed for the sample clip.
mfccs

In [None]:
# Root folder of the audio clips; the extraction loop appends "fold<N>/" and the
# file name to it. The path is relative, so it is resolved against the current
# working directory when os.path.abspath is applied later.
audio_dataset_path='UrbanSound8K/audio/'

In [None]:
# Feature extraction function
def features_extractor(file, n_mfcc=40):
    """Return a fixed-length MFCC feature vector for one audio file.

    The clip is loaded with librosa (fast 'kaiser_fast' resampling), its MFCC
    matrix is computed, and the coefficients are averaged over time so that
    clips of any duration yield a vector of the same length.

    Parameters
    ----------
    file : str or path-like
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute (default 40).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding the per-coefficient mean.
    """
    # Load the file passed in by the caller. The previous version read a global
    # named `file_name` here, ignoring the `file` argument entirely.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so axis 0 runs over frames, then average the frames away.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features

In [None]:

# Build one [feature_vector, class_label] record per row of the metadata table.
extracted_features = []
# `total` lets tqdm show a real progress bar; iterrows() alone has no length.
for index_num, row in tqdm(data.iterrows(), total=len(data)):
    # Clips are stored as <audio_dataset_path>/fold<N>/<slice_file_name>.
    file_name = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]) + '/', str(row["slice_file_name"]))
    final_class_labels = row["class"]
    # Keep the feature vector in its own name: assigning it to `data` would
    # replace the metadata DataFrame, so the cell could not be re-run and later
    # cells would no longer see the metadata.
    features = features_extractor(file_name)
    extracted_features.append([features, final_class_labels])

In [None]:
### Convert the list of [feature_vector, class_label] records into a DataFrame
### with columns 'feature' and 'class', then preview the first rows.
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

In [None]:
### Split the dataset into independent (X) and dependent (y) arrays
# X: the per-clip feature vectors converted to one array.
# y: the class label of each clip, taken from the 'class' column.
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [None]:
# Show the shape of the feature array X.
X.shape

In [None]:
# Inspect the class labels taken from the 'class' column.
y

In [None]:
# Label encoder: map each class label to an integer index, then one-hot encode
# the indices so the targets match the network's softmax output.
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
labelencoder = LabelEncoder()
class_indices = labelencoder.fit_transform(y)
y = to_categorical(class_indices)

In [None]:
# Display the label matrix produced by to_categorical.
y

#### Train - Test split

In [None]:
### Train Test Split
# Hold out 20% of the samples for testing; random_state=0 makes the split repeatable.
# The import repeats the one in the first code cell.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
# Display the training feature array.
X_train

In [None]:
# Display the full label matrix.
# NOTE(review): this shows `y`, not `y_train` -- confirm which one was intended.
y

In [None]:
# Shape of the training feature array.
X_train.shape

In [None]:
# Shape of the test feature array.
X_test.shape

In [None]:
# Shape of the training label array.
y_train.shape

In [None]:
# Shape of the test label array.
y_test.shape

## Model Creation

In [None]:
### Number of classes: the width of the label matrix y (its second dimension).
num_labels=y.shape[1]

In [None]:
# Creating model
# Feed-forward classifier: three hidden stages of Dense -> ReLU -> Dropout(0.5)
# with 100, 200 and 100 units, followed by a softmax output over num_labels classes.
model = Sequential()

### first layer (declares the 40-value input vector)
model.add(Dense(100, input_shape=(40,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

### second and third layers share the same Dense -> ReLU -> Dropout pattern
for hidden_units in (200, 100):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

### final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Model compile: categorical cross-entropy loss (the targets are produced by
# to_categorical), accuracy as the reported metric, Adam optimizer.
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [None]:
# Training configuration.
num_epochs = 100
num_batch_size = 32

# Write the model to disk during training; save_best_only=True keeps only the
# best result for the checkpoint's monitored quantity (left at its default here).
# NOTE(review): newer Keras releases may reject the '.hdf5' extension -- confirm
# against the installed version printed in the first cell.
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5', 
                               verbose=1, save_best_only=True)
# Wall-clock timing: `start` is taken just before fit and read right after it.
start = datetime.now()

# NOTE(review): validation_data is the test split, so the checkpoint is selected
# on the same data that is later reported as test accuracy -- consider a separate
# validation split.
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
# Evaluate on the test split. Despite its name, `test_accuracy` holds everything
# model.evaluate returns (the loss followed by the compiled metrics), so index 1
# is the accuracy.
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

In [None]:
# Display the feature vector of the second test sample (index 1).
X_test[1]

In [None]:
# Display the model's output for every test sample (one row per sample; the
# final layer is a softmax, so each row holds per-class scores).
model.predict(X_test)

### Testing on a Single Audio File

In [None]:
# Classify one audio file end to end: load -> MFCC -> time-average -> predict.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
filename=r"C:\Users\TEJAS\Desktop\Projects-membership\Audio classification\UrbanSound8K\audio\fold1\17592-5-0-0.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# Average over frames to get one fixed-length vector, as done for the training data.
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

print(mfccs_scaled_features)
# The model expects a batch dimension: reshape to (1, n_features).
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)
# model.predict returns one row of softmax scores per input row.
predicted_probabilities = model.predict(mfccs_scaled_features)
print(predicted_probabilities)
# LabelEncoder.inverse_transform needs integer class indices, not the score
# matrix, so take the index of the highest-scoring class first.
predicted_label = np.argmax(predicted_probabilities, axis=1)
print(predicted_label)
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class