Loading Dataset

In [None]:
# Colab-only setup: mount Google Drive so the dataset archive stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Shell escape ("!"): extract the archive from Drive into /content/Dataset/.
!unrar x "/content/drive/MyDrive/Commands_Dataset.rar" "/content/Dataset/"

Importing Necessary Libraries

In [None]:
# Installs librosa straight from its GitHub repository; no version is pinned.
# NOTE(review): consider pinning a released version so re-runs are reproducible.
!pip install git+https://github.com/librosa/librosa

import librosa
import IPython.display as ipd
import numpy as np
import glob
import soundfile as sf
import pickle
from matplotlib import pyplot as plt
from librosa.feature import rms, mfcc, delta
from os import listdir
from os.path import isfile, join
from scipy.cluster.vq import kmeans, whiten

Definition of Functions for performing
1.   End Pointing
2.   Feature Extraction



In [3]:
def endPointing(x, Fs, thresh, frame_length, hop_length):
  """Trim leading and trailing low-energy audio from a signal.

  The short-time RMS energy is normalised by its peak and squared; the first
  and last frames whose value exceeds `thresh` mark the kept region, which is
  widened by half a frame on each side and clipped to the signal bounds.

  Args:
    x: 1-D numpy array of audio samples.
    Fs: Sampling rate. Accepted for interface compatibility; not used here.
    thresh: Threshold on the squared, peak-normalised energy (0..1).
    frame_length: Analysis frame length in samples.
    hop_length: Hop between successive frames in samples.

  Returns:
    The slice of `x` between the detected end points. An empty slice is
    returned when `x` is empty, when the signal has no energy at all, or when
    no frame exceeds `thresh`.
  """
  # Nothing to trim, and framing an empty signal is not meaningful.
  if x.size == 0:
    return x

  STenergy = rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
  peak = np.max(STenergy)
  # A silent clip has a zero peak; dividing by it would fill the energy
  # contour with NaNs. `not peak > 0` also catches a NaN peak.
  if not peak > 0:
    return x[:0]

  STenergy = (STenergy/peak)**2

  # Indices of every frame above the threshold; first/last are the end points.
  active = np.flatnonzero(STenergy > thresh)
  if active.size == 0:
    # No frame qualifies, so there is no speech region to return.
    return x[:0]

  start = max(0, active[0]*hop_length - int(frame_length/2))
  end = min(x.size, active[-1]*hop_length + int(frame_length/2))

  return x[start:end]

def featureExtraction(x, Fs, frame_length, hop_length):
  """Compute MFCC + delta + delta-delta features for an audio signal.

  Args:
    x: 1-D array of audio samples to analyse.
    Fs: Sampling rate of `x`.
    frame_length: Analysis window length in samples (passed as `win_length`).
    hop_length: Hop between successive frames in samples.

  Returns:
    2-D array with one column per frame. The rows are the first 13 MFCCs,
    followed by their first-order deltas, followed by their second-order
    deltas (39 rows when at least 13 MFCCs are produced).
  """
  # Analyse the signal that was passed in. The previous code read a global
  # named `audio`, which only worked when the caller used that exact name.
  MFCC = mfcc(y=x, sr=Fs, hop_length=hop_length, win_length=frame_length)
  # Keep only the first 13 cepstral coefficients.
  MFCC = MFCC[:13,:]
  MFCC_delta = delta(MFCC, mode='mirror')
  MFCC_delta2 = delta(MFCC, order=2, mode='mirror')
  # Stack along the coefficient axis so every column is one frame's vector.
  feature_vector = np.concatenate((MFCC,MFCC_delta,MFCC_delta2))

  return feature_vector

Generating CODEBOOK for every word

In [4]:
classes = ["down","go","left","no","off","on","right","stop","up","yes"]
# CB[i] pools the per-frame feature vectors of every training clip of classes[i].
CB = [[] for _ in range(10)]

for i, word in enumerate(classes):
  paths = glob.glob("/content/Dataset/Commands Dataset/train/"+word+"/*.wav")
  for audio_path in paths:
    # sr=None keeps the file's native sampling rate.
    x, Fs = librosa.load(audio_path, sr=None)
    # 0.03 s analysis window with a hop of half a window, both in samples.
    window_dur = 0.03
    frame_length = int(Fs*window_dur)
    hop_length = int(Fs*window_dur/2)
    thresh = 0.01

    audio = endPointing(x, Fs, thresh, frame_length, hop_length)
    dur = audio.shape[0]/Fs
    # Clips shorter than 0.3 s after trimming contribute nothing.
    if(dur < 0.3):
      continue

    feature_vector = featureExtraction(audio, Fs, frame_length, hop_length)
    # Rows of the transpose are the columns (frames) of feature_vector.
    CB[i].extend(feature_vector.T)

  print(word+": ",len(CB[i]))

down:  86598
go:  66492
left:  64687
no:  78917
off:  42450
on:  78762
right:  69348
stop:  64751
up:  37352
yes:  75912


Storing the codebook \\
Note that generation of codebooks takes quite some time. \\
Hence we store it once and, instead of regenerating it every time, simply load it.

In [None]:
# Persist the per-word feature pools so the slow extraction step can be skipped later.
# The file holds binary pickle data despite the ".txt" extension.
with open("CodeBook.txt", "wb") as codebook_file:
  pickle.dump(CB, codebook_file)

Loading the Codebook

In [None]:
# Restore the per-word feature pools written by an earlier run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("CodeBook.txt", "rb") as codebook_file:
  CB = pickle.load(codebook_file)

Definition of Functions for 

1.   Vector Quantization
2.   Calculating Minimum Distortion w.r.t a codebook



In [5]:
def VQ_CB(CB, k, seed=None):
  """Reduce a pool of feature vectors to K-means representative vectors.

  Args:
    CB: Sequence of equal-length feature vectors (one observation per entry).
    k: Number of centroids requested.
    seed: Optional seed forwarded to scipy's `kmeans` so the clustering is
      reproducible. With the default `None` the call is identical to the
      unseeded one and the result may differ between runs.

  Returns:
    2-D array with one centroid per row. Per the scipy documentation the
    number of rows can be smaller than `k`, because centroids that end up
    with no observations are removed.

  NOTE(review): scipy recommends `whiten`-ing observations before `kmeans`;
  that is not done here, and adding it would require scaling test vectors
  the same way.
  """
  VQ = np.array(CB)
  # Only forward `seed` when requested, so the default path stays exactly
  # as before (and still works on scipy versions without the keyword).
  if seed is None:
    Kmeans, _ = kmeans(VQ, k_or_guess=k)
  else:
    Kmeans, _ = kmeans(VQ, k_or_guess=k, seed=seed)

  return Kmeans

def min_Distortion(test_vec, Kmeans):
  """Return the Euclidean distance from `test_vec` to its nearest centroid.

  Args:
    test_vec: 1-D feature vector.
    Kmeans: 2-D array of centroids, one per row, with rows as long as
      `test_vec`.

  Returns:
    The smallest Euclidean distance between `test_vec` and any row of
    `Kmeans`.
  """
  # Broadcasting subtracts test_vec from every centroid row at once.
  offsets = Kmeans - test_vec
  distances = np.sqrt((offsets*offsets).sum(axis=1))

  return distances.min()

Performing Vector Quantization to obtain 64 K-means representative vectors for every codebook

In [None]:
# One set of 64 requested centroids per word, index-aligned with CB.
Kmeans = [VQ_CB(CB[i], 64) for i in range(10)]

Storing the Kmeans list of vectors. \\
Please note that this algorithm also takes quite some time. \\
Hence, we generate them once and simply load them on subsequent runs.

In [None]:
# Persist the per-word centroids so the slow K-means step can be skipped later.
# The file holds binary pickle data despite the ".txt" extension.
with open("kmeans.txt", "wb") as centroid_file:
  pickle.dump(Kmeans, centroid_file)

Loading the Kmeans list of vectors

In [6]:
# The K-means cells above are slow and need not be re-run: load the precomputed
# centroids from the "kmeans.txt" file attached with this code instead.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.

with open("kmeans.txt", "rb") as centroid_file:
  Kmeans = pickle.load(centroid_file)

TASK A \\
Performing Prediction on Clean Test utterances

In [8]:
# Running tallies over every utterance evaluated so far (all classes).
correct = 0
count = 0
# Row = true word, column = predicted word; each row is turned into fractions below.
confusion_matrix = np.zeros((10,10))
min_index = 0

for i in range(10):
  paths = glob.glob("/content/Dataset/Commands Dataset/test_clean/"+classes[i]+"/*.wav")
  for audio_path in paths:

    x, Fs = librosa.load(audio_path, sr=None)
    window_dur = 0.03
    frame_length = int(Fs*window_dur)
    hop_length = int(Fs*window_dur/2)
    thresh = 0.01

    audio = endPointing(x, Fs, thresh, frame_length, hop_length)
    dur = audio.shape[0]/Fs
    # Utterances shorter than 0.3 s after trimming are skipped and not counted.
    if(dur < 0.3):
      continue

    count = count+1
    feature_vector = featureExtraction(audio, Fs, frame_length, hop_length)

    # Accumulated nearest-centroid distance of all frames against each word's centroids.
    total_dist = [
      sum(min_Distortion(feature_vector[:,j], Kmeans[k]) for j in range(feature_vector.shape[1]))
      for k in range(10)
    ]
    # argmin keeps the first of any tied minima, like a strict "<" scan, and
    # always derives the index and the label from the same utterance.
    min_index = int(np.argmin(total_dist))
    Prediction = classes[min_index]

    confusion_matrix[i,min_index] = confusion_matrix[i,min_index]+1
    if(Prediction == classes[i]):
      correct = correct+1

  # Normalise the row to fractions; leave it as zeros when the class had no
  # evaluated utterances instead of dividing by zero.
  row_total = np.sum(confusion_matrix[i])
  if(row_total > 0):
    confusion_matrix[i] = confusion_matrix[i]/row_total
  # Report NaN rather than raising ZeroDivisionError when nothing was evaluated yet.
  accuracy = correct/count if count > 0 else float("nan")
  print(classes[i]+" testing complete, accuracy so far:", accuracy)

print(confusion_matrix)

down testing complete, accuracy so far: 0.7107438016528925
go testing complete, accuracy so far: 0.6621923937360179
left testing complete, accuracy so far: 0.6761904761904762
no testing complete, accuracy so far: 0.6771929824561403
off testing complete, accuracy so far: 0.6790123456790124
on testing complete, accuracy so far: 0.6942355889724311
right testing complete, accuracy so far: 0.7132667617689016
stop testing complete, accuracy so far: 0.7284183994959043
up testing complete, accuracy so far: 0.7265717674970344
yes testing complete, accuracy so far: 0.7409766454352441
[[0.7107438  0.04545455 0.02479339 0.10743802 0.01652893 0.04545455
  0.00826446 0.00826446 0.01652893 0.01652893]
 [0.05365854 0.60487805 0.02439024 0.13658537 0.02439024 0.01463415
  0.01463415 0.0097561  0.10731707 0.0097561 ]
 [0.04918033 0.01639344 0.71038251 0.04371585 0.01092896 0.01639344
  0.0273224  0.01092896 0.04371585 0.07103825]
 [0.12       0.11555556 0.02666667 0.68       0.00444444 0.01333333
  0.00

TASK B \\
Performing Prediction on Noisy Test utterances

In [15]:
# Running tallies over every utterance evaluated so far (all classes).
correct = 0
count = 0
# Row = true word, column = predicted word; each row is turned into fractions below.
confusion_matrix = np.zeros((10,10))
min_index = 0

for i in range(10):
  paths = glob.glob("/content/Dataset/Commands Dataset/test_noisy/"+classes[i]+"/*.wav")
  for audio_path in paths:

    x, Fs = librosa.load(audio_path, sr=None)
    window_dur = 0.03
    frame_length = int(Fs*window_dur)
    hop_length = int(Fs*window_dur/2)
    # The noisy set uses a higher end-pointing threshold than the clean set (0.01).
    thresh = 0.1

    audio = endPointing(x, Fs, thresh, frame_length, hop_length)
    dur = audio.shape[0]/Fs
    # Utterances shorter than 0.3 s after trimming are skipped and not counted.
    if(dur < 0.3):
      continue

    count = count+1
    feature_vector = featureExtraction(audio, Fs, frame_length, hop_length)

    # Accumulated nearest-centroid distance of all frames against each word's centroids.
    total_dist = [
      sum(min_Distortion(feature_vector[:,j], Kmeans[k]) for j in range(feature_vector.shape[1]))
      for k in range(10)
    ]
    # argmin keeps the first of any tied minima, like a strict "<" scan, and
    # always derives the index and the label from the same utterance.
    min_index = int(np.argmin(total_dist))
    Prediction = classes[min_index]

    confusion_matrix[i,min_index] = confusion_matrix[i,min_index]+1
    if(Prediction == classes[i]):
      correct = correct+1

  # Normalise the row to fractions; leave it as zeros when the class had no
  # evaluated utterances instead of dividing by zero.
  row_total = np.sum(confusion_matrix[i])
  if(row_total > 0):
    confusion_matrix[i] = confusion_matrix[i]/row_total
  # Report NaN rather than raising ZeroDivisionError when nothing was evaluated yet.
  accuracy = correct/count if count > 0 else float("nan")
  print(classes[i]+" testing complete, accuracy so far:", accuracy)
print(confusion_matrix)

down testing complete, accuracy so far: 0.34415584415584416
go testing complete, accuracy so far: 0.3652173913043478
left testing complete, accuracy so far: 0.39862542955326463
no testing complete, accuracy so far: 0.4056603773584906
off testing complete, accuracy so far: 0.42391304347826086
on testing complete, accuracy so far: 0.40931780366056575
right testing complete, accuracy so far: 0.4149560117302053
stop testing complete, accuracy so far: 0.4243641231593039
up testing complete, accuracy so far: 0.42213642213642216
yes testing complete, accuracy so far: 0.441073512252042
[[0.34415584 0.03246753 0.15584416 0.03896104 0.2987013  0.03896104
  0.00649351 0.         0.01948052 0.06493506]
 [0.01315789 0.40789474 0.11842105 0.03947368 0.25       0.02631579
  0.01315789 0.01315789 0.09210526 0.02631579]
 [0.04918033 0.03278689 0.52459016 0.03278689 0.21311475 0.
  0.01639344 0.01639344 0.04918033 0.06557377]
 [0.05263158 0.10526316 0.09022556 0.42105263 0.2406015  0.0075188
  0.0075188