# Extracting Features from files

In [1]:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import pandas as pd
import librosa
import soundfile as sf
import os

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

### Metadata File

First we create a **metadata file** which will contain the file paths of each utterance and the corresponding (true) word. This will be handy for further preprocessing

In [136]:
path = 'Commands Dataset/train'
metadata = pd.DataFrame(columns = ['Filepath', 'Word'])

for word_folder in os.listdir(path):
    
    if word_folder[-4:] != '.zip': 
        for filename in os.listdir(path + '/' + word_folder):
            fullpath = path + '/' + word_folder + '/' + filename            
            df2 = pd.DataFrame({'Filepath':fullpath, 'Word':word_folder}, index = [0])
            metadata = pd.concat([metadata, df2], ignore_index=True)
        print('{} added to dataframe "metadata"'.format(word_folder))
        
metadata.to_csv('metadata.csv')
print('Written to file')

yes added to dataframe "metadata"
stop added to dataframe "metadata"
down added to dataframe "metadata"
right added to dataframe "metadata"
off added to dataframe "metadata"
on added to dataframe "metadata"
up added to dataframe "metadata"
left added to dataframe "metadata"
no added to dataframe "metadata"
go added to dataframe "metadata"
Written to file


In [2]:
metadata = pd.read_csv('metadata.csv')    

In [21]:
metadata

Unnamed: 0.1,Unnamed: 0,Filepath,Word
0,0,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes
1,1,Commands Dataset/train/yes/834f03fe_nohash_4.wav,yes
2,2,Commands Dataset/train/yes/9ff2d2f4_nohash_3.wav,yes
3,3,Commands Dataset/train/yes/e8d562ca_nohash_0.wav,yes
4,4,Commands Dataset/train/yes/fe5c4a7a_nohash_0.wav,yes
...,...,...,...
23677,23677,Commands Dataset/train/go/617de221_nohash_4.wav,go
23678,23678,Commands Dataset/train/go/1bb574f9_nohash_0.wav,go
23679,23679,Commands Dataset/train/go/e96a5020_nohash_1.wav,go
23680,23680,Commands Dataset/train/go/ad1429cf_nohash_2.wav,go


## MFCC (and delta, double delta) Feature Extraction

We use the library **librosa** for extracting MFCC and delta and double delta features

A code for extracting MFCCs written from scratch has been presented at the end of this notebook, but it takes severly long time to run on a huge dataset that is why librosa has been used

Before feature extraction, **End pointing** and **Pre-emphasis** is carried out

Rule for End pointing (heuristic): Cut those parts of the file at the beginning and end which have audio values less than **10% of the peak value** in that audio file



For pre-emphasis, we add a filter $$P(z) = 1 - 0.95z^{-1}$$ in the signal path. To find the result:

$$Y(z) = P(z)X(z)$$

$$Y(z) =  (1 - 0.95z^{-1}) X(z) $$

$$ y[n] = x[n] - 0.95 x[n-1] $$

Using this recursion equation, we can find the output of the pre-emphasis filter

In [4]:
# 16000 samples - 1 s
# 160 samples - 0.01 s --> hop length
df_file = []
df = pd.DataFrame(columns = ['Filepath', 'Word', 'FrameIndex'] + [f'mfcc{x}' for x in range(13)] + [f'del{x}' for x in range(13)] + [f'ddel{x}' for x in range(13)])

prev_word = metadata['Word'][0]

for meta_idx,filepath in enumerate(metadata['Filepath']):
    audio, samprate = sf.read(filepath)
    
    #End pointing 
    thresh = 0.1
    start_idx = 0
    for idx, value in enumerate(audio):
        if np.mean(abs(audio[idx:idx+1000])) > thresh * audio.max():
            start_idx = idx
            break
    end_idx = 0    
    for idx,value in enumerate(np.flip(audio)):
        if np.mean(abs(np.flip(audio)[idx:idx+1000])) > thresh * audio.max():
            end_idx = idx
            break
    audio = audio[start_idx:-(end_idx+1)]
    
    #Pre-emphasis
    shifted = np.array([0] + list(audio[:-1])) # x[n-1] --> x shifted to the right by 1
    audio = audio - 0.95 * shifted
    
    mfcc = librosa.feature.mfcc(audio, sr = samprate, n_mfcc=13, win_length = 320, hop_length = 160) # 160 samples - 0.01 s
    mfcc_delta = librosa.feature.delta(mfcc, mode = 'constant')
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2, mode = 'constant')
    feat_mat = np.concatenate([mfcc, mfcc_delta, mfcc_delta2]).T
    word = metadata['Word'][meta_idx]
    df_file = pd.DataFrame(data = feat_mat, columns = df.columns.values[3:])
    df_file.insert(loc = 0, column = 'FrameIndex', value = range(feat_mat.shape[0]))
    df_file.insert(loc = 0, column = 'Word', value = word)
    df_file.insert(loc = 0, column = 'Filepath', value = filepath)
    
    df = pd.concat([df, df_file], ignore_index=True)
    
    if meta_idx % 100 == 0:
        print(meta_idx, "files completed")
    if word != prev_word:
        print('Word {} completed'.format(prev_word))
    prev_word = word

0 files completed
100 files completed
200 files completed
300 files completed
400 files completed
500 files completed
600 files completed
700 files completed
800 files completed
900 files completed
1000 files completed
1100 files completed
1200 files completed
1300 files completed
1400 files completed
1500 files completed
1600 files completed
1700 files completed
1800 files completed
1900 files completed
2000 files completed
2100 files completed
2200 files completed
2300 files completed
Word yes completed
2400 files completed
2500 files completed
2600 files completed
2700 files completed
2800 files completed
2900 files completed
3000 files completed
3100 files completed
3200 files completed
3300 files completed
3400 files completed
3500 files completed
3600 files completed
3700 files completed
3800 files completed
3900 files completed
4000 files completed
4100 files completed
4200 files completed
4300 files completed
4400 files completed
4500 files completed
4600 files completed
4700 f

In [5]:
df.to_csv('mfcc_librosa_NEW.csv')

In [6]:
df = pd.read_csv('mfcc_librosa_NEW.csv')

In [5]:
df

Unnamed: 0.1,Unnamed: 0,Filepath,Word,FrameIndex,mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,del0,del1,del2,del3,del4,del5,del6,del7,del8,del9,del10,del11,del12,ddel0,ddel1,ddel2,ddel3,ddel4,ddel5,ddel6,ddel7,ddel8,ddel9,ddel10,ddel11,ddel12
0,0,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes,0,-485.843879,113.044892,11.787018,33.410287,11.498917,18.871940,2.235870,10.832730,1.002495,11.016922,2.036794,9.941528,0.983946,-85.101420,18.436482,3.450133,5.590534,1.530529,3.276542,1.195692,2.490798,0.327598,1.298070,-0.705490,0.908235,-0.387439,9.637714,-1.271370,1.281414,-0.850906,-0.044608,-0.269343,-0.528487,-0.442167,-0.532116,-0.899062,-0.488181,-0.970651,-1.018594
1,1,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes,1,-492.527626,91.350731,1.708101,40.743850,3.082555,18.367780,13.108957,14.311871,8.902652,12.922305,-9.561520,13.038224,5.216174,-61.225426,15.175909,0.718749,7.320242,-2.328244,0.791456,-0.612011,1.582510,-0.750182,0.287546,-0.450736,0.523495,-0.072508,43.104476,-6.936705,-2.227702,0.062698,-3.782151,-3.004581,-2.277545,-1.622191,-1.427303,-1.762937,0.012557,-1.161304,-0.375027
2,2,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes,2,-516.550365,108.032055,15.110626,32.094590,12.180986,18.488709,11.647327,19.084322,4.066192,13.083422,6.036615,9.542233,5.251501,-33.886801,10.508884,-2.859146,5.150939,-4.277639,-0.747125,-1.355497,1.839631,-0.719539,-0.914387,-1.243341,-0.205746,-0.119104,55.228491,-9.482325,-4.849242,-2.430135,-3.789145,-3.230507,-2.030946,-0.933250,-0.739964,-2.074923,-0.624796,-1.273551,-0.189432
3,3,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes,3,-520.969391,107.191059,17.705265,27.785978,6.534982,19.683354,9.341283,19.315854,4.304197,10.160522,-1.059363,7.730177,3.390404,-5.893576,1.020724,-1.299527,0.457518,-3.113460,-2.066273,-1.178592,1.084110,-0.126742,-1.432048,-1.018513,-0.452102,-0.691228,40.411597,-10.908792,-0.876211,-4.935736,-0.091788,-2.110217,-0.498278,-0.859688,0.463229,-0.967326,-0.043938,-0.375684,-0.417545
4,4,Commands Dataset/train/yes/2a89ad5c_nohash_1.wav,yes,4,-504.387168,119.300228,30.490703,36.785275,11.195562,20.549320,1.828517,9.754958,-2.572940,2.078374,-10.415752,-0.204784,-12.284189,27.072328,-10.838518,-0.138153,-4.261995,-0.647033,-2.850556,-0.674934,-0.019100,0.182095,-1.891795,-0.703291,-0.426742,-0.869693,0.723483,-6.331020,2.107685,-3.887969,4.086706,0.530263,1.329929,-0.210315,1.164722,0.644644,0.368807,1.121553,0.022069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255343,255343,Commands Dataset/train/go/f17be97f_nohash_4.wav,go,6,-534.720612,72.226706,62.960667,33.441599,17.464484,19.044386,12.332580,6.291500,-2.395396,0.500265,-1.783725,3.345182,-2.571872,-30.273819,-6.492131,5.280481,-0.417333,12.596757,0.847732,-2.651373,3.526669,2.574293,2.251884,-3.865489,-0.452806,-0.951256,20.369551,7.056802,-6.087088,-0.147449,-5.521948,-0.293301,1.450014,-1.414671,-0.009881,-1.128591,2.596647,-1.027379,0.275985
255344,255344,Commands Dataset/train/go/f17be97f_nohash_4.wav,go,7,-535.309297,85.203501,65.656956,32.704405,28.532332,13.290948,6.131910,6.405291,-6.288157,-2.969598,-7.532772,5.331267,-9.730384,22.566022,-9.591556,-2.486826,-2.269059,7.073195,0.052648,-2.501552,1.182782,1.825463,0.894212,-1.857490,-1.055664,-0.446218,43.507369,-0.511529,-6.145764,-2.799054,-7.319421,-1.987424,0.762062,-1.104759,-0.307521,-0.227383,1.776611,-0.247778,0.271168
255345,255345,Commands Dataset/train/go/f17be97f_nohash_4.wav,go,8,-546.626998,111.530882,64.446245,38.777113,28.013645,18.826622,10.683511,2.318098,-3.981347,-4.429572,-3.494754,-4.081353,-2.897784,59.888219,-9.782521,-6.569493,-4.365457,0.397548,-1.604081,-1.895780,0.027564,1.423690,0.515383,-0.758743,-1.736101,-1.104439,43.612391,-7.052726,-5.621715,-2.918575,-4.340210,-1.562414,-0.154875,-1.095492,-0.800762,-0.091987,1.300785,1.074031,1.644471
255346,255346,Commands Dataset/train/go/f17be97f_nohash_4.wav,go,9,-547.002303,104.090749,55.776153,38.166321,27.979548,10.585429,8.757092,15.903183,5.643147,-2.469500,-1.108073,1.445326,0.272911,80.437534,-11.736299,-9.472558,-6.168070,-2.268860,-2.757814,-1.864799,-0.681495,0.307852,0.088031,0.288253,-0.737618,0.283145,32.150431,-8.108907,-3.000636,-1.283351,-3.064660,-0.529123,-0.196676,-0.919592,-0.115373,0.271000,0.341750,0.480773,0.454264


# Test data

### Metadata File for test data

In [7]:
path = 'Commands Dataset/test_clean'
metadata_test_clean = pd.DataFrame(columns = ['Filepath', 'Word'])

for word_folder in os.listdir(path):
    
    if word_folder[-4:] != '.zip': 
        for filename in os.listdir(path + '/' + word_folder):
            fullpath = path + '/' + word_folder + '/' + filename            
            df2 = pd.DataFrame({'Filepath':fullpath, 'Word':word_folder}, index = [0])
            metadata_test_clean = pd.concat([metadata_test_clean, df2], ignore_index=True)
        print('{} added to dataframe "metadata"'.format(word_folder))
        
metadata_test_clean.to_csv('metadata_test_clean.csv')
print('Written to file')

yes added to dataframe "metadata"
stop added to dataframe "metadata"
down added to dataframe "metadata"
right added to dataframe "metadata"
off added to dataframe "metadata"
on added to dataframe "metadata"
up added to dataframe "metadata"
left added to dataframe "metadata"
no added to dataframe "metadata"
go added to dataframe "metadata"
Written to file


In [7]:
metadata_test_clean = pd.read_csv('metadata_test_clean.csv')

### Endpointing, Preemphasis and Feature Extraction using same code as above

In [8]:
# 16000 samples - 1 s
# 160samples - 0.01 s --> hop length
df_file = []
df = pd.DataFrame(columns = ['Filepath', 'Word', 'FrameIndex'] + [f'mfcc{x}' for x in range(13)] + [f'del{x}' for x in range(13)] + [f'ddel{x}' for x in range(13)])

prev_word = metadata_test_clean['Word'][0]

for meta_idx,filepath in enumerate(metadata_test_clean['Filepath']):
    audio, samprate = sf.read(filepath)
    
    #End pointing 
    thresh = 0.1
    start_idx = 0
    for idx, value in enumerate(audio):
        if np.mean(abs(audio[idx:idx+1000])) > thresh * audio.max():
            start_idx = idx
            break
    end_idx = 0    
    for idx,value in enumerate(np.flip(audio)):
        if np.mean(abs(np.flip(audio)[idx:idx+1000])) > thresh * audio.max():
            end_idx = idx
            break
    audio = audio[start_idx:-(end_idx+1)]
    
    #Pre-emphasis
    shifted = np.array([0] + list(audio[:-1])) # x[n-1] --> x shifted to the right by 1
    audio = audio - 0.95 * shifted
    
    mfcc = librosa.feature.mfcc(audio, sr = samprate, n_mfcc=13, win_length = 320, hop_length = 160)
    mfcc_delta = librosa.feature.delta(mfcc, mode = 'constant')
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2, mode = 'constant')
    feat_mat = np.concatenate([mfcc, mfcc_delta, mfcc_delta2]).T
    word = metadata_test_clean['Word'][meta_idx]
    df_file = pd.DataFrame(data = feat_mat, columns = df.columns.values[3:])
    df_file.insert(loc = 0, column = 'FrameIndex', value = range(feat_mat.shape[0]))
    df_file.insert(loc = 0, column = 'Word', value = word)
    df_file.insert(loc = 0, column = 'Filepath', value = filepath)
    
    df = pd.concat([df, df_file], ignore_index=True)
    
    if meta_idx%100 == 0:
        print(meta_idx, "files completed")
    if word != prev_word:
        print('Word {} completed'.format(prev_word))
    prev_word = word

0 files completed
100 files completed
200 files completed
Word yes completed
300 files completed
400 files completed
500 files completed
Word stop completed
600 files completed
700 files completed
Word down completed
800 files completed
900 files completed
1000 files completed
Word right completed
1100 files completed
1200 files completed
Word off completed
1300 files completed
1400 files completed
1500 files completed
Word on completed
1600 files completed
1700 files completed
Word up completed
1800 files completed
1900 files completed
2000 files completed
Word left completed
2100 files completed
2200 files completed
2300 files completed
Word no completed
2400 files completed
2500 files completed


In [9]:
df

Unnamed: 0,Filepath,Word,FrameIndex,mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,del0,del1,del2,del3,del4,del5,del6,del7,del8,del9,del10,del11,del12,ddel0,ddel1,ddel2,ddel3,ddel4,ddel5,ddel6,ddel7,ddel8,ddel9,ddel10,ddel11,ddel12
0,Commands Dataset/test_clean/yes/5f814c23_nohas...,yes,0,-443.777716,31.051747,16.077527,8.786824,0.862336,-3.44806,-10.363891,-15.532651,-12.551787,-17.090705,-18.53773,-10.935442,-9.903494,-88.73164,11.802724,11.801657,17.792042,-5.949925,-1.01862,-1.802282,-4.199455,-1.146943,-1.015871,-8.473689,3.534827,-2.564871,10.947491,0.757855,1.769081,4.581795,-1.198399,-0.475369,0.795536,-0.314076,0.342573,1.621765,-1.482048,0.843089,0.261949
1,Commands Dataset/test_clean/yes/5f814c23_nohas...,yes,1,-575.937638,62.349036,63.536124,84.56229,-36.404694,1.325218,-13.284411,-27.308765,-6.880876,-23.036686,-40.51793,28.345308,-17.367349,-77.779591,12.098664,11.910026,19.401759,-7.736233,-1.266198,-0.622118,-5.593587,0.428866,0.005007,-9.343442,4.579601,-3.126094,33.405831,-1.533318,-1.102151,0.916734,-1.182935,-0.363044,1.974223,-0.743817,1.827077,2.147802,-0.171884,0.700026,0.151068
2,Commands Dataset/test_clean/yes/5f814c23_nohas...,yes,2,-555.917476,68.178218,55.59877,76.804181,-31.026257,-2.752088,-18.168969,-16.997439,0.217549,-4.826383,-43.136304,19.642818,-19.29278,-59.52862,12.208518,9.853899,19.231528,-7.492386,-0.907464,0.040226,-6.672569,1.326396,0.792726,-8.62408,4.992125,-2.639111,43.361449,-2.468998,-4.433777,-2.91428,0.740852,0.298924,1.821085,-0.862023,1.768243,1.699437,1.852766,-0.077318,0.848385
3,Commands Dataset/test_clean/yes/5f814c23_nohas...,yes,3,-528.341951,70.691605,76.413584,104.353208,-29.018903,-7.892744,-14.191059,-19.904362,-16.328153,-12.503318,-41.575682,14.385709,-11.352772,-32.856554,11.959191,7.393082,15.019846,-6.193297,-0.812308,0.105064,-5.31088,1.849521,1.03809,-7.783951,4.052197,-1.025431,35.865801,-1.683146,-5.244172,-7.481463,2.431579,0.395516,0.513687,0.855019,0.825927,0.109824,2.182743,-1.667636,1.803287
4,Commands Dataset/test_clean/yes/5f814c23_nohas...,yes,4,-512.774985,74.345791,76.031247,129.073055,-42.870394,-8.315009,-3.985347,-32.737639,-3.34658,2.311786,-64.225938,25.325391,-15.970256,3.598251,9.955913,2.149355,9.564651,-3.860564,-2.532334,0.072916,-4.17918,1.720159,1.892045,-5.770071,2.836163,0.346837,5.607657,0.252948,-5.189662,-7.702904,3.139365,-1.37557,-1.245102,0.563966,-1.043427,-1.336795,1.409428,-2.293188,1.1631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82984,Commands Dataset/test_clean/go/c518d1b1_nohash...,go,21,-424.767911,18.512303,-14.513837,1.773912,-93.513477,-11.852769,-6.473652,10.884105,-34.806247,-0.103831,-12.360681,19.01277,7.64333,-17.437782,-4.690962,10.079467,1.715489,4.238827,-0.932371,-1.202734,3.308102,2.702242,0.796004,-4.61997,-4.137132,1.282826,-5.663425,0.329856,0.796069,0.445668,3.318235,0.064504,0.33902,-0.665565,1.409863,0.847701,1.070288,-0.46959,-0.612646
82985,Commands Dataset/test_clean/go/c518d1b1_nohash...,go,22,-441.742143,15.311337,-14.510079,-1.53629,-92.330583,-8.620368,-12.907144,5.849016,-25.256751,-9.945091,-6.865628,11.524886,4.949217,13.714531,-5.518114,9.053207,1.118406,9.658386,-0.30259,-0.150622,1.864288,3.756169,1.23306,-2.689952,-3.76119,1.116649,29.35887,0.410985,-2.109156,-0.419157,5.410221,1.198316,1.26412,-1.740987,1.169982,0.113478,2.461952,0.340224,-0.804124
82986,Commands Dataset/test_clean/go/c518d1b1_nohash...,go,23,-430.349296,25.420758,-3.294094,8.115203,-84.50789,-14.144213,-12.144706,7.33048,-10.847056,-6.01193,-12.837047,1.918631,-2.054485,39.570444,-5.534363,6.733697,0.540309,12.963422,0.517734,0.735231,0.487679,4.251066,1.307897,-0.772276,-2.794872,1.009252,40.958586,0.7575,-3.27701,-0.893702,4.798624,1.438652,1.527342,-1.975325,0.452197,-0.455116,2.658608,0.467192,-1.098762
82987,Commands Dataset/test_clean/go/c518d1b1_nohash...,go,24,-478.34067,14.781527,11.941521,8.871269,-69.061151,-12.481227,-21.251066,22.483562,-9.710852,14.394413,-24.058023,1.698884,-0.362536,59.870553,-4.644424,4.383725,0.020507,14.537379,1.252882,1.548564,-0.757298,4.320942,1.066008,1.063447,-2.254935,0.280771,33.87568,0.614053,-3.590514,-1.035404,2.444227,1.096624,1.105691,-1.461429,-0.524432,-0.713837,1.7085,0.93577,-0.529179


In [10]:
df.to_csv('test_endpt_preemph_NEW.csv')

# Test Data with Noise

### Metadata for noisy test data

In [14]:
path = 'Commands Dataset/test_noisy'
metadata_test_noisy = pd.DataFrame(columns = ['Filepath', 'Word'])

for word_folder in os.listdir(path):
    
    if word_folder[-4:] != '.zip': 
        for filename in os.listdir(path + '/' + word_folder):
            fullpath = path + '/' + word_folder + '/' + filename            
            df2 = pd.DataFrame({'Filepath':fullpath, 'Word':word_folder}, index = [0])
            metadata_test_noisy = pd.concat([metadata_test_noisy, df2], ignore_index=True)
        print('{} added to dataframe "metadata"'.format(word_folder))
        
metadata_test_noisy.to_csv('metadata_test_noisy.csv')
print('Written to file')

yes added to dataframe "metadata"
stop added to dataframe "metadata"
down added to dataframe "metadata"
right added to dataframe "metadata"
off added to dataframe "metadata"
on added to dataframe "metadata"
up added to dataframe "metadata"
left added to dataframe "metadata"
no added to dataframe "metadata"
go added to dataframe "metadata"
Written to file


In [11]:
metadata_test_noisy = pd.read_csv('metadata_test_noisy.csv')

### Endpointing, Preemphasis and Feature Extraction using same code as above

In [12]:
# 16000 samples - 1 s
# 160 samples - 0.01 s
df_file = []
df = pd.DataFrame(columns = ['Filepath', 'Word', 'FrameIndex'] + [f'mfcc{x}' for x in range(13)] + [f'del{x}' for x in range(13)] + [f'ddel{x}' for x in range(13)])

prev_word = metadata_test_noisy['Word'][0]

for meta_idx,filepath in enumerate(metadata_test_noisy['Filepath']):
    audio, samprate = sf.read(filepath)
    
    #End pointing 
    thresh = 0.1
    start_idx = 0
    for idx, value in enumerate(audio):
        if np.mean(abs(audio[idx:idx+1000])) > thresh * audio.max():
            start_idx = idx
            break
    end_idx = 0    
    for idx,value in enumerate(np.flip(audio)):
        if np.mean(abs(np.flip(audio)[idx:idx+1000])) > thresh * audio.max():
            end_idx = idx
            break
    audio = audio[start_idx:-(end_idx+1)]
    
    #Pre-emphasis
    shifted = np.array([0] + list(audio[:-1])) # x[n-1] --> x shifted to the right by 1
    audio = audio - 0.95 * shifted
    
    mfcc = librosa.feature.mfcc(audio, sr = samprate, n_mfcc=13, win_length = 320, hop_length = 160) # 160 samples - 0.01 s
    mfcc_delta = librosa.feature.delta(mfcc, mode = 'constant')
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2, mode = 'constant')
    feat_mat = np.concatenate([mfcc, mfcc_delta, mfcc_delta2]).T
    word = metadata_test_noisy['Word'][meta_idx]
    df_file = pd.DataFrame(data = feat_mat, columns = df.columns.values[3:])
    df_file.insert(loc = 0, column = 'FrameIndex', value = range(feat_mat.shape[0]))
    df_file.insert(loc = 0, column = 'Word', value = word)
    df_file.insert(loc = 0, column = 'Filepath', value = filepath)
    
    df = pd.concat([df, df_file], ignore_index=True)
    
    if meta_idx % 100 == 0:
        print(meta_idx, "files completed")
    if word != prev_word:
        print('Word {} completed'.format(prev_word))
    prev_word = word

0 files completed
100 files completed
200 files completed
Word yes completed
300 files completed
400 files completed
500 files completed
Word stop completed
600 files completed
700 files completed
Word down completed
800 files completed
900 files completed
1000 files completed
Word right completed
1100 files completed
1200 files completed
Word off completed
1300 files completed
1400 files completed
1500 files completed
Word on completed
1600 files completed
1700 files completed
Word up completed
1800 files completed
1900 files completed
2000 files completed
Word left completed
2100 files completed
2200 files completed
2300 files completed
Word no completed
2400 files completed
2500 files completed


In [13]:
df

Unnamed: 0,Filepath,Word,FrameIndex,mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,del0,del1,del2,del3,del4,del5,del6,del7,del8,del9,del10,del11,del12,ddel0,ddel1,ddel2,ddel3,ddel4,ddel5,ddel6,ddel7,ddel8,ddel9,ddel10,ddel11,ddel12
0,Commands Dataset/test_noisy/yes/e1469561_nohas...,yes,0,-573.52558,7.052643,25.481795,18.777418,-2.343921,-17.625313,-18.350194,-2.351107,-9.578281,18.721036,-6.136459,7.085698,-2.745611,-75.227818,1.912189,5.474688,7.054457,-3.893666,-2.0217,-0.926619,-3.480356,-1.620447,1.057123,-1.459025,1.508662,0.748908,25.669905,1.734427,0.823767,3.918583,-1.449939,1.045505,2.041785,0.11624,0.493102,-1.219826,0.051859,0.04649,-0.237703
1,Commands Dataset/test_noisy/yes/e1469561_nohas...,yes,1,-543.860918,-6.692278,24.215746,2.495657,-14.247852,-13.178749,-10.62954,-19.225172,-8.408536,9.522743,-7.272079,5.144034,1.633279,-47.583175,2.933102,5.508944,10.26032,-5.456284,-1.912093,0.618867,-1.926417,-1.813023,-1.294901,-1.102413,0.772894,-0.576119,55.193813,1.820527,-0.328747,4.389654,-1.649007,1.166928,2.9688,2.296698,0.479352,-3.1033,0.634479,-0.910688,-1.600668
2,Commands Dataset/test_noisy/yes/e1469561_nohas...,yes,2,-535.377507,-10.823654,17.309874,16.86873,-18.731998,-19.550791,-23.648136,-26.722921,-13.951519,8.215622,-9.774982,11.410673,19.333411,-13.60079,2.295115,3.547467,11.192041,-4.753993,-0.774993,2.644609,0.16107,-1.624269,-2.989227,-1.754156,-0.187923,-2.413943,64.35689,0.096489,-2.540553,2.099521,0.468082,1.784221,3.329915,3.769819,0.624457,-3.050151,-0.081302,-1.516196,-2.721234
3,Commands Dataset/test_noisy/yes/e1469561_nohas...,yes,3,-510.978641,18.535471,31.939805,24.709332,-11.752656,-16.658856,-13.57947,-29.937291,-14.864069,15.988628,-8.888602,6.853866,1.052772,25.714001,0.107282,1.260534,11.484294,-3.196941,0.121445,2.975569,1.898301,-0.803095,-3.935089,-1.860429,-1.506966,-3.640586,48.219121,-2.461134,-3.190637,0.019019,2.34975,1.189272,0.97997,3.494242,0.898885,-1.400323,-0.250532,-1.793074,-2.425491
4,Commands Dataset/test_noisy/yes/e1469561_nohas...,yes,4,-341.529314,21.866133,43.456588,78.226581,-36.662534,-4.76128,10.766777,-11.584622,-4.080764,-2.623124,-8.513407,10.498186,0.369017,68.997177,-2.028579,-1.194198,9.142508,-0.48975,1.414515,3.368631,2.738386,-0.825608,-4.713059,-0.967494,-2.370166,-4.380062,2.199488,-3.672111,-1.90545,-2.995185,3.759687,0.04398,-1.611477,1.181086,-0.401375,0.87718,0.177042,-0.862029,-1.246835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84861,Commands Dataset/test_noisy/go/b83c1acf_nohash...,go,33,-287.060873,-0.797874,-12.588393,-0.881589,-14.161045,-7.654076,-14.508529,1.6335,9.541194,1.581472,6.881883,-7.412948,-17.979866,-6.692889,-5.262885,-4.133893,-3.14808,0.13482,0.366346,-0.95813,1.38066,1.879247,3.16031,2.283983,0.723361,0.503119,-1.213397,0.108775,-0.251218,-0.17428,1.505236,0.619112,-0.903568,-1.308599,-0.785117,-0.08864,-0.900528,0.133149,-0.298842
84862,Commands Dataset/test_noisy/go/b83c1acf_nohash...,go,34,-287.964474,-18.488757,-32.066722,-1.336839,3.652683,-5.905268,-5.800646,-3.628876,6.787007,9.114917,25.440871,0.962352,-12.983656,14.440109,-2.623116,-2.228452,-2.732077,1.013072,1.312097,-0.108997,1.108589,1.404103,2.5797,1.041992,0.789049,1.510257,19.002376,1.99812,2.93998,1.377286,0.753473,0.441988,1.142684,-0.874102,-1.39941,-0.818696,-1.630481,-0.29944,-0.125547
84863,Commands Dataset/test_noisy/go/b83c1acf_nohash...,go,35,-295.243432,-12.498011,-22.052069,-10.415053,-12.193155,-10.90633,-25.415118,0.178187,0.396359,-3.116716,9.149263,11.910053,-15.142296,30.881362,-0.48908,0.601443,-1.726067,1.243838,1.092122,0.74454,0.594184,0.691848,2.049245,0.023549,0.794517,1.376521,24.806536,2.789603,3.183297,1.909707,0.303244,1.097874,2.194885,-0.162278,-1.437716,-1.469241,-1.825178,-0.728519,0.70124
84864,Commands Dataset/test_noisy/go/b83c1acf_nohash...,go,36,-313.91568,-31.91381,-34.217753,-17.540251,-2.092734,-17.971153,-28.127318,-4.786923,1.856187,-7.494387,0.744558,-1.539525,-0.363298,43.022407,1.474823,1.925189,-1.047663,1.563687,1.973119,1.913808,0.521509,-0.625134,0.944565,-0.931676,0.540466,1.269013,19.250108,2.245461,3.45126,2.367136,-0.539881,0.033833,1.870984,0.04429,-0.212784,-1.116711,-1.360684,-0.828696,1.204467


In [14]:
df.to_csv('test_endpt_preemph_noisy_NEW.csv')

# Extras:

These are some tasks which can be done but I could not complete them in time:

1. Adding Noise to Training Data
2. Code to Extract MFCC from scratch

The code is working but the running time is too long

## 1. Adding Noise to Training Data

We can add noise to the training data to increase the amount of training data.

We have 6 noise files and hence we can increase the data 7 times, adding a different kind of noise to each utterance. However this is an extremely time taking task and I could no complete it

In [19]:
from add_noise import add_noise

In [20]:
metadata = pd.read_csv('metadata.csv') 

In [23]:
doing_the_dishes = 'Commands Dataset/_background_noise_/doing_the_dishes.wav'
dude_miaowing = 'Commands Dataset/_background_noise_/dude_miaowing.wav'
exercise_bike = 'Commands Dataset/_background_noise_/exercise_bike.wav'
pink_noise = 'Commands Dataset/_background_noise_/pink_noise.wav'
running_tap = 'Commands Dataset/_background_noise_/running_tap.wav'
white_noise = 'Commands Dataset/_background_noise_/white_noise.wav'

noise_list = [doing_the_dishes, dude_miaowing, exercise_bike, pink_noise, running_tap, white_noise]

In [26]:
doing_the_dishes.split('/')[-1][:-4]

'doing_the_dishes'

In [37]:
word_dict = dict([(x, df['Word'].unique()[x]) for x in range(10)])

### Create a folder named train_noisy inside "Commands Dataset" and run the following cells. The required directory structure will be automatically created

In [55]:
for noise_file in noise_list:
        
    noise_name = noise_file.split('/')[-1][:-4]
    parent_dir = 'Commands Dataset/train_noisy/'
    os.mkdir(parent_dir + noise_name)    

In [56]:
for noise_file in noise_list:
    
    noise_name = noise_file.split('/')[-1][:-4]
    for word in word_dict.values():
        
        parent_dir =  f'Commands Dataset/train_noisy/{noise_name}/'
        os.mkdir(parent_dir + word)

### Adding noise and saving them as wav files

In [57]:
for noise_file in noise_list:
    
    noise_name = noise_file.split('/')[-1][:-4]
    for meta_idx,filepath in enumerate(metadata['Filepath']):
        
        word = metadata['Word'][meta_idx]
        
        audio_name = filepath.split('/')[-1][:-4]
        outputfile = f'Commands Dataset/train_noisy/{noise_name}/{word}/{audio_name}_{noise_name}.wav'
        add_noise(filepath, noise_file, outputfile)
        if meta_idx % 2500 == 0:
            print(meta_idx, "files completed in", noise_name)
    print(noise_name, "completed")    

0 files completed in doing_the_dishes
2500 files completed in doing_the_dishes
5000 files completed in doing_the_dishes
7500 files completed in doing_the_dishes
10000 files completed in doing_the_dishes
12500 files completed in doing_the_dishes
15000 files completed in doing_the_dishes
17500 files completed in doing_the_dishes
20000 files completed in doing_the_dishes
22500 files completed in doing_the_dishes
doing_the_dishes completed
0 files completed in dude_miaowing
2500 files completed in dude_miaowing
5000 files completed in dude_miaowing
7500 files completed in dude_miaowing
10000 files completed in dude_miaowing
12500 files completed in dude_miaowing
15000 files completed in dude_miaowing
17500 files completed in dude_miaowing
20000 files completed in dude_miaowing
22500 files completed in dude_miaowing
dude_miaowing completed
0 files completed in exercise_bike
2500 files completed in exercise_bike
5000 files completed in exercise_bike
7500 files completed in exercise_bike
1000

### Metadata for our new (expanded dataset) of noisy training files

In [64]:
metadata = pd.DataFrame(columns = ['Filepath', 'Word'])

for noise_file in noise_list:
    
    noise_name =  noise_file.split('/')[-1][:-4]
    path = f'Commands Dataset/train_noisy/{noise_name}'
    

    for word_folder in os.listdir(path):

        if word_folder[-4:] != '.zip': 
            for filename in os.listdir(path + '/' + word_folder):
                fullpath = path + '/' + word_folder + '/' + filename            
                df2 = pd.DataFrame({'Filepath':fullpath, 'Word':word_folder}, index = [0])
                metadata = pd.concat([metadata, df2], ignore_index=True)
#             print('{} added to dataframe "metadata"'.format(word_folder))
    print(noise_name, 'completed')
    
metadata.to_csv('metadata_noise_added.csv')
print('Written to file')

doing_the_dishes completed
dude_miaowing completed
exercise_bike completed
pink_noise completed
running_tap completed
white_noise completed
Written to file


In [65]:
metadata = pd.read_csv('metadata_noise_added.csv')

In [None]:
import time

start_time = time.time()

df_file = []
df = pd.DataFrame(columns = ['Filepath', 'Word', 'FrameIndex'] + [f'mfcc{x}' for x in range(13)] + [f'del{x}' for x in range(13)] + [f'ddel{x}' for x in range(13)])


for meta_idx,filepath in enumerate(metadata['Filepath']):
    audio, samprate = sf.read(filepath)
    
    #End pointing 
    thresh = 0.1
    start_idx = 0
    for idx, value in enumerate(audio):
        if np.mean(abs(audio[idx:idx+1000])) > thresh * audio.max():
            start_idx = idx
            break
    end_idx = 0    
    for idx,value in enumerate(np.flip(audio)):
        if np.mean(abs(np.flip(audio)[idx:idx+1000])) > thresh * audio.max():
            end_idx = idx
            break
    audio = audio[start_idx:-(end_idx+1)]
    
    #Pre-emphasis
    shifted = np.array([0] + list(audio[:-1])) # x[n-1] --> x shifted to the right by 1
    audio = audio - 0.95 * shifted
    
    
    
    mfcc = librosa.feature.mfcc(audio, sr = samprate, n_mfcc=13, hop_length = 1600) # 1600 samples - 0.1 s
    mfcc_delta = librosa.feature.delta(mfcc, mode = 'constant')
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2, mode = 'constant')
    feat_mat = np.concatenate([mfcc, mfcc_delta, mfcc_delta2]).T
    word = metadata['Word'][meta_idx]
    df_file = pd.DataFrame(data = feat_mat, columns = df.columns.values[3:])
    df_file.insert(loc = 0, column = 'FrameIndex', value = range(feat_mat.shape[0]))
    df_file.insert(loc = 0, column = 'Word', value = word)
    df_file.insert(loc = 0, column = 'Filepath', value = filepath)
    
    df = pd.concat([df, df_file], ignore_index=True)
    
    if meta_idx % 2000 == 0:
        print(meta_idx, "files completed")
        print(f'Time Elapsed: {round((time.time() - start_time)//60)} minutes {round((time.time() - start_time)%60)} seconds')
    
print(f"Total time: {round((time.time() - start_time)//60)} minutes {round((time.time() - start_time)%60)} seconds")

0 files completed
Time Elapsed: 0 minutes 0 seconds
2000 files completed
Time Elapsed: 6 minutes 3 seconds
4000 files completed
Time Elapsed: 12 minutes 29 seconds
6000 files completed
Time Elapsed: 18 minutes 35 seconds
8000 files completed
Time Elapsed: 24 minutes 44 seconds
10000 files completed
Time Elapsed: 31 minutes 15 seconds
12000 files completed
Time Elapsed: 37 minutes 58 seconds
14000 files completed
Time Elapsed: 44 minutes 12 seconds
16000 files completed
Time Elapsed: 51 minutes 19 seconds
18000 files completed
Time Elapsed: 58 minutes 27 seconds
20000 files completed
Time Elapsed: 65 minutes 9 seconds
22000 files completed
Time Elapsed: 71 minutes 47 seconds
24000 files completed
Time Elapsed: 78 minutes 56 seconds
26000 files completed
Time Elapsed: 86 minutes 31 seconds
28000 files completed
Time Elapsed: 94 minutes 45 seconds
30000 files completed
Time Elapsed: 102 minutes 57 seconds
32000 files completed
Time Elapsed: 111 minutes 49 seconds
34000 files completed
Tim

In [None]:
df.to_csv('mfcc_noisy_data.csv')

# Code to Extract MFCC from scratch

Written by referring to:

https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html


However, because of lack of vectorization, this code takes too long to run on our dataset, that is why I abandoned the approach and used librosa instead

In [87]:
import time

start_time = time.time()

df_file = []
df = pd.DataFrame(columns = ['Filepath', 'Word', 'FrameIndex'] + [f'mfcc{x}' for x in range(13)] + [f'del{x}' for x in range(13)] + [f'ddel{x}' for x in range(13)])

prev_word = metadata['Word'][0]
samprate = 16000
#Reference: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
min_freq = 0
max_freq = samprate / 2
min_freq_mel = 2595 * np.log10(1 + min_freq / 700)
max_freq_mel = 2595 * np.log10(1 + max_freq / 700)  # Converting Hz to Mel
mel_points = np.linspace(min_freq_mel, max_freq_mel, 42)  # 40 filters for 8000 Hz max frequency
hz_points = 700 * (10**(mel_points / 2595) - 1)  # Converting Mel to Hz

def compute_delta(arr):
    n = len(arr)
    shifted_right = np.zeros(n+2)
    shifted_right[:-2] = arr
    shifted_left = np.zeros(n+2)
    shifted_left[2:] = arr
    delta = ((shifted_right + shifted_left)/2)[1:-1]
    return delta

bins = np.floor(257 * hz_points / max_freq)
fbank = np.zeros((40, 257))
for m in range(1, 41):
    f_m_minus = int(bins[m - 1])   # left
    f_m = int(bins[m])             # center
    f_m_plus = int(bins[m + 1])    # right
    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])


for meta_idx,filepath in enumerate(metadata['Filepath']):
    
    word = metadata['Word'][meta_idx]
    audio, samprate = sf.read(filepath)
    
    #End pointing 
    thresh = 0.1
    start_idx = 0
    for idx, value in enumerate(audio):
        if np.mean(abs(audio[idx:idx+1000])) > thresh * audio.max():
            start_idx = idx
            break
    end_idx = 0    
    for idx,value in enumerate(np.flip(audio)):
        if np.mean(abs(np.flip(audio)[idx:idx+1000])) > thresh * audio.max():
            end_idx = idx
            break
    audio = audio[start_idx:-(end_idx+1)]
    
    #Pre-emphasis
    shifted = np.array([0] + list(audio[:-1])) # x[n-1] --> x shifted to the right by 1
    audio = audio - 0.95 * shifted
    
    hop = samprate//100
    N = 512 #window length (= DFT length)
    window = np.hamming(N) #10 ms window
    
    for frame_idx, i in enumerate(range(0,len(audio),hop)):
        if len(audio[i:i+N]) == 512:
            Xk = np.fft.rfft(window * audio[i:i+N]) #windowed DFT
            Ym_list = [] 
            for m in range(40):
                Ym_list.append(np.dot(fbank[m], abs(Xk))) #Filtering through MFB
            Ym_array = np.array(Ym_list)
            logYm = 20*np.log10(Ym_array)
            idft = np.fft.irfft(logYm) #IDFT
            mfcc = idft[0:13]
            delta = compute_delta(mfcc)
            delta2 = compute_delta(delta)
            feat_vec = np.concatenate([mfcc, delta, delta2]) #39-dim feature vector
            
            df_file = pd.DataFrame(data = feat_vec.reshape((1,39)), columns = df.columns.values[3:])
            df_file.insert(loc = 0, column = 'FrameIndex', value = frame_idx)
            df_file.insert(loc = 0, column = 'Word', value = word)
            df_file.insert(loc = 0, column = 'Filepath', value = filepath)

            df = pd.concat([df, df_file], ignore_index=True)
            
            
    if meta_idx%100 == 0:
        print(meta_idx, 'files completed')
        
    if word != prev_word:
        print('Word {} completed, total time elapsed = {} seconds'.format(prev_word, round(time.time() - start_time)))
    prev_word = word

0 files completed
100 files completed
200 files completed
300 files completed
400 files completed
500 files completed
600 files completed
700 files completed
800 files completed
900 files completed
1000 files completed
1100 files completed
1200 files completed
1300 files completed
1400 files completed
1500 files completed
1600 files completed
1700 files completed
1800 files completed
1900 files completed
2000 files completed
2100 files completed
2200 files completed
2300 files completed
Word yes completed, total time elapsed = 2626 seconds
2400 files completed
2500 files completed
2600 files completed
2700 files completed
2800 files completed
2900 files completed
3000 files completed
3100 files completed
3200 files completed
3300 files completed
3400 files completed
3500 files completed
3600 files completed
3700 files completed
3800 files completed
3900 files completed
4000 files completed
4100 files completed
4200 files completed
4300 files completed
4400 files completed
4500 files co

KeyboardInterrupt: 

In [None]:
df