In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#-- required imports
import tensorflow as tf
from tensorflow.keras import regularizers as rg
import librosa
from librosa import display
from scipy.io import wavfile
import gc
import pickle as pkl
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt

# Root of the attached TIMIT data set and its "data" sub-directory.
path = '/kaggle/input/darpa-timit-acousticphonetic-continuous-speech'
data_path = "/".join((path, "data"))

In [3]:
class Callback(tf.keras.callbacks.Callback):
    """Keras callback that prints compact loss/accuracy lines while training.

    Every hook accepts ``logs=None`` (no shared mutable default) and tolerates
    a ``logs`` dict without ``"loss"``/``"accuracy"`` entries, in which case
    ``nan`` is printed instead of raising ``KeyError``.
    """

    @staticmethod
    def _describe(logs):
        """Format loss and accuracy (as a percentage) from a Keras ``logs`` dict."""
        logs = logs or {}
        missing = float("nan")
        return 'loss: {:.2f}, accuracy:{:.2f}'.format(
                logs.get("loss", missing), logs.get("accuracy", missing)*100)

    def on_epoch_begin(self,epoch,logs=None):
        """Announce the (zero-based) epoch index that is about to start."""
        print("Epoch ",epoch)

    def on_epoch_end(self,epoch,logs=None):
        """Print the epoch summary and the raw logs, then run the garbage collector."""
        logs = logs or {}
        print(self._describe(logs))
        print(logs)
        gc.collect()

    def on_batch_end(self,batch,logs=None):
        """Print a progress line on every 100th training batch."""
        if batch % 100 == 0:
            print(batch,self._describe(logs))

    def on_test_batch_end(self, batch, logs=None):
        """Deliberately silent: evaluation batches produce no output."""
        return

In [4]:
import math

class CNN_ASR_MODULE_BUILDER():
    """Helper that turns TIMIT audio/phoneme files into frame-level features and labels.

    The attributes below are class-level defaults and constants. The two
    directory attributes are replaced per instance by ``__init__`` with paths
    derived from its ``path`` argument.
    """
    # File names of the CSV files describing the training and test splits.
    __train_desc = 'train_data.csv'
    __test_desc = 'test_data.csv'
    # Placeholder locations; __init__ overwrites both on every instance.
    __data_directory = './data'
    __main_directory = './'
    f_Path = 'path_from_data_dir' #field that contains file path in train_data.csv
    f_IsAudio = 'is_converted_audio' #boolean field that tells that the record in train_data.csv contains the description of audio file we are interested in
    # Further CSV column names. f_IsPhon is used by getListPhonemeFiles;
    # f_IsWord and f_IsSent are not read by the methods of this class.
    # NOTE(review): presumably boolean flags like f_IsAudio - confirm against the CSV.
    f_IsWord = 'is_word_file'
    f_IsPhon = 'is_phonetic_file'
    f_IsSent = 'is_sentence_file'
    # f_filename = 'filename' #field that contains filename
    f_dr = 'dialect_region' #field that contains dialect_region information
    # Analysis window length and hop; getWindow multiplies both by the sample
    # rate to obtain sizes in samples (so these are durations in seconds when
    # the sample rate is in Hz).
    _winlen = 0.025
    _winstep = 0.01
    
    def __init__(self,path=None):
        if path == None:
            raise Exception("Directory path to the TIMIT Data set must be provided")
        if not os.path.isdir(path):
            raise Exception("Directory doesn't exist")
        self.__main_directory = path
        if path[len(path)-1] == '/':
            self.__data_directory = path+"data/"
        else:
            self.__main_directory += "/"
            self.__data_directory = self.__main_directory+"data/"
      
        # TimitBet 61 phoneme mapping to 39 phonemes
        # by Lee, K.-F., & Hon, H.-W. (1989). Speaker-independent phone recognition using hidden Markov models. IEEE Transactions on Acoustics, Speech, and Signal Processing, 37(11), 1641–1648. doi:10.1109/29.46546 
        self.phon61_map39 = {
            'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
            'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
            'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
            'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
            'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
            'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
            'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
            'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#' 
        }
        
        self.phon61 = list(self.phon61_map39.keys())
        self.phon39 = list(set(self.phon61_map39.values()))

        self.label_p39 = {}
        self.p39_label = {}
        for i,p in enumerate(self.phon39):
            self.label_p39[p] = i+1
            self.p39_label[i+1] = p

        self.phon39_map61 = {}
        for p61,p39 in self.phon61_map39.items():
            if not p39 in self.phon39_map61:
                self.phon39_map61[p39] = []
            self.phon39_map61[p39].append(p61)
        #-------------------------------------------------
        #end __init__
    
    #------------------------------------------------------------------------
    def get39EquiOf61(self,p):
        return self.phon61_map39[self.removePhonStressMarker(p)]

    def removePhonStressMarker(self,phon):
        phon = phon.replace('1','')
        phon = phon.replace('2','')
        return phon
    
    def getWindow(self,sr):
        nfft = 512
        winlen = self._winlen * sr
        winstep = self._winstep * sr
        return nfft,int(winlen),int(winstep)

    def singleTrainingFrameSize(self,sr):
        return math.floor(sr/4)
        
    def readTrainingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'train_data.csv' #check if train_data.csv is in correct path
        self._Tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._Tdd = self._Tdd[self._Tdd['dialect_region'].isin(dr)]
        return self._Tdd

    def readTestingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'test_data.csv' #check if train_data.csv is in correct path
        self._tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._tdd = self._tdd[self._tdd['dialect_region'].isin(dr)]
        return self._tdd
    
    def getListAudioFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.f_IsAudio] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.f_IsAudio] == True]
        
    def getListPhonemeFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.f_IsPhon] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.f_IsPhon] == True]
               
    def readAudio(self,fpath=None,pre_emp = False):
        if(fpath == None):
            return np.zeros(1),0
        
        fpath = self.__data_directory+fpath
        if os.path.exists(fpath):
            S,sr = librosa.load(fpath,sr=None)
            if pre_emp:
                S = librosa.effects.preemphasis(S)
            return S,sr   
        else:
            return np.zeros(1),0
    #-----------------------end readAudio()
    
    def readPhon(self,fpath=None):
        if(fpath == None):
            raise Exception('phon file path not provided')
        
        fpath = self.__data_directory+fpath
        ph_ = pd.read_csv(fpath,sep=" ")#,usecols=['start','end','phoneme'])
        #ph_.columns = ['start','end','phoneme']
        return ph_
            
        pfn = j['filename'].split('.WAV')[0]+'.PHN'
        p_bar.set_description(f'Working on {j["filename"]} ,index: {c}  ')
        try:
            pfp = file_path+pfd[(pfd['filename']==pfn) & (pfd['speaker_id'] == j['speaker_id'])][f_Path].values[0]
        except:
            pfp = afp.replace(j['filename'],pfn)
            
        ph_ = pd.read_csv(pfp,sep=" ")#,usecols=['start','end','phoneme'])
        #ph_.columns = ['start','end','phoneme']
    #---------------end readPhon()
        
    def getFeatureAndLabel(self,ftype='mfsc',audio_path=None,phon_path=None,n_mels=128,delta=False,delta_delta=False):
        if audio_path == None:
            raise Exception("Path to audio (Wav) file must be provided")
        wav,sr = self.readAudio(fpath=audio_path,pre_emp=True)
        nfft,winlen,winstep = self.getWindow(sr)
        if(ftype == 'mfsc'):
            melspec = librosa.feature.melspectrogram(wav,sr=sr,hop_length=winstep,win_length=winlen,n_fft=nfft,n_mels=n_mels)
        if(ftype == 'mfcc'):
            melspec = librosa.feature.mfcc(wav,sr=sr,hop_length=winstep,win_length=winlen,n_fft=nfft,n_mfcc=n_mels)
            
        db_melspec = librosa.amplitude_to_db(melspec,ref=np.max)
        
        mD = None
        mDD = None
        if(delta):
            mD = librosa.feature.delta(db_melspec)
            if(delta_delta):
                mDD = librosa.feature.delta(mD)
        
        audio_phon_transcription = None
        if phon_path == None:
            tmp = audio_path.split('/')
            phon_path = "/".join(tmp[:(len(tmp)-1)])+"/"+ tmp[len(tmp)-1].split('.WAV')[0]+".PHN"
            
        audio_phon_transcription = self.readPhon(phon_path)            
        time = db_melspec.shape[1]
        
        feature_vectors = []
        db_melspec = db_melspec.T
        mD = mD.T
        mDD = mDD.T
        
        prev = None
        first = audio_phon_transcription.columns
        audio_phon_transcription.columns = ['start','end','phoneme']
        labels = []
        for i in range(time):
            #---collecting feature---
            feature = np.zeros(n_mels*3)
            feature[:n_mels] = db_melspec[i]
            feature[n_mels:n_mels*2] = mD[i]
            feature[n_mels*2:n_mels*3] = mDD[i]
            feature_vectors.append(feature)
            
            #---collecting phoneme label ---
            start = winstep * i
            end = start+winlen
            diff = start+400
            phoneme = list(
                        audio_phon_transcription[
                            ((audio_phon_transcription['start']<=start) & 
                            ((audio_phon_transcription['end']-start)>=int(winlen/1.5)))
                            |
                            ((audio_phon_transcription['start']<=end) & 
                                (audio_phon_transcription['end']>end))  
                        ].to_dict()['phoneme'].values()
            )
            if len(phoneme) == 0:
                if int(first[1]) > start:
                    phoneme = first[2]
                else:
                    phoneme = prev
            else:
                phoneme = phoneme[0]
            phoneme = self.get39EquiOf61(phoneme)
            prev = phoneme
            labels.append(phoneme)
             
        return feature_vectors,labels
                
    #--------------------end getMelSpectrogramFeatureAndLabel()
    def prepareLabelsForTraining(self,labels):
        print('Preparing Labels')
        label_vector = []
        p_bar = tqdm(range(len(labels)))
        c = 0
        for l in labels:
            label = [0 for i in range(39)]
            label[self.label_p39[l]-1] = 1
            label_vector.append(label)
            c+=1
            if c == 500:
                p_bar.set_description(f'Working on phoneme {l}')
                p_bar.update(c)
                c = 0
           
        p_bar.set_description(f'Working on phoneme {l}')
        p_bar.update(c) 
        return label_vector
    
    def collectFeatures(self,ft='Train',ftype='mfsc',n_mels=128,delta=False,delta_delta=False):
        tddA = self.getListAudioFiles(ft)
        tddA.index = range(tddA.shape[0])
        feature_vectors = []
        labels = []
        
        p_bar = tqdm(range(tddA.shape[0]))
        silent_count = 0
        for i in range(tddA.shape[0]):
            fv,lv = self.getFeatureAndLabel(ftype=ftype,audio_path=tddA.loc[i][self.f_Path],n_mels=n_mels,delta=delta,delta_delta=delta_delta)
            p_bar.set_description(f'Working on {tddA.loc[i][self.f_Path]} ,index: {i}  ')
            p_bar.update()
            feature_vectors += fv
            labels += lv
                   
        print(f"length of feature_vectors is {len(feature_vectors)} and length of labels is {len(labels)}")
        labels = np.asarray(np.array(self.prepareLabelsForTraining(labels),dtype=object)).astype(np.int16)
        feature_vectors = np.asarray(np.array(feature_vectors,dtype=object)).astype(np.float32)
        return feature_vectors,labels
    #--------------------end collectFeatures   
        
    def classTestA(self):
        """Visual smoke test on the first training audio file.

        Runs the feature/label extraction once (the result only has to be
        computable), then plots the waveform followed by five spectrogram-style
        figures: the dB mel spectrogram on linear and log axes, its delta,
        its delta-delta, and the MFCCs.

        Fixes over the earlier version:
        * calls ``getFeatureAndLabel`` (``getMelSpectrogramFeatureAndLabel``
          does not exist on this class);
        * picks the first row positionally, because the filtered table is not
          guaranteed to contain index label 0;
        * passes the signal to librosa by keyword;
        * every colorbar uses the image of its own figure instead of the first
          figure's image;
        * ``specshow`` is told the real hop length so the time axis is right.
        """
        gc.collect()
        tddA = self.getListAudioFiles()
        audio_path = tddA[self.f_Path].iloc[0]
        feature_vectors, labels = self.getFeatureAndLabel(audio_path=audio_path,n_mels=20,delta=True,delta_delta=True)
        #------------------------------------------
        wav,sr = self.readAudio(audio_path)
        librosa.display.waveshow(wav,sr=sr)
        nfft,winlen,winstep = self.getWindow(sr)
        print(nfft,winlen,winstep)
        melspec = librosa.feature.melspectrogram(y=wav,sr=sr,hop_length=winstep,win_length=winlen,n_fft=nfft)
        db_melspec = librosa.amplitude_to_db(
            melspec,
            ref=np.max)
        msd = librosa.feature.delta(db_melspec)
        msdd = librosa.feature.delta(msd)
        mfcc_db = librosa.amplitude_to_db(librosa.feature.mfcc(S=melspec,sr=sr),ref=np.max)

        # (data, y-axis scale, title) for each figure, in display order.
        panels = [
            (db_melspec,'linear','Linear-frequency power spectrogram'),
            (db_melspec,'log','Log-frequency power spectrogram'),
            (msd,'linear','Mel Spectrogram Delta'),
            (msdd,'linear','Mel Spectrogram Delta Delta'),
            (mfcc_db,'linear','MFCC'),
        ]
        for data,y_axis,title in panels:
            fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True)
            img = librosa.display.specshow(data,y_axis=y_axis,x_axis='time',
                                           sr=sr,hop_length=winstep,ax=ax)
            ax.set(title=title)
            ax.label_outer()
            fig.colorbar(img, ax=ax, format="%+2.f dB")
         

In [5]:
####--------------Collecting Training Features----------------------###   
# Loads cached training features/labels when both pickle files are present
# (attached input data set first, then the working directory); otherwise
# extracts them from the audio files and caches them in the working directory.
gc.collect()
cm = CNN_ASR_MODULE_BUILDER(path)
n_mels = 64
delta = True
delta_delta=True
ftype = 'mfsc'
feature_path = '/kaggle/input/timit-{}mel-spectrogramdelta-features/'.format(n_mels)#'/kaggle/input/timit-{}mfcc-and-delta-feature/'.format(n_mels)#
working_path = '/kaggle/working/'

print('Attempting to read features file',feature_path)
# A cache location only counts when BOTH files exist; checking features.pkl
# alone used to fail later when labels.pkl was missing.
cache_dir = None
for source_label,directory in (("-from input",feature_path),("-from output",working_path)):
    if os.path.exists(directory+'features.pkl') and os.path.exists(directory+'labels.pkl'):
        print(source_label)
        cache_dir = directory
        break

if cache_dir is not None:
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # feature files that come from a trusted source.
    with open(cache_dir+'features.pkl','rb') as ffp, open(cache_dir+'labels.pkl','rb') as flp:
        features = pkl.load(ffp)
        labels = pkl.load(flp)
    features = np.asarray(features).astype(np.float32)
    labels = np.asarray(labels).astype(np.int16)
    print(features.shape,labels.shape)
    print('---- success')
    #-------
else:            
    print('--- Failed')
    print('Collecting Features from Audio Files')
    features,labels = cm.collectFeatures(ftype=ftype,n_mels=n_mels,delta=delta,delta_delta=delta_delta)
    # -------------
    # Context managers guarantee the cache files are flushed and closed even
    # if one of the dumps fails.
    with open(working_path+"features.pkl",'wb') as ffp, open(working_path+"labels.pkl",'wb') as flp:
        pkl.dump(features,ffp)
        pkl.dump(labels,flp)
    print('--- Completed')
    #-------

Attempting to read features file /kaggle/input/timit-64mel-spectrogramdelta-features/
-from input
(1421707, 192) (1421707, 39)
---- success


In [8]:
####--------------Model Training----------------------###   
# Fully connected classifier: three 1024-unit ReLU layers followed by a
# 39-way softmax over the phoneme classes.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=1024, input_shape=[n_mels*3], activation=tf.nn.relu))
for _ in range(2):
    model.add(tf.keras.layers.Dense(units=1024, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(units=39, activation=tf.nn.softmax))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

# The first train_end frames are used for fitting, the rest for validation.
train_end = 1137000
train_x, train_y = features[:train_end], labels[:train_end]
val_x, val_y = features[train_end:], labels[train_end:]

gc.collect()  
history = model.fit(
    train_x, train_y,
    batch_size=512,
    epochs=25,
    verbose=1,
    validation_data=(val_x, val_y),
    validation_batch_size=128,
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 1024)              197632    
_________________________________________________________________
dense_10 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
dense_12 (Dense)             (None, 39)                39975     
Total params: 2,336,807
Trainable params: 2,336,807
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25

In [9]:
###------------collecting test features -------------------
# Same caching scheme as for the training features: attached input data set
# first, then the working directory, otherwise extract and cache.
gc.collect()
if os.path.exists(feature_path+'test_features.pkl'):
    test_cache_dir = feature_path
elif os.path.exists("/kaggle/working/test_features.pkl"):
    test_cache_dir = "/kaggle/working/"
else:
    test_cache_dir = None

if test_cache_dir is not None:
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # feature files that come from a trusted source.
    # Files are opened through context managers so the handles are closed
    # deterministically instead of being left to the garbage collector.
    with open(test_cache_dir+'test_features.pkl','rb') as test_ffp:
        test_features = pkl.load(test_ffp)
    with open(test_cache_dir+'test_labels.pkl','rb') as test_flp:
        test_labels = pkl.load(test_flp)
    # Same dtypes as the training arrays.
    test_features = np.asarray(test_features).astype(np.float32)
    test_labels = np.asarray(test_labels).astype(np.int16)
else:
    test_features,test_labels = cm.collectFeatures(ft='Test',ftype=ftype,n_mels=n_mels,delta=delta,delta_delta=delta_delta)
    with open('/kaggle/working/test_features.pkl','wb') as test_ffp:
        pkl.dump(test_features,test_ffp)
    with open('/kaggle/working/test_labels.pkl','wb') as test_flp:
        pkl.dump(test_labels,test_flp)
gc.collect()

0

In [10]:
####--------------Model Evaluating----------------------###   
# Score the trained model on the held-out test frames.
evaluation = model.evaluate(
    x=test_features,
    y=test_labels,
    batch_size=128,
)

