## Isolated Word Recognition using HMM-GMMs

In [1]:
!pip install hmmlearn

Defaulting to user installation because normal site-packages is not writeable
Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp313-cp313-win_amd64.whl.metadata (3.1 kB)
Downloading hmmlearn-0.3.3-cp313-cp313-win_amd64.whl (127 kB)
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [2]:
from hmmlearn import hmm

In [14]:
def train_GMMHMM(dataset):
    GMMHMM_Models = {}
    states_num = 5
    GMM_mix_num = 6
    tmp_p = 1.0/(states_num-2)
    transmatPrior = np.array([[tmp_p, tmp_p, tmp_p, 0 ,0], \
                              [0, tmp_p, tmp_p, tmp_p , 0], \
                              [0, 0, tmp_p, tmp_p,tmp_p], \
                              [0, 0, 0, 0.5, 0.5], \
                              [0, 0, 0, 0, 1]],dtype=float)
    startprobPrior = np.array([0.5, 0.5, 0, 0, 0],dtype=float)
    for label in dataset.keys():
        model = hmm.GMMHMM(n_components=states_num, n_mix=GMM_mix_num, \
                           transmat_prior=transmatPrior, startprob_prior=startprobPrior, \
                           covariance_type='diag', n_iter=10)
        trainData = dataset[label]
        length = np.zeros([len(trainData), ], dtype=int)
        for m in range(len(trainData)):
            length[m] = trainData[m].shape[0]
        trainData = np.vstack(trainData)
        model.fit(trainData, lengths=length)
        GMMHMM_Models[label] = model
    return GMMHMM_Models

## Extract MFCCs

In [4]:
import librosa
import numpy as np

def extract_mfcc(full_audio_path):
    y, sr = librosa.load(full_audio_path, sr=None)
    hop_length = int(0.025 * sr)
    n_fft = 2048
    n_mfcc = 13
    mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft).T
    return mfcc_features

## Build the training data

In [None]:
First: download digit dataset from https://github.com/Jakobovski/free-spoken-digit-dataset/tree/master/recordings
Then:
1. liste all wave files
2. Extrat label from file name
3. split this liste to train (70%) and test (30%)
4. save train to train_audio_liste.csv and test to test_audio_liste.csv

In [None]:
!git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git
# List all wav files
import glob



Cloning into 'free-spoken-digit-dataset'...
Updating files:  30% (908/3014)
Updating files:  31% (935/3014)
Updating files:  32% (965/3014)
Updating files:  33% (995/3014)
Updating files:  34% (1025/3014)
Updating files:  35% (1055/3014)
Updating files:  36% (1086/3014)
Updating files:  37% (1116/3014)
Updating files:  38% (1146/3014)
Updating files:  39% (1176/3014)
Updating files:  40% (1206/3014)
Updating files:  41% (1236/3014)
Updating files:  42% (1266/3014)
Updating files:  43% (1297/3014)
Updating files:  44% (1327/3014)
Updating files:  45% (1357/3014)
Updating files:  46% (1387/3014)
Updating files:  47% (1417/3014)
Updating files:  48% (1447/3014)
Updating files:  49% (1477/3014)
Updating files:  50% (1507/3014)
Updating files:  51% (1538/3014)
Updating files:  52% (1568/3014)
Updating files:  53% (1598/3014)
Updating files:  54% (1628/3014)
Updating files:  55% (1658/3014)
Updating files:  55% (1682/3014)
Updating files:  56% (1688/3014)
Updating files:  57% (1718/3014)
Upd

In [9]:
fileliste = glob.glob('free-spoken-digit-dataset/recordings/*.wav')

In [17]:

import csv
import random

fileliste = glob.glob('free-spoken-digit-dataset/recordings/*.wav')
random.shuffle(fileliste)
split = int(0.7 * len(fileliste))
train_files = fileliste[:split]
test_files = fileliste[split:]

with open('train_audio_liste.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for file in train_files:
        writer.writerow([file])

with open('test_audio_liste.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for file in test_files:
        writer.writerow([file])

In [18]:
from collections import defaultdict
def build_data(fileliste):
    import os
    dataset = defaultdict(list)
    for file in fileliste:
        if file.endswith('.wav'):
            label = os.path.basename(file).split('_')[0]
            mfcc = extract_mfcc(file)
            dataset[label].append(mfcc)
    return dataset

## Train the GMM-HMM model

In [21]:
import csv
def load_filelist_from_csv(csv_path):
    with open(csv_path, 'r') as f:
        return [row[0] for row in csv.reader(f)]

trainList = './train_audio_liste.csv'
trainDataSet = build_data(load_filelist_from_csv(trainList))

print("Finish prepare the training data")
hmmModels = train_GMMHMM(trainDataSet)
print("Finish training of the GMM_HMM models for digits 0-9")



Finish prepare the training data
Finish training of the GMM_HMM models for digits 0-9


## Evaluation

In [24]:
import csv
def load_filelist_from_csv(csv_path):
    with open(csv_path, 'r') as f:
        return [row[0] for row in csv.reader(f)]

testList = './test_audio_liste.csv'
testDataSet = build_data(load_filelist_from_csv(testList))

score_cnt = 0
for label in testDataSet.keys():
    feature = testDataSet[label]
    scoreList = {}
    for model_label in hmmModels.keys():
        model = hmmModels[model_label]
        score = model.score(feature[0])
        scoreList[model_label] = score
    predict = max(scoreList, key=scoreList.get)
    print("Test on true label ", label, ": predict result label is ", predict)
    if predict == label:
        score_cnt+=1
print("Final recognition rate is %.2f"%(100.0*score_cnt/len(testDataSet.keys())), "%")



Test on true label  0 : predict result label is  0
Test on true label  8 : predict result label is  8
Test on true label  5 : predict result label is  5
Test on true label  4 : predict result label is  4
Test on true label  3 : predict result label is  3
Test on true label  9 : predict result label is  9
Test on true label  6 : predict result label is  6
Test on true label  1 : predict result label is  1
Test on true label  2 : predict result label is  2
Test on true label  7 : predict result label is  7
Final recognition rate is 100.00 %
