In [1]:
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
def gen_mfcc_fn(fn, mfcc_window_size, mfcc_stride_size):
    
    X, sample_rate = librosa.load(fn, sr=None, mono=True)
    if sample_rate != 16000:
        return
    
    mfcc = librosa.feature.mfcc(X, sample_rate, 
                            n_fft=int(mfcc_window_size * sample_rate), 
                            hop_length=int(mfcc_stride_size * sample_rate))
    
    return mfcc.T

def generate_mfccs_for_gmm(parent_dir, 
                     sub_dirs, 
                     file_ext='*.wav', 
                     mfcc_window_size=0.02, mfcc_stride_size=0.01):
    
    mfccs = np.empty((0, 20))
    
    for label, sub_dir in enumerate(sub_dirs):
        cnt = 0
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            
            mfcc = gen_mfcc_fn(fn, mfcc_window_size, mfcc_stride_size)
            if mfcc is None:
                continue
            
            mfccs = np.vstack([mfccs, mfcc])
            cnt+=1
            if cnt >= 200:
                break
                
    return mfccs

In [24]:
parent_dir = 'dataset'
tr_sub_dirs = ["down","go","left","right","up"]

mfccs_for_gmm = generate_mfccs_for_gmm(parent_dir, tr_sub_dirs)
print(mfccs_for_gmm.shape)



(99040, 20)


In [67]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=64, verbose=10)
gmm.fit(mfccs_for_gmm)

Initialization 0
  Iteration 0	 time lapse 16.94435s	 ll change inf
  Iteration 10	 time lapse 44.86399s	 ll change 0.20281
  Iteration 20	 time lapse 41.94040s	 ll change 0.03578
  Iteration 30	 time lapse 39.78639s	 ll change 0.01550
  Iteration 40	 time lapse 38.26863s	 ll change 0.00683
  Iteration 50	 time lapse 38.23383s	 ll change 0.01130
  Iteration 60	 time lapse 38.40861s	 ll change 0.01382
  Iteration 70	 time lapse 38.66422s	 ll change 0.00916
  Iteration 80	 time lapse 39.44960s	 ll change 0.00421
  Iteration 90	 time lapse 38.62749s	 ll change 0.00607
Initialization converged: False	 time lapse 409.47151s	 ll -69.35460




GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=64, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=10,
        verbose_interval=10, warm_start=False, weights_init=None)

In [70]:
def segment_window(audio_len, segment_len, segment_stride):
    
    start = 0
    while start < audio_len:
        yield start, start + segment_len
        start += segment_stride

class_labels = {
    'down' : 0,
    'go'   : 1,
    'left' : 2,
    'right': 3,
    'up'   : 4
}

def generate_labels(sub_dir):

    return class_labels[sub_dir]

def generate_F_features(parent_dir, 
                             sub_dirs,
                             file_ext='*.wav', 
                             mfcc_window_size=0.02, 
                             mfcc_stride_size=0.01):
    
    F_features, labels = np.empty((0, 19*64)), np.array([])

    for label, sub_dir in enumerate(sub_dirs):
        
        cnt = 0
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            
            X, sample_rate = librosa.load(fn, sr=None, mono=True)
            if sample_rate != 16000:
                continue

            segment_len = int(X.size / 10)
            segment_stride = int(segment_len/2)
            
            file_F_features = np.empty((0, 64))
            for start, end in segment_window(X.size, segment_len, segment_stride):
                
                if X[start:end].size != segment_len:
                    continue
                segment_mfccs = librosa.feature.mfcc(X[start:end], sample_rate, 
                                    n_fft=int(mfcc_window_size * sample_rate), 
                                    hop_length=int(mfcc_stride_size * sample_rate))
                
                segment_F_features = np.sum(gmm.predict_proba(segment_mfccs.T), axis=0) \
                                        / (segment_mfccs.shape[1])
                    
                file_F_features = np.vstack([file_F_features, segment_F_features])
            
            labels = np.append(labels, generate_labels(sub_dir))
            F_features = np.vstack([F_features, file_F_features.reshape(-1)])
            
#             F_features.append(file_F_features)
        
    print("Finished!")
    return np.array(F_features), np.array(labels, dtype=np.int)

In [71]:
parent_dir = 'dataset'
tr_sub_dirs = ["down","go","left","right","up"]

X_all, y_all = generate_F_features(parent_dir, tr_sub_dirs)



Finished!


In [76]:
print(X_all.shape)
print(y_all.shape)

(11826, 1216)
(11826,)


In [77]:
np.savetxt("gmm_features.csv", X_all, delimiter=",")
np.savetxt("gmm_labels.csv", y_all, delimiter=",")

In [82]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, stratify=y_all, train_size=0.8, random_state=100)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9460, 1216)
(9460,)
(2366, 1216)
(2366,)


In [83]:
from sklearn.svm import SVC

clf = SVC(kernel='rbf', C=5, gamma=1)
clf.fit(X_train, y_train)

print("Training set score: {:.3f}".format(clf.score(X_train, y_train)))
print("Test set score: {:.3f}".format(clf.score(X_test, y_test)))

Training set score: 1.000
Test set score: 0.685
