## Download the archive

Download the archive files from https://www.kaggle.com/rtatman/speech-accent-archives and extract them into the working directory.

Then convert all the mp3 files into wav by using this script: https://raw.githubusercontent.com/evonneng/Speech-Accent-Detector/master/convert_to_wav.sh

Most of the following code is borrowed from: https://github.com/evonneng/Speech-Accent-Detector/blob/master/Speech%20Accent%20Detector.ipynb

In [11]:
import sys
import os
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

import librosa

In [2]:
# Target sample rate the audio is resampled to; also passed to the MFCC call as `sr`.
RATE = 8000
# Number of MFCCs requested from librosa.feature.mfcc (its `n_mfcc` argument).
N_MFCC = 12

# Number of values kept from each recording's flattened MFCC matrix;
# also the number of columns in the feature dataframe.
NUM_FEATURES = 3000

In [3]:
def get_wav(filename):
    '''
    Load an audio file and resample it to RATE.

    filename: file name to load
    return: numpy array holding the audio resampled to RATE
    '''
    path = '{}'.format(filename)
    samples, native_rate = librosa.load(path)
    downsampled = librosa.core.resample(
        y=samples,
        orig_sr=native_rate,
        target_sr=RATE,
        scale=True,
    )
    return downsampled

def to_mfcc(wav):
    '''
    Compute the MFCC matrix of an already down-sampled signal.

    wav: numpy array of audio samples at RATE
    return: 2d numpy array of mfcc (N_MFCC coefficients requested)
    '''
    mfcc_options = {'sr': RATE, 'n_mfcc': N_MFCC}
    coefficients = librosa.feature.mfcc(y=wav, **mfcc_options)
    return coefficients

def get_mfcc_from_filename(filename):
    '''
    Turn one audio file into a fixed-length feature vector.

    filename: path of the audio file to load
    return: 1d numpy array of exactly NUM_FEATURES values: the flattened
            mfcc matrix of the file, truncated to NUM_FEATURES values or
            zero-padded at the end when the recording is too short
    '''
    mfcc = to_mfcc(get_wav(filename)).flatten()

    # A short recording yields fewer than NUM_FEATURES values. Zero-pad it so
    # every file produces a row that fits the NUM_FEATURES-column feature
    # table instead of a ragged row.
    features = np.zeros(NUM_FEATURES, dtype=mfcc.dtype)
    kept = min(mfcc.size, NUM_FEATURES)
    features[:kept] = mfcc[:kept]
    return features

def add_dir_to_df(dir_name, X):
    '''
    Extract features for every file in a directory and append them to X.

    dir_name: the directory whose files are added to the dataframe
    X: features dataframe to add to; its columns define the feature layout
    return: new dataframe with the rows of X followed by one row per file
            found in dir_name
    '''
    import multiprocessing

    filenames = [
        os.path.join(dir_name, rel_path) for rel_path in os.listdir(dir_name)
    ]

    # Fan the per-file feature extraction out over one worker process per
    # CPU. map() blocks until every file is done, and the context manager
    # then shuts the workers down so they are not leaked.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        mfccs = pool.map(get_mfcc_from_filename, filenames)

    # Append to the X that was passed in (not to a fresh empty frame), so
    # rows collected from earlier directories are preserved.
    new_rows = pd.DataFrame(mfccs, columns=X.columns)
    return pd.concat([X, new_rows])


def add_category_to_labels(dir_name, label, y):
    '''
    Append one label row per file found in dir_name.

    dir_name: the directory whose files are being labelled
    label: label of new category
    y: predictions dataframe to add to
    return: new dataframe with the rows of y followed by the new label rows
    '''
    file_count = len(os.listdir(dir_name))

    # ones * label gives one float entry per file, all equal to the label.
    label_values = np.ones(file_count) * label
    new_labels = pd.DataFrame(label_values, columns=y.columns)

    return pd.concat([y, new_labels])

In [None]:
# Start from an empty feature table (one column per feature) and an empty
# single-column label table.
X = pd.DataFrame(columns=list(range(NUM_FEATURES)))
y = pd.DataFrame(columns=["y"])

# One directory per class; the position in this list is the class label
# (0 = english, 1 = mandarin).
dirs = ["./recordings_wav/english", "./recordings_wav/mandarin"]

for label, d in enumerate(dirs):
    X = add_dir_to_df(d, X)
    y = add_category_to_labels(d, label, y)
# Flatten the single-column label table into a 1-D array.
y = y.values.ravel()

In [42]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,-61.520138,-45.114658,-67.347679,-64.358963,13.608142,57.482548,32.144146,-25.767134,-47.474106,13.589452,...,-19.425692,-4.964512,0.564896,-10.322146,-8.048172,-15.611216,-13.717669,-8.758387,-18.238054,-21.685688
1,-319.164764,-313.727051,-315.753265,-337.958008,-352.937073,-319.444733,-271.978241,-282.15918,-225.115707,-154.357269,...,-16.648531,-9.363829,-1.733926,-17.236471,-23.098793,-15.690676,-11.650967,-19.760632,-32.453533,-33.308979
2,-507.730377,-480.776276,-366.106293,-289.678741,-277.664673,-322.216431,-325.142181,-290.748108,-188.61055,-186.41246,...,1.39252,1.4038,1.397377,2.297293,1.126647,3.847977,7.825229,6.717238,-17.613911,-28.461082
3,-505.187653,-497.593079,-482.440308,-477.806244,-434.63089,-330.238037,-236.029419,-215.746506,-259.099365,-281.784088,...,-0.079755,-22.837614,-13.261033,-6.944836,-8.002753,-8.259872,-11.999463,-8.388346,-21.328281,-28.987991
4,-526.473816,-526.473816,-526.473816,-526.473816,-522.832092,-290.25174,-132.622635,-104.316048,-148.268097,-94.123489,...,-11.89162,-36.099201,-25.709633,-28.622822,-34.92749,-43.11195,-42.848175,-14.123367,-27.069042,-25.933346


In [44]:
# Random forest with 50 trees, scored with 5-fold cross-validation on (X, y).
clf = RandomForestClassifier(n_estimators=50)
cross_val_score(clf, X, y, cv=5)

array([0.79012346, 0.77160494, 0.70807453, 0.70807453, 0.75625   ])

In [106]:
import sklearn.model_selection
# Split into training and validation sets using train_test_split's defaults.
# NOTE(review): no random_state is passed, so the split is presumably
# different on every run - confirm whether reproducibility matters here.
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y)
# Fit the classifier on the training portion only.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [107]:
import sklearn.metrics
# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# the predictions. The resulting accuracy is the fraction of validation
# recordings whose predicted class matches the label.
sklearn.metrics.accuracy_score(y_val, clf.predict(X_val))

0.7623762376237624

In [120]:
# Feature vector for a separate recording from ./recording_val.
ninglu = get_mfcc_from_filename('./recording_val/ninglu.wav')

In [121]:
# Wrap the vector as a one-row frame with the training columns and predict
# its class (labels follow the order of `dirs`: 0 = english, 1 = mandarin).
clf.predict(pd.DataFrame([ninglu], columns=X.columns))

array([1.])

In [110]:
# Feature vector for a separate recording from ./recording_val.
zhangpin = get_mfcc_from_filename('./recording_val/zhangpin.wav')

In [111]:
# Wrap the vector as a one-row frame with the training columns and predict
# its class (labels follow the order of `dirs`: 0 = english, 1 = mandarin).
clf.predict(pd.DataFrame([zhangpin], columns=X.columns))

array([1.])

In [112]:
# Feature vector for a separate recording from ./recording_val.
xiaobao = get_mfcc_from_filename('./recording_val/xiaobao.wav')

In [113]:
# Wrap the vector as a one-row frame with the training columns and predict
# its class (labels follow the order of `dirs`: 0 = english, 1 = mandarin).
clf.predict(pd.DataFrame([xiaobao], columns=X.columns))

array([1.])

In [114]:
# Feature vector for a separate recording from ./recording_val.
xl = get_mfcc_from_filename('./recording_val/xueling.wav')

In [115]:
# Wrap the vector as a one-row frame with the training columns and predict
# its class (labels follow the order of `dirs`: 0 = english, 1 = mandarin).
clf.predict(pd.DataFrame([xl], columns=X.columns))

array([1.])