In [140]:
import numpy as np
import IPython
import scipy.io.wavfile
import scipy.signal as sig
import scipy.spatial.distance as dis
import matplotlib.pyplot as plt
import numpy.random as rng
import os
import librosa

# Part 1

In [141]:
def get_data():
    arr_file = []
    arr_mfcc = []
    for filename in os.listdir("./data/"):
        file, frate = librosa.core.load("./data/"+filename)
        file = np.array(file)
        mfcc = np.array(librosa.feature.mfcc(file, frate, n_mfcc=60))
        arr_file.append(file)
        arr_mfcc.append(mfcc)
    return arr_file, arr_mfcc, frate

In [142]:
arr_file, arr_mfcc, frate = get_data()

In [143]:
# use the first file as the template
template = np.array_split(arr_file[1], 10)
template_mfcc = np.array_split(arr_mfcc[1], 10)
digits = []
digits_mfcc = []
for f in arr_file[1:]:
    d = np.array_split(f, 10)
    digits.append(d)
for f in arr_mfcc[1:]:
    d = np.array_split(f, 10)
    digits_mfcc.append(d)
digits = np.array(digits)

In [161]:
for dgt in template:
    IPython.display.display( IPython.display.Audio( dgt, rate=frate))

In [156]:
def dist_mtx(a, b):
    out = np.zeros((a.shape[1], b.shape[1]))
    for i, framei in enumerate(a.T):
        for j, framej in enumerate(b.T):
            out[i,j] = dis.cosine(framei,framej)
    return out

def cost_mtx(dist_mtx):
    out = dist_mtx.copy()
    out[0,-1] = dist_mtx[0,-1]
    for j in range(-out.shape[0]):
        for i in range(out.shape[1]):
            if i == 0 and j == -1:
                continue
            curr = out[i,j]
            l = out[i-1, j] + curr if i-1 >= 0 else np.nan
            d = out[i, j+1] + curr if j+1 <= -1 else np.nan
            ld = out[i-1, j+1] + curr if i-1 >= 0 and j+1 <= -1 else np.nan
            out[i,j] = np.nanmin(l,d,ld)
    return out

def classify(template_mfcc, input_mfcc):
    out = []
    for idx, t_d_mfcc in enumerate(template_mfcc):
        d_mtx = dist_mtx(t_d_mfcc, input_mfcc)
        c_mtx = cost_mtx(d_mtx)
        optimal = min(min(c_mtx[0]) , min(c_mtx.T[0]))
        print("distance for digit",idx,":",optimal)
        out.append(optimal)
    return np.argmin(np.array(out))

In [172]:
IPython.display.display( IPython.display.Audio(  digits[1][0], rate=frate))
print("the input digit is",classify(template_mfcc, digits_mfcc[1][0]))

distance for digit 0 : 2.755798746423288e-05
distance for digit 1 : 0.053976628376300906
distance for digit 2 : 0.014786728610247213
distance for digit 3 : 0.11232750871541153
distance for digit 4 : 0.10697327337463225
distance for digit 5 : 0.11774841403711622
distance for digit 6 : 0.07967934597530335
distance for digit 7 : 0.06623807675372273
distance for digit 8 : 0.03877119153758801
distance for digit 9 : 0.16568087389980968
the input digit is 0


In [178]:
IPython.display.display( IPython.display.Audio(  digits[1][1], rate=frate))
print("the input digit is",classify(template_mfcc, digits_mfcc[1][1]))

distance for digit 0 : 0.016000007201896205
distance for digit 1 : 0.004331850480135624
distance for digit 2 : 0.008548319105048607
distance for digit 3 : 0.015297228118708817
distance for digit 4 : 0.031844973050082825
distance for digit 5 : 0.07179770726823864
distance for digit 6 : 0.019569080021159424
distance for digit 7 : 0.019043310608397035
distance for digit 8 : 0.018059097671554003
distance for digit 9 : 0.01474782855785306
the input digit is 1


In [176]:
IPython.display.display( IPython.display.Audio(  digits[1][2], rate=frate))
print("the input digit is",classify(template_mfcc, digits_mfcc[1][2]))

distance for digit 0 : 0.017677923034438492
distance for digit 1 : 0.017236474074362795
distance for digit 2 : 0.0
distance for digit 3 : 0.016385376198696067
distance for digit 4 : 0.03073318318021212
distance for digit 5 : 0.01165728955473011
distance for digit 6 : 0.01443806901940714
distance for digit 7 : 0.03761211096960715
distance for digit 8 : 0.03882128949608288
distance for digit 9 : 0.026386974050371714
the input digit is 2


In [179]:
IPython.display.display( IPython.display.Audio(  digits[1][3], rate=frate))
print("the input digit is",classify(template_mfcc, digits_mfcc[1][3]))

distance for digit 0 : 0.053386261594506546
distance for digit 1 : 0.020328584687729023
distance for digit 2 : 0.013376011012045552
distance for digit 3 : 0.0005516244993739594
distance for digit 4 : 0.027304896810860124
distance for digit 5 : 0.016148176738325137
distance for digit 6 : 0.05952932965215696
distance for digit 7 : 0.0686979325986472
distance for digit 8 : 0.030372872401200346
distance for digit 9 : 0.03176350381849535
the input digit is 3


In [180]:
IPython.display.display( IPython.display.Audio(  digits[1][4], rate=frate))
print("the input digit is",classify(template_mfcc, digits_mfcc[1][4]))

distance for digit 0 : 0.11093505266661197
distance for digit 1 : 0.011402285900974474
distance for digit 2 : 0.0633837068641887
distance for digit 3 : 0.018840068067012106
distance for digit 4 : 0.004199073300286127
distance for digit 5 : 0.016932269289036905
distance for digit 6 : 0.010567017448449012
distance for digit 7 : 0.010128343811423801
distance for digit 8 : 0.013150280112170898
distance for digit 9 : 0.004675119848637332
the input digit is 4


# Part 2

In [185]:
friends = ["orga_itsuka" , "minato_aqua" , "kagura_mea", "van darkholme"]
phones = ["2176662333", "21735221498", "2172342459", "1145141919"]

In [186]:
from pysoundcard import Stream

"""Loop back five seconds of audio data."""

fs = 44100
blocksize = 16
s = Stream(samplerate=fs, blocksize=blocksize)
s.start()
for n in range(int(fs*5/blocksize)):
    s.write(s.read(blocksize))
s.stop()

  % self._get_c_name())


KeyboardInterrupt: 