In [1]:
%load_ext autoreload
%autoreload 2
import os
import librosa
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import librosa.display
from scipy import signal

In [None]:
# Build the spectrogram bank: one entry per word folder found under PATH.
PATH = os.getcwd()                # dataset root; each sub-directory holds the clips of one word
SAMPLE_RATE = 16000               # every clip is resampled to this rate (Hz)
CLIP_SAMPLES = 16000              # every clip is cut / zero-padded to exactly this many samples
CLUSTER_COLS_PER_CLASS = 3200     # clustering budget: total columns kept <= this * folders seen so far

# Per-word integer counts; the next cell sums them to obtain the number of k-means clusters.
# NOTE(review): 'forward' and 'four' are 0 while every other word is >= 1 -- confirm this is intended.
Phonetic = {'backward' : 3,
            'bed' : 1,
            'bird' : 2,
            'cat' : 2,
            'dog' : 2,
            'down' : 1,
            'eight' : 2,
            'five' : 2,
            'follow' : 2,
            'forward' : 0,
            'four' : 0,
            'go' : 1,
            'happy' : 2,
            'house' : 2,
            'learn' : 1,
            'left' : 2,
            'marvin' : 2,
            'nine' : 1,
            'no' : 1,
            'off' : 2,
            'on' : 1,
            'one' : 1,
            'right' : 2,
            'seven' : 2,
            'sheila' : 2,
            'six' : 2,
            'stop' : 2,
            'three' : 2,
            'tree' : 1,
            'two' : 1,
            'up' : 2,
            'visual' : 2,
            'wow' : 1,
            'yes' : 2,
            'zero' : 2
           }


def log_mel(y, sr, n_mels):
    """Return the dB-scaled mel spectrogram of `y` (referenced to its own peak) with `n_mels` bands."""
    power = librosa.feature.melspectrogram(y = y, sr = sr, n_mels = n_mels)
    return librosa.power_to_db(power, ref=np.max)


Bank = dict()        # folder name -> {'sample': [...], 'sample_32': [...], 'sample_196': [...]}
Class_count = 0
cluster = None       # (256, n_columns) matrix of 256-band spectrogram columns used to fit k-means
count = 0            # number of word folders processed so far
cluster_parts = []   # 256-band spectrograms selected for clustering, in load order
cluster_cols = 0     # total number of columns currently held in cluster_parts
for folder in os.listdir(PATH):
    folder_path = os.path.join(PATH, folder)
    # Only plain directories without a dot in their name are treated as word folders.
    if '.' in folder or not os.path.isdir(folder_path):
        continue
    print(folder, '....')
    count += 1
    Bank[folder] = {'sample': [], 'sample_32': [], 'sample_196': []}
    for file in os.listdir(folder_path):
        if not file.endswith('.wav'):
            continue
        y, sr = librosa.load(os.path.join(folder_path, file), sr = SAMPLE_RATE)
        # Force a fixed length: truncate long clips first (np.pad rejects a negative
        # pad width), then zero-pad short ones at the end.
        y = y[:CLIP_SAMPLES]
        y = np.pad(y, (0, CLIP_SAMPLES - y.size))
        spec = log_mel(y, sr, 256)
        # Keep this clip for clustering only while the cumulative budget allows it.
        if cluster_cols < CLUSTER_COLS_PER_CLASS * count:
            cluster_parts.append(spec)
            cluster_cols += spec.shape[1]
        Bank[folder]['sample'].append(spec)
        Bank[folder]['sample_32'].append(log_mel(y, sr, 32))
        Bank[folder]['sample_196'].append(log_mel(y, sr, 196))
    if cluster_parts:
        # Stack once per folder instead of once per clip; the reversed order keeps the
        # newest clip in the left-most columns, as repeated hstack((spec, cluster)) did.
        cluster = np.hstack(cluster_parts[::-1])
        print('Done', cluster.shape)
    else:
        print('Done', None)

cat ....
Done (256, 3200)
dog ....
Done (256, 6400)
marvin ....
Done (256, 9600)
forward ....


In [None]:
# Fit k-means on the collected 256-band spectrogram columns, then label every frame of
# every clip and split the clips of each word into train / test lists.

# Number of clusters = sum of the per-word counts declared in Phonetic.
Class_count = sum(Phonetic.values())
kmeans = KMeans(n_clusters=Class_count, random_state=0).fit(cluster.transpose())

TEST_FRACTION = .01                      # share of each word's clips held out for testing
split_rng = np.random.default_rng(0)     # seeded so the split is identical on every run

Train_Targets = list()
Train32 = list()
Train196 = list()
Test_Targets = list()
Test32 = list()
Test196 = list()
# Iterate over Bank itself (not the directory listing) so only folders that were
# actually loaded are visited.
for folder, bank in Bank.items():
    n_samples = len(bank['sample'])
    if n_samples == 0:
        continue
    # Draw distinct indices (no replacement) so the test set really holds
    # int(n_samples * TEST_FRACTION) different clips; a set gives O(1) lookups.
    test_idx = set(split_rng.choice(n_samples, int(n_samples * TEST_FRACTION), replace=False).tolist())
    for i in range(n_samples):
        # One cluster id per spectrogram column (time frame) of the 256-band version.
        label = kmeans.predict(bank['sample'][i].transpose())
        if i in test_idx:
            Test_Targets.append(label)
            Test32.append(bank['sample_32'][i])
            Test196.append(bank['sample_196'][i])
        else:
            Train_Targets.append(label)
            Train32.append(bank['sample_32'][i])
            Train196.append(bank['sample_196'][i])

print('Done')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Four-layer fully connected classifier with sigmoid hidden activations.

    Args:
        in_features: size of each input vector.
        out_features: number of output classes.
        unit_size: width of the three hidden layers.
    """

    def __init__(self, in_features, out_features, unit_size = 196):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(in_features, unit_size)
        self.fc2 = nn.Linear(unit_size, unit_size)
        self.fc3 = nn.Linear(unit_size, unit_size)
        self.fc4 = nn.Linear(unit_size, out_features)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, out_features) raw class scores.

        The scores are unnormalised logits: nn.CrossEntropyLoss applies log-softmax
        itself, so normalising here would apply softmax twice during training.
        argmax over dim 1 is unaffected; call softmax on the result if probabilities
        are needed.
        """
        # torch.sigmoid replaces the deprecated F.sigmoid.
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return self.fc4(x)

# Model, loss and optimiser set-up followed by the training loop.
N_FEATURES = 196      # mel bands per frame in the *_196 spectrograms fed to the network
POOLED_FRAMES = 8     # each clip is reduced to this many time steps for training

device = torch.device('cpu')
model = Net(N_FEATURES, Class_count).to(device)
print(model)

# nn.CrossEntropyLoss applies log-softmax internally and takes integer class targets.
criterion = nn.CrossEntropyLoss()

params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(params, lr = .001, momentum = 0.9)

# NOTE(review): this scheduler is constructed but never stepped. Stepping it once per
# epoch would multiply the learning rate by 0.1 every 3 epochs (33 times in 100 epochs),
# which effectively stops training -- confirm the intended schedule before enabling it.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 3, gamma = .1)

num_epoches = 100


def frame_accuracy(net, samples, targets):
    """Return (correct, total) frame-level predictions of `net` over `samples`.

    Each sample is a (features, frames) array and each target holds one class id per frame.
    """
    correct = 0
    total = 0
    with torch.no_grad():   # no gradients are needed for evaluation
        for sample, label in zip(samples, targets):
            frames = torch.tensor(sample.transpose().astype('float32')).to(device)
            predicted = net(frames).argmax(dim = 1).cpu().numpy()
            correct += int(np.sum(predicted == np.asarray(label)))
            total += len(predicted)
    return correct, total


def pool_example(sample, label):
    """Reduce one clip to POOLED_FRAMES time steps.

    Consecutive frames are averaged, and the most frequent label inside the same
    group of frames becomes the target, so inputs and targets stay aligned.
    """
    frames = sample.transpose()                                     # (frames, features)
    pooled = frames.reshape(POOLED_FRAMES, -1, frames.shape[1]).mean(axis = 1)
    groups = np.reshape(np.asarray(label), (POOLED_FRAMES, -1))
    pooled_label = [np.bincount(group).argmax() for group in groups]
    inputs = torch.tensor(pooled.astype('float32')).to(device)
    target = torch.LongTensor(pooled_label).to(device)
    return inputs, target


# Report accuracy on the held-out clips; fall back to the training clips when the
# held-out split is empty (it is only 1% of each word's clips).
if len(Test196) > 0:
    eval_samples, eval_targets = Test196, Test_Targets
else:
    eval_samples, eval_targets = Train196, Train_Targets

for epoch in range(num_epoches):
    if epoch % 10 == 0 and epoch != 0:
        print('Evaluating...')
        model.eval()
        correct, total = frame_accuracy(model, eval_samples, eval_targets)
        print('Accuracy:', correct * 100 / max(total, 1),'%')
    print('Training...')
    model.train()
    loss_log = 0.0
    for sample, label in zip(Train196, Train_Targets):
        inputs, target = pool_example(sample, label)
        optimizer.zero_grad()              # clear old gradients BEFORE the backward pass
        output = model(inputs)
        loss = criterion(output, target)   # loss on the model output, not on the input
        loss.backward()
        optimizer.step()
        loss_log += loss.item()            # plain float: does not keep the autograd graph alive
    # Mean loss per training example in this epoch.
    print('Epoch:{}, loss:{}'.format(epoch, loss_log / max(len(Train196), 1)))
                
