# SD207 Challenge Acoustic scene classification
#### WEI Chen, LUO Xi
#### Based on Python3.5

In [2]:
import numpy as np
import math
import librosa
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
import time
from sklearn.preprocessing import StandardScaler, minmax_scale, scale
import pickle
from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy import stats
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.decomposition import NMF, PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import Perceptron
import nimfa
from sklearn.mixture import GaussianMixture



divide into short frames
FT
abstract MFCC as the training features
13-20 coeffs
late fusion
    :one file containing serveral frames, which corresponds to a same scene

## Import data

### Read filename

In [10]:
txt_train = np.loadtxt("audio/train.txt",dtype=bytes).astype(str)
fn_train = txt_train[:,0]
labels_train = txt_train[:,1]
u,labels_train =  np.unique(labels_train, return_inverse=True)

txt_dev = np.loadtxt("audio/dev.txt",dtype=bytes).astype(str)
fn_dev = txt_dev[:,0]
labels_dev = txt_dev[:,1]
u,labels_dev =  np.unique(labels_dev, return_inverse=True)

fn_test = np.loadtxt("audio/test.txt",dtype=bytes).astype(str)

### Extract MFCC

In [89]:
def extract_mfcc(fn, labels):
    for i,filename in enumerate(fn):
        print("Reading %d-th audio file..." %i)
        s,sr = librosa.load(filename)
        if i==0:
            X = librosa.feature.mfcc(s, n_fft=512, hop_length=512, n_mfcc=20).T
            nframe = X.shape[0]
            y = labels[i]*np.array([1]*nframe)
        else:
            X = np.r_[X, librosa.feature.mfcc(s, n_fft=512, hop_length=512, n_mfcc=20).T]
            y = np.concatenate((y,labels[i]*np.array([1]*nframe)) , axis=0)
    return X, y

In [9]:
X_train, y_train = extract_mfcc(fn_train, labels_train)
X_dev, y_dev = extract_mfcc(fn_dev, labels_dev)

Reading 0-th audio file...
Reading 1-th audio file...
Reading 2-th audio file...
Reading 3-th audio file...


KeyboardInterrupt: 

### Extract CQT

In [90]:
def extract_cqt(fn, labels):
    M=30
    for i,filename in enumerate(fn):
        print("Reading %d-th audio file..." %i)
        s,sr = librosa.load(filename)
        s = scale(s)
        if i==0:
            cqt = librosa.cqt(s, sr=sr, hop_length=512, fmin=5, n_bins=24*11, bins_per_octave =24).T
            nframe = cqt.shape[0]
            for m in range(M):
                if m==0:
                    X = np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])
                    y = labels[i]*np.ones(1)
                else:
                    X = np.r_[X, np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])]
                    y = np.append(y,labels[i])
        else:
            cqt = librosa.cqt(s, sr=sr, hop_length=512, fmin=5, n_bins=24*11, bins_per_octave =24).T
            for m in range(M):
                X = np.r_[X, np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])]
                y = np.append(y,labels[i])
    return X, y

In [91]:
X_train, y_train = extract_cqt(fn_train, labels_train)
X_dev, y_dev = extract_cqt(fn_dev, labels_dev)

Reading 0-th audio file...
Reading 1-th audio file...
Reading 2-th audio file...
Reading 3-th audio file...
Reading 4-th audio file...
Reading 5-th audio file...
Reading 6-th audio file...
Reading 7-th audio file...
Reading 8-th audio file...
Reading 9-th audio file...
Reading 10-th audio file...




Reading 11-th audio file...
Reading 12-th audio file...
Reading 13-th audio file...
Reading 14-th audio file...
Reading 15-th audio file...
Reading 16-th audio file...
Reading 17-th audio file...
Reading 18-th audio file...
Reading 19-th audio file...
Reading 20-th audio file...
Reading 21-th audio file...
Reading 22-th audio file...
Reading 23-th audio file...
Reading 24-th audio file...
Reading 25-th audio file...
Reading 26-th audio file...
Reading 27-th audio file...
Reading 28-th audio file...
Reading 29-th audio file...
Reading 30-th audio file...
Reading 31-th audio file...
Reading 32-th audio file...
Reading 33-th audio file...
Reading 34-th audio file...
Reading 35-th audio file...
Reading 36-th audio file...
Reading 37-th audio file...
Reading 38-th audio file...
Reading 39-th audio file...
Reading 40-th audio file...
Reading 41-th audio file...
Reading 42-th audio file...
Reading 43-th audio file...
Reading 44-th audio file...
Reading 45-th audio file...
Reading 46-th audio 

### Test set

In [14]:
fn_test = np.loadtxt("audio/test.txt",dtype=bytes).astype(str)
for i,filename in enumerate(fn_test):
    y,sr = librosa.load(filename)
    if i==0:
        X_test = librosa.feature.mfcc(y=y, n_fft=512, hop_length=512, n_mfcc=20).T
    else:
        X_test = np.r_[X_test, librosa.feature.mfcc(y=y, n_fft=512, hop_length=512, n_mfcc=20).T]

In [17]:
fn_test = np.loadtxt("audio/test.txt",dtype=bytes).astype(str)
M=30
for i,filename in enumerate(fn_test):
    print("Reading %d-th audio file..." %i)
    s,sr = librosa.load(filename)
    s = scale(s)
    if i==0:
        cqt = librosa.cqt(s, sr=sr, hop_length=512, fmin=5, n_bins=24*11, bins_per_octave =24).T
        nframe = cqt.shape[0]
        for m in range(M):
            if m==0:
                X_test = np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])
            else:
                X_test = np.r_[X_test, np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])]
    else:
        cqt = librosa.cqt(s, sr=sr, hop_length=512, fmin=5, n_bins=24*11, bins_per_octave =24).T
        for m in range(M):
            X_test = np.r_[X_test, np.mean(cqt[m*int(nframe/M):(m+1)*int(nframe/M),:],0).reshape([1,cqt.shape[1]])]

Reading 0-th audio file...
Reading 1-th audio file...
Reading 2-th audio file...




Reading 3-th audio file...
Reading 4-th audio file...
Reading 5-th audio file...
Reading 6-th audio file...
Reading 7-th audio file...
Reading 8-th audio file...
Reading 9-th audio file...
Reading 10-th audio file...
Reading 11-th audio file...
Reading 12-th audio file...
Reading 13-th audio file...
Reading 14-th audio file...
Reading 15-th audio file...
Reading 16-th audio file...
Reading 17-th audio file...
Reading 18-th audio file...
Reading 19-th audio file...
Reading 20-th audio file...
Reading 21-th audio file...
Reading 22-th audio file...
Reading 23-th audio file...
Reading 24-th audio file...
Reading 25-th audio file...
Reading 26-th audio file...
Reading 27-th audio file...
Reading 28-th audio file...
Reading 29-th audio file...
Reading 30-th audio file...
Reading 31-th audio file...
Reading 32-th audio file...
Reading 33-th audio file...
Reading 34-th audio file...
Reading 35-th audio file...
Reading 36-th audio file...
Reading 37-th audio file...
Reading 38-th audio file...

### Save data

In [18]:
pickle.dump(X_train, open('X_train_cqt.txt', 'wb'))
pickle.dump(y_train, open('y_train_cqt.txt', 'wb'))
pickle.dump(X_dev, open('X_dev_cqt.txt', 'wb'))
pickle.dump(y_dev, open('y_dev_cqt.txt', 'wb'))
pickle.dump(X_test, open('X_test_cqt.txt', 'wb'))

### Reload data

In [3]:
X_train = pickle.load(open('X_train_cqt.txt', 'rb'))
y_train = pickle.load(open('y_train_cqt.txt', 'rb'))
X_dev = pickle.load(open('X_dev_cqt.txt', 'rb'))
y_dev = pickle.load(open('y_dev_cqt.txt', 'rb'))
X_test = pickle.load(open('X_test_cqt.txt', 'rb'))

X_select = pickle.load(open('X_select_cqt.txt', 'rb'))
y_select = pickle.load(open('y_select_cqt.txt', 'rb'))

In [73]:
# X_train = pickle.load(open('X_train_mfcc.txt', 'rb'))
# y_train = pickle.load(open('y_train_mfcc.txt', 'rb'))
# X_dev = pickle.load(open('X_dev_mfcc.txt', 'rb'))
# y_dev = pickle.load(open('y_dev_mfcc.txt', 'rb'))
X_test = pickle.load(open('X_test_mfcc.txt', 'rb'))

## Normalisation

In [19]:
X_train=np.abs(X_train)
X_dev=np.abs(X_dev)
X_test=np.abs(X_test)

In [20]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_dev = scaler.fit_transform(X_dev)
X_test = scaler.fit_transform(X_test)

In [86]:
# X_train = np.r_[X_train,X_dev]
y_train = np.r_[y_train,y_dev]
y_train.shape

(26160,)

## Data deduction via KNN

In [None]:
# KNN with 10 nearest neighbors

neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(X_select)
s=neigh.kneighbors(X_test, return_distance=False)
select=list([])
for i in s:
    for j in i:
        select.append(j)
select=list(set(select))
X_select=X_select[select,:]
y_select=y_select[select]

pickle.dump(X_select, open('X_select.txt', 'wb'))
pickle.dump(y_select, open('y_select.txt', 'wb'))
print("we've selected %d samples from entire X_train" % X_select.shape[0])


## Data deduction via KNN

In [87]:
begin = time.time()
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(X_train, y_train)
# print(gmm.predict_proba(X_train))
X_select = X_train[neigh.predict(X_train)==y_train,:]
y_select = y_train[neigh.predict(X_train)==y_train]
print ("done in %0.3fs." % (time.time() - begin))

done in 827.520s.


In [88]:
pickle.dump(X_select, open('X_select_cqt.txt', 'wb'))
pickle.dump(y_select, open('y_select_cqt.txt', 'wb'))

In [89]:
# y_train[:10000]
# print(np.argmax(gmm.predict_proba(X_train[:300]) , axis=1))
# print(neigh.score(X_train,y_train))
# X_train.shape
X_select.shape
# neigh.predict(X_train)==y_train

(20056, 264)

## Classifier

In [125]:
# Optimise parameters
clf = MLPClassifier(solver='adam',validation_fraction=0.3)

clf = RandomizedSearchCV(clf, param_distributions={
#         'alpha': stats.uniform(0.005,0.01),
        'hidden_layer_sizes':[(240,),(260,),(300,),(350,)]})

n,p = X_train.shape
# ind = np.random.permutation(n)
clf.fit(X_train[ind][:10000], y_train[ind][:10000])
print(clf.best_estimator_)



MLPClassifier(activation='relu', alpha=0.014739811748471023,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(260,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.3,
       verbose=False, warm_start=False)




### MLP

In [16]:
# Multi-layer Perceptron Classifier
clf = MLPClassifier(activation='relu', alpha=0.014739811748471023,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(91,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.3,
       verbose=False, warm_start=False)
# clf=BaggingClassifier(base_estimator=clf, n_estimators=10)

# n,p = X_train.shape
# ind=np.random.permutation(n)
# X_train=X_train[ind]
# y_train=y_train[ind]
begin = time.time()
# clf.fit(X_train, y_train)
clf.fit(X_select, y_select)
# clf.fit(H_train,y_train)
cost = time.time() - begin

print('Training time: % s seconds' % cost)

Training time: 16.561410903930664 seconds


### Perceptron

In [57]:
clf = Perceptron(n_iter=100)
begin = time.time()
clf.fit(np.abs(X_train), y_train)
cost = time.time() - begin
print('Training time: % s seconds' % cost)

Training time: 12.841187953948975 seconds


### Boosting

### Bagging

In [91]:
clf = BaggingClassifier(base_estimator=clf, n_estimators=20, max_samples =0.7)
begin = time.time()
clf.fit(X_train, y_train)
cost = time.time() - begin

print('Training time: % s seconds' % cost)

print(clf.score(X_dev, y_dev))

Training time: 308.9609100818634 seconds
0.977586206897


In [79]:
clf_boost = AdaBoostClassifier(base_estimator=clf, n_estimators=50, algorithm ='SAMME')
begin = time.time()
clf_boost.fit(X_train[ind][:20000], y_train[ind][:20000])
cost = time.time() - begin

print('Training time: % s seconds' % cost)

print(clf_boost.score(X_dev, y_dev))

ValueError: MLPClassifier doesn't support sample_weight.

### Sparse NMF

In [124]:
snmf = nimfa.Snmf(X_train.T, seed="random_c", rank=128, max_iter=12, version='r', eta=1.,
                  beta=1e-4, i_conv=10, w_min_change=0)
snmf_fit = snmf()

  np.mat(2 ** np.array(list(range(l_var - 1, -1, -1)))), p_set)


KeyboardInterrupt: 

In [119]:
snmf = nimfa.Snmf(X_dev.T, seed="random_c", rank=10, max_iter=12, version='r', eta=1.,
                  beta=1e-4, i_conv=10, w_min_change=0)
snmf_fit = snmf()
H_dev = np.array(snmf_fit.coef()).T

  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)
  out = N.ndarray.__getitem__(self, index)


### Majority Vote

In [8]:
def majority_vote(ys, population):
    y = np.zeros(ys.shape[0]/population)
    for i in range(y.shape[0]):
        y[i]=np.argmax(np.bincount(ys[i*population:(i+1)*population]))
    return y.astype(int)

In [23]:
print(clf.score(X_dev, y_dev))
y_frame = clf.predict(X_dev)
y_pred = majority_vote(y_frame.astype(int), X_dev.shape[0]/fn_dev.shape[0] )
print(y_pred[y_pred-labels_dev==0].shape[0]/y_pred.shape[0])

0.846551724138
0.9482758620689655


  from ipykernel import kernelapp as app


In [25]:
print(clf.score(X_dev, y_dev))
clf.predict_proba(X_dev).shape

0.846551724138


(8700, 15)

cqt with all samples, 0.43 0.55, 52s
cqt+bagging 0.7,10, 0.44, 0.60, 0.68 349s
cqt+KNNreduction+bagging 0.47, 0.62
cqt+dev+KNNreduction+mlp+bagging 0.75
mfcc

### Prediction for test set

In [93]:
y_frame=clf.predict(X_test)
y_pred=majority_vote(y_frame.astype(int), X_test.shape[0]/fn_test.shape[0] )
np.savetxt('y_pred.txt', y_pred, fmt='%d')

  from ipykernel import kernelapp as app
