In [1]:
#libraries
%matplotlib notebook

import wavio
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from python_speech_features import mfcc
from IPython.core.display import clear_output



In [2]:
sess = tf.Session()

#### Input Preprocessing
* d0.wav and d1.wav contain sound produced by the footsteps of two distinct people.
* Preprocessed using audacity to remove silence.
* Imported and converted to MFCC vectors using suitable fixed window. Tune window parameters for better performance.
* Concatenated into one training vector.
* Standardized to $ \mu = 0, \sigma = 1$.

In [3]:
x0 = wavio.read('at7/d0.wav').data; x1 = wavio.read('at7/d1.wav').data
l0 = mfcc(x0[:,0],44100,0.2,0.1); l1 = mfcc(x1[:,0],44100,0.2,0.1)
fx = np.concatenate((l0,l1),0)
train_x = np.array(fx); mu = np.mean(train_x); sigma = np.std(train_x); #data standardization
train_x = (train_x - mu + 0.0)/sigma

num_classes = 2;
n_train = fx.shape[0]
n0 = l0.shape[0]
input_size = l0.shape[1]
x = tf.placeholder(tf.float32, [None, input_size])

#### Hyperparameters
* Adjust hidden_shape to add layers or neurons within the layers. Eg: [100,2,100] <=> 3 hidden layers with 100,2,100 neurons in the 1st,2nd,3rd layer respectively.
* Edit act_fns, code_lyr according to hidden_shape. Here the sparse representation is hidden layer no 2.

In [4]:
hidden_shape = [100,2,100] #No of neurons in each hidden layer
act_fns = ['tanh','sig','tanh'] #Activation functions of layers
code_lyr = 2 #index of hidden layer containing sparse vector
num_lyr = len(hidden_shape) #No of hidden layers
net_shape = [input_size] + hidden_shape + [input_size]
learning_rate = tf.placeholder(tf.float32,[])
beta = tf.placeholder(tf.float32,[]) #sparsity_penalty weight

#### Parameters
* Xavier Intialization (Gaussian distribution) is used on the weights and biases.

In [5]:
params = {'W':{},'b':{}} #W: Weights, b: biases
for lay_num in range(num_lyr+1):
    params['W'][lay_num] = tf.Variable(tf.random_normal([net_shape[lay_num], net_shape[lay_num+1]],stddev = 0.5))
    params['b'][lay_num] = tf.Variable(tf.random_normal([net_shape[lay_num+1]],stddev = 0.5))

#### Network Graph
* Cost = Recontruction error + beta*Sparsity penalty
* Recontruction error is the L2 norm between the input and output (reconstructed) vector. This is usual for continuous input autoencoders
* The sparsity penalty is nonstandard. It is applied on the sparse 2-vector. Since it is output from a sigmoid layer, it lies in unit square centered at (0.5,0.5). A decreasing function of the distance from the line y=x is used. This ensures minima that are furthest apart in 2D space [At (1,0) and (0,1)]. Modify this function according to the output space of the 2-vector layer.
* The intuition behind the setup is that if the input contains MFCCs from two distinct people (i.e feature values from distinct classes), they should settle at the two distinct minima. This is because settling farther away from each other allows greater freedom in representational power for reconstruction. 
* This also allows use to deterministically say where the clusters will converge. One class should converge above the line y = x and another, below.

In [6]:
#network graph
def act_fn(a,lay_num):
    if act_fns[lay_num] == 'tanh':
        return tf.tanh(a)
    elif act_fns[lay_num] == 'relu':
        return tf.nn.relu(a)
    elif act_fns[lay_num] == 'elu':
        return tf.nn.elu(a)
    elif act_fns[lay_num] == 'sig':
        return tf.sigmoid(a)
    
def AutoEncoder(x,params):
    z = x
    for lay_num in range(num_lyr):
        z = act_fn(tf.add(tf.matmul(z,params['W'][lay_num]),params['b'][lay_num]),lay_num)
        if lay_num == code_lyr-1:
            corner_pusher = -tf.reduce_mean(tf.abs(tf.sub(z[:,0],z[:,1])))/np.sqrt(2) #distance from line y = x
            sparsity_penalty = corner_pusher
            code = z
    x_ = tf.add(tf.matmul(z,params['W'][lay_num+1]),params['b'][lay_num+1])
    cost = tf.sqrt(tf.reduce_mean(tf.squared_difference(x,x_))) + tf.mul(beta,sparsity_penalty)
    return (cost,code)

#### Graph Construction
* Ordinary Gradient descent optimization is used. Other optimizers may be considered to automate the learning process.

In [7]:
cost,code = AutoEncoder(x,params)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

In [8]:
#parameter initialization
sess.run(tf.initialize_all_variables())

#### Learning
* The model should have sufficient representational power:
    * The size of the sparse vector should scale with the number of clusters (and hence classes) we want to detect. Here num_classes = 2. Size(sparse vector) = 2 proved to be sufficient.
    * Set beta = 0 and adjust the size of other layers until the reconstruction error is low.
* Beta should be small enough that it doesn't substantitally affect the recontructive capability but still pushes the points to the desired peripheries.

In [9]:
#learning
n_iter = 0
while True:
    try:
        _,cost_,code_ = sess.run([optimizer,cost,code],feed_dict={x:train_x,learning_rate:0.1,beta:0.05})
        if n_iter % 1000 == 0:
            print('iter'+str(n_iter)+' cost:'+str(cost_))
        if n_iter+1 % 20000 == 0:
            clear_output()
        n_iter += 1
    except KeyboardInterrupt:
        print "Training is stopped"
        break
sess.close()

iter0 cost:3.25783
iter1000 cost:0.591729
iter2000 cost:0.563133
iter3000 cost:0.550632
iter4000 cost:0.543199
iter5000 cost:0.537759
iter6000 cost:0.53337
iter7000 cost:0.529677
iter8000 cost:0.52668
iter9000 cost:0.524267
iter10000 cost:0.522296
iter11000 cost:0.520663
iter12000 cost:0.519284
iter13000 cost:0.51808
iter14000 cost:0.516995
iter15000 cost:0.515976
iter16000 cost:0.514992
iter17000 cost:0.514021
iter18000 cost:0.513083
iter19000 cost:0.512173
iter20000 cost:0.511282
iter21000 cost:0.510408
iter22000 cost:0.509533
iter23000 cost:0.508614
iter24000 cost:0.507711
iter25000 cost:0.506863
iter26000 cost:0.506068
iter27000 cost:0.505313
iter28000 cost:0.504581
iter29000 cost:0.503873
iter30000 cost:0.503176
iter31000 cost:0.50247
iter32000 cost:0.50175
iter33000 cost:0.500994
iter34000 cost:0.500136
iter35000 cost:0.499139
iter36000 cost:0.498044
iter37000 cost:0.496974
iter38000 cost:0.497963
iter39000 cost:0.498071
iter40000 cost:0.497936
iter41000 cost:0.497835
iter42000 c

In [10]:
#plots
plt.title('Sparse Vectors'); plt.xlabel('Dimension 1'); plt.ylabel('Dimension 2');
plt.scatter(code_[n0:][:,0],code_[n0:][:,1],c=[1,0,0], label = 'Friend')
plt.scatter(code_[:n0][:,0],code_[:n0][:,1],c=[0,0,1], label = 'Me')
plt.legend()
plt.plot([0,1],[0,1],c=[0,0,0],linewidth=2.0)
plt.show()

<IPython.core.display.Javascript object>

In [11]:
#prediction confidence and classification
np.set_printoptions(threshold=np.inf)
np.cumsum((code_[n0:,0]< code_[n0:,1])+0.0)/(range(1,l1.shape[0]+1))

array([ 1.        ,  1.        ,  1.        ,  0.75      ,  0.8       ,
        0.83333333,  0.71428571,  0.75      ,  0.66666667,  0.6       ,
        0.54545455,  0.58333333,  0.53846154,  0.57142857,  0.53333333,
        0.5       ,  0.47058824,  0.5       ,  0.52631579,  0.55      ,
        0.52380952,  0.5       ,  0.52173913,  0.54166667,  0.56      ,
        0.53846154,  0.55555556,  0.57142857,  0.5862069 ,  0.56666667,
        0.5483871 ,  0.5625    ,  0.54545455,  0.52941176,  0.54285714,
        0.52777778,  0.51351351,  0.52631579,  0.53846154,  0.525     ,
        0.51219512,  0.5       ,  0.51162791,  0.5       ,  0.48888889,
        0.5       ,  0.5106383 ,  0.5       ,  0.48979592,  0.48      ,
        0.49019608,  0.5       ,  0.49056604,  0.48148148,  0.49090909,
        0.5       ,  0.49122807,  0.48275862,  0.49152542,  0.48333333,
        0.49180328,  0.5       ,  0.50793651,  0.5       ,  0.50769231,
        0.5       ,  0.49253731,  0.48529412,  0.49275362,  0.5 

In [12]:
np.cumsum((code_[:n0,0] > code_[:n0,1])+0.0)/(range(1,l0.shape[0]+1))

array([ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  0.875     ,  0.88888889,  0.9       ,
        0.90909091,  0.83333333,  0.84615385,  0.85714286,  0.86666667,
        0.875     ,  0.88235294,  0.88888889,  0.89473684,  0.9       ,
        0.9047619 ,  0.86363636,  0.82608696,  0.83333333,  0.84      ,
        0.84615385,  0.85185185,  0.85714286,  0.86206897,  0.86666667,
        0.87096774,  0.875     ,  0.84848485,  0.85294118,  0.85714286,
        0.86111111,  0.86486486,  0.86842105,  0.87179487,  0.875     ,
        0.87804878,  0.88095238,  0.88372093,  0.86363636,  0.86666667,
        0.84782609,  0.85106383,  0.85416667,  0.85714286,  0.86      ,
        0.8627451 ,  0.86538462,  0.86792453,  0.87037037,  0.87272727,
        0.875     ,  0.87719298,  0.86206897,  0.86440678,  0.85      ,
        0.85245902,  0.83870968,  0.84126984,  0.84375   ,  0.83076923,
        0.83333333,  0.8358209 ,  0.83823529,  0.84057971,  0.84