In [1]:
#libraries
#%matplotlib notebook

import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from IPython.core.display import clear_output
from tensorflow.examples.tutorials.mnist import input_data



In [2]:
sess = tf.InteractiveSession()

#### Input Preprocessing
* d0.wav and d1.wav contain sound produced by the footsteps of two distinct people.
* Preprocessed using audacity to remove silence.
    * Galaxy S7 Edge, default recorder, high quality, mono.
    * Audacity settings:
        * Minimum silence: 1ms
        * Maximum silence: 5000ms
        * Compression ratio: 1000:1
        * Siulence level: -45dB
* Imported and converted to MFCC vectors using suitable fixed window. Tune window parameters for better performance.
* Concatenated into one training vector.
* Standardized to $ \mu = 0, \sigma = 1$.

In [3]:
mnist = input_data.read_data_sets("MNIST_data/")
train_x = np.concatenate((mnist.train.images[mnist.train.labels == 0],mnist.train.images[mnist.train.labels == 1]))
# train_x = np.array(train_x); mu = np.mean(train_x); sigma = np.std(train_x); #data standardization
# train_x = (train_x - mu + 0.0)/sigma

num_classes = 2;
n_train = train_x.shape[0]
n0 = np.sum(mnist.train.labels == 0)
input_size = train_x.shape[1]
x = tf.placeholder(tf.float32, [None, input_size])
momentum = tf.placeholder(tf.float32,[])

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


#### Hyperparameters
* Adjust hidden_shape to add layers or neurons within the layers. Eg: [100,2,100] <=> 3 hidden layers with 100,2,100 neurons in the 1st,2nd,3rd layer respectively.
* Edit act_fns, code_lyr according to hidden_shape. Here the sparse representation is hidden layer no 2.

In [45]:
hidden_shape = [50,2,50] #No of neurons in each hidden layer
act_fns = ['tanh','tanh','tanh'] #Activation functions of layers
code_lyr = 2 #index of hidden layer containing sparse vector
num_lyr = len(hidden_shape) #No of hidden layers
net_shape = [input_size] + hidden_shape + [input_size]
learning_rate = tf.placeholder(tf.float32,[])
beta = tf.placeholder(tf.float32,[]) #sparsity_penalty weight

In [47]:
#parameters
params = {'W':{},'b':{}} #W: Weights, b: biases
for lay_num in range(num_lyr+1):
    params['W'][lay_num] = tf.Variable(tf.random_normal([net_shape[lay_num], net_shape[lay_num+1]],stddev = 0.1),name='weights_layer'+str(lay_num))
    params['b'][lay_num] = tf.Variable(tf.random_normal([net_shape[lay_num+1]],stddev = 0.1),name='biases_layer'+str(lay_num))

#### Network Graph
* Cost = Recontruction error + beta*Sparsity penalty
* Recontruction error is the L2 norm between the input and output (reconstructed) vector. This is usual for continuous input autoencoders
* The sparsity penalty is nonstandard. It is applied on the "sparse" 2-vector. Since it is output from a sigmoid layer, it lies in unit square centered at (0.5,0.5). A decreasing function of the distance from the line y=x is used. This ensures minima that are furthest apart in 2D space [At (1,0) and (0,1)]. Modify this function according to the output space of the 2-vector layer.
* The intuition behind the setup is that if the input contains MFCCs from two distinct people (i.e feature values from distinct classes), they should settle at the two distinct minima. This is because settling farther away from each other allows greater freedom in representational power for reconstruction. 
* This also allows use to deterministically say where the clusters will converge. One class should converge above the line y = x and another, below.

In [48]:
#network graph
def act_fn(a,lay_num):
    if act_fns[lay_num] == 'tanh':
        return tf.tanh(a)
    elif act_fns[lay_num] == 'relu':
        return tf.nn.relu(a)
    elif act_fns[lay_num] == 'elu':
        return tf.nn.elu(a)
    elif act_fns[lay_num] == 'sig':
        return tf.sigmoid(a)
    
def AutoEncoder(x,params):
    z = x
    for lay_num in range(num_lyr):
        z = act_fn(tf.add(tf.matmul(z,params['W'][lay_num]),params['b'][lay_num]),lay_num)
        if lay_num == code_lyr-1:
            corner_pusher = -tf.reduce_mean(tf.abs(tf.sub(z[:,0],z[:,1])))/np.sqrt(2) #distance from line y = x
            sparsity_penalty = corner_pusher
            code = z
    x_ = tf.add(tf.matmul(z,params['W'][lay_num+1]),params['b'][lay_num+1])
    cost = tf.sqrt(tf.reduce_mean(tf.squared_difference(x,x_))) #+ tf.mul(beta,sparsity_penalty)
    return (cost,code,x_)

#### Graph Construction
* Ordinary gradient descent optimization is used. Other optimizers may be considered to automate the learning process better.

In [49]:
cost,code,x_ = AutoEncoder(x,params)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
save_vars = {}
for lay_num in range(num_lyr):
    save_vars['weights_layer'+str(lay_num)] = params['W'][lay_num]
    save_vars['biases_layer'+str(lay_num)] = params['b'][lay_num]
saver = tf.train.Saver(save_vars)
restore_vars = {}
if num_lyr > 1:
    for lay_num in range(num_lyr-1):
        restore_vars['weights_layer'+str(lay_num)] = params['W'][lay_num]
        restore_vars['biases_layer'+str(lay_num)] = params['b'][lay_num]
    restorer = tf.train.Saver(restore_vars)

In [57]:
#parameter initialization
sess.run(tf.initialize_all_variables())
if num_lyr > 1:
    restorer.restore(sess,'saved_vars')

#### Learning
* The model should have sufficient representational power:
    * The size of the sparse vector should scale with the number of clusters (and hence classes) we want to detect. Here num_classes = 2. Size(sparse vector) = 2 proved to be sufficient.
    * Set beta = 0 and adjust the size of other layers until the reconstruction error is low.
* Beta should be small enough that it doesn't substantitally affect the recontructive capability but still pushes the points to the desired peripheries.

In [None]:
#learning
n_iter = 0
while True:
    try:
        _,cost_,code_,x__ = sess.run([optimizer,cost,code,x_],feed_dict={x:train_x,learning_rate:3.0,beta:1.0})
        if n_iter % 10 == 0:
            print('iter'+str(n_iter)+' cost:'+str(cost_))
        if n_iter+1 % 20 == 0:
            clear_output()
        n_iter += 1
    except KeyboardInterrupt:
        saver.save(sess,'saved_vars')
        print "Training is stopped"
        break

iter0 cost:0.19651
iter10 cost:0.196186
iter20 cost:0.195896
iter30 cost:0.195637
iter40 cost:0.195406
iter50 cost:0.195203
iter60 cost:0.195024
iter70 cost:0.194867
iter80 cost:0.194729
iter90 cost:0.194609
iter100 cost:0.194503
iter110 cost:0.194411
iter120 cost:0.194329
iter130 cost:0.194257
iter140 cost:0.194193
iter150 cost:0.194137
iter160 cost:0.194086
iter170 cost:0.19404
iter180 cost:0.193998
iter190 cost:0.193959
iter200 cost:0.193923
iter210 cost:0.19389
iter220 cost:0.193859
iter230 cost:0.193829
iter240 cost:0.1938
iter250 cost:0.193773
iter260 cost:0.193745
iter270 cost:0.193718
iter280 cost:0.193692
iter290 cost:0.193665
iter300 cost:0.193638
iter310 cost:0.193612
iter320 cost:0.193584
iter330 cost:0.193556
iter340 cost:0.193528
iter350 cost:0.193498
iter360 cost:0.193468
iter370 cost:0.193436
iter380 cost:0.193404
iter390 cost:0.193369
iter400 cost:0.193334
iter410 cost:0.193297
iter420 cost:0.193259
iter430 cost:0.193218


In [10]:
restore_vars

{}

In [None]:
np.max(code_)

In [59]:
#plots
plt.title('Sparse Vectors'); plt.xlabel('Dimension 1'); plt.ylabel('Dimension 2');
plt.scatter(code_[:n0][:,0],code_[:n0][:,1],c=[1,0,0], label = 'Person 1')
plt.scatter(code_[n0:][:,0],code_[n0:][:,1],c=[0,0,1], label = 'Person 2')
plt.legend()
plt.plot([-1,1],[-1,1],c=[0,0,0],linewidth=2.0)
plt.show()

In [None]:
np.set_printoptions(threshold=np.inf)
np.sum((code_[n0:,0] < code_[n0:,1])+0.0)

In [None]:
np.sum((code_[:n0,0] > code_[:n0,1])+0.0)

In [None]:
train_x.shape

In [15]:
img1 = train_x[1000].reshape(28,28)
plt.imshow(img1,cmap='gray')
plt.show()

In [52]:
# img2 = (x__[1100]*sigma + mu).reshape(28,28)
img2 = x__[6000].reshape(28,28)
plt.imshow(img2,cmap='gray')
plt.show()

In [None]:
sess.close()