In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io
import math
import sklearn
import sklearn.datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [0]:
# the dataset given for any phone:-accel or gyro / watch:-accel or gyro couldnt be fitted in a single csv file (for all 20 data in excel stacked above each other) , 
# so I loaded first 14 data set in 1 file and the rest in the 2nd file
def load_data():
    dataset = pd.read_csv('/content/drive/My Drive/Train1.csv', header=None)
    train_set_x_orig1 = dataset.iloc[:,[3,4,5]].values # your train set features
    train_set_y_orig1 = dataset.iloc[:,[1]].values

    dataset = pd.read_csv('/content/drive/My Drive/Train2.csv', header=None)
    train_set_x_orig2 = dataset.iloc[:,[3,4,5]].values # your train set features
    train_set_y_orig2 = dataset.iloc[:,[1]].values

    train_set_x_orig = np.vstack((train_set_x_orig1,train_set_x_orig2)) # stacking the two csv file one over the other in numpy array
    train_set_y_orig = np.vstack((train_set_y_orig1,train_set_y_orig2))

    train_x = train_set_x_orig.T
    train_y = train_set_y_orig
    onehotencoder=OneHotEncoder()
    train_y= onehotencoder.fit_transform(train_y[:,[0]]).toarray()
    train_y = train_y.T

    dataset = pd.read_csv('Test1.csv', header=None)
    test_set_x_orig = dataset.iloc[:,[3,4,5]].values # taking the useful train set features
    test_set_y_orig = dataset.iloc[:,[1]].values

    test_x = test_set_x_orig.T
    test_y = test_set_y_orig
    onehotencoder=OneHotEncoder()
    test_y= onehotencoder.fit_transform(test_y[:,[0]]).toarray()
    test_y = test_y.T

    return train_x,train_y,test_x,test_y

In [0]:
# dictionary for reversing onehotencoding at in the predict function at the end
dict = {0 :'A' ,
        1 :'B' ,
        2 :'C' ,
        3 :'D' ,
        4 :'E' ,
        5 :'F' ,
        6 :'G' ,
        7 :'H' ,
        8 :'I' ,
        9 :'J' ,
        10 :'K' ,
        11:'L' ,
        12 :'M' ,
        13:'O' ,
        14:'P' ,
        15:'Q' ,
        16:'R' ,
        17:'S' }

In [0]:
def softmax(Z):

    #Z -- numpy array of any shape
    
    
    #A -- output of sigmoid(z), same shape as Z
    #cache -- returns Z as well, useful during backpropagation
    
    x_exp = np.exp(Z)
    x_sum = np.sum(x_exp,axis=0,keepdims=True)
    A = x_exp/x_sum
    cache = Z
    
    return A, cache

In [0]:
def relu(Z):
  
    #Z -- Output of the linear layer, of any shape

    #A -- Post-activation parameter, of the same shape as Z
    #cache -- a python dictionary containing "A" ; stored for computing the backward pass efficiently
    
    A = np.maximum(0,Z)
    
    assert(A.shape == Z.shape)
    
    cache = Z 
    return A, cache

In [0]:
def relu_backward(dA, cache):

    #dA -- post-activation gradient, of any shape
    #cache -- 'Z' where we store for computing backward propagation efficiently

    
    #dZ -- Gradient of the cost with respect to Z
   
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, then dz to 0 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ


In [0]:
def softmax_backward(dA, cache):

    #dA -- post-activation gradient, of any shape
    #cache -- 'Z' where we store for computing backward propagation efficiently

    
    #dZ -- Gradient of the cost with respect to Z
    
    
    Z = cache
    
    dZ = dA * 1
    
    assert (dZ.shape == Z.shape)
    
    return dZ

In [0]:
def initialize_parameters_deep(layer_dims):

    #layer_dims -- python array (list) containing the dimensions of each layer in our network
    

    #parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
    #Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
    #bl -- bias vector of shape (layer_dims[l], 1)
    
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)            # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])*0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))

        
    return parameters

In [0]:
def linear_forward(A, W, b):

    #A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    #W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    #b -- bias vector, numpy array of shape (size of the current layer, 1)

    #Z -- the input of the activation function
    cache -- a python dictionary containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    
    Z = W.dot(A) + b
    
    assert(Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    
    return Z, cache

In [0]:
def linear_activation_forward(A_prev, W, b, activation):

    #A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    #W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    #b -- bias vector, numpy array of shape (size of the current layer, 1)
    #activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    #A -- the output of the activation function, also called the post-activation value 
    #cache -- a dictionary containing "linear_cache" and "activation_cache";
             
    
    if activation == "softmax":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = softmax(Z)
    
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

In [0]:
def L_model_forward(X, parameters):

    #X -- data, numpy array of shape (input size, number of examples)
    #parameters -- output of initialize_parameters_deep()
    
    #AL -- last post-activation value
    #caches -- list of caches 

    caches = []
    A = X
    L = len(parameters) // 2                  # number of layers in the neural network
    
    # Implementing [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    for l in range(1, L):
        A_prev = A 
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation = "relu")
        caches.append(cache)
    
    # Implementing LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation = "softmax")
    caches.append(cache)
    
    assert(AL.shape == (18,X.shape[1]))
            
    return AL, caches

In [0]:
def compute_cost(AL, Y):

    #AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    #Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)
    #cost -- cross-entropy cost

    
    m = Y.shape[1]

    # Loss from aL and y.
    cost = (1./m) * np.sum(np.sum(-np.multiply(Y,np.log(AL)),axis=0))
    
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect 
    assert(cost.shape == ())
    
    return cost

In [0]:
def linear_backward(dZ, cache):

    #dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    #cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer


    #dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    #dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    #db -- Gradient of the cost with respect to b (current layer l), same shape as b

    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1./m * np.dot(dZ,A_prev.T)
    db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T,dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db

In [0]:
def linear_activation_backward(dA, cache, activation):

    #dA -- post-activation gradient for current layer l 
    #cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    #activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    #dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    #dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    #db -- Gradient of the cost with respect to b (current layer l), same shape as b
    
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

In [0]:
def L_model_backward(AL, Y, caches):

    #AL -- probability vector, output of the forward propagation (L_model_forward())
    #Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    #caches -- list of cache
    #grads -- A dictionary with the gradients

 
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Initializing the backpropagation
    dAL = AL-Y
    
    
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "softmax")
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, activation = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

In [0]:
def update_parameters(parameters, grads, learning_rate):

    #parameters -- python dictionary containing your parameters 
    #grads -- python dictionary containing your gradients, output of L_model_backward
    
    #parameters -- python dictionary containing teh updated parameters 
                
    
    L = len(parameters) // 2 # number of layers in the neural network

    # Updated rule for each parameter. Using a for loop.
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
        
    return parameters

In [0]:
train_x, train_y, test_x, test_y= load_data()

In [0]:
##reshape and standardise here
# Reshape the training and test examples 

# Standardize data to have feature values between 0 and 1.
sc = StandardScaler()

train_x =train_x.T
test_x = test_x.T

train_x[:, :] = sc.fit_transform(train_x[:, :])
test_x[:, :] = sc.transform(test_x[:, :])

train_x =train_x.T
test_x = test_x.T



train_x's shape: (3, 1580245)
test_x's shape: (3, 111026)


In [0]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
   
    #Createing a list of random minibatches from (X, Y)
    
 
    #X -- input data, of shape (input size, number of examples)
    #Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    #mini_batch_size -- size of the mini-batches, integer
    #mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y)
  
    
    np.random.seed(seed)           
    m = X.shape[1]                 
    mini_batches = []
        
    # Shuffling (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((18,m))

    # Partition Minus the end case.
    num_complete_minibatches = math.floor(m/mini_batch_size) # number of mini batches of size mini_batch_size 
    for k in range(0, num_complete_minibatches):
      
        mini_batch_X = shuffled_X[:,k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:,k * mini_batch_size:(k + 1) * mini_batch_size]
 
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:

        mini_batch_X = shuffled_X[:,num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:,num_complete_minibatches * mini_batch_size:]

        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    return mini_batches

In [0]:
def initialize_adam(parameters) :

    #parameters -- python dictionary containing parameters.
    #v -- python dictionary that will contain the exponentially weighted average of the gradient.
 
    #s -- python dictionary that will contain the exponentially weighted average of the squared gradient.


   
    
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    s = {}
    
    # Initializing v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):

        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
        s["dW" + str(l+1)] = np.zeros_like(parameters["W" + str(l + 1)])
        s["db" + str(l+1)] = np.zeros_like(parameters["b" + str(l + 1)])

    
    return v, s

In [0]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
   
    #Updating parameters using Adam
    
 
    #parameters -- python dictionary 
    #grads -- python dictionary containing gradients 
    #v -- Adam variable
    #s -- Adam variable
    #learning_rate -- the learning rate, scalar.
    #beta1 -- Exponential decay hyperparameter for the first moment estimates 
    #beta2 -- Exponential decay hyperparameter for the second moment estimates 
    #epsilon -- hyperparameter preventing division by zero in Adam updates


    
    L = len(parameters) // 2                 # number of layers in the neural networks
    v_corrected = {}                         
    s_corrected = {}                         
    
    # Perform Adam update on all parameters
    for l in range(L):
  
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads['dW' + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads['db' + str(l + 1)]


        # Computing bias-corrected first moment estimate
       
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - np.power(beta1, t))
        

        # Moving average of the squared gradients.
     
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * np.power(grads['dW' + str(l + 1)], 2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * np.power(grads['db' + str(l + 1)], 2)
        

        # Compute bias-corrected second raw moment estimate. 
     
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - np.power(beta2, t))
       

        # Updating parameters

        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v_corrected["dW" + str(l + 1)] / np.sqrt(s_corrected["dW" + str(l + 1)] + epsilon)
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v_corrected["db" + str(l + 1)] / np.sqrt(s_corrected["db" + str(l + 1)] + epsilon)


    return parameters, v, s

In [0]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, mini_batch_size=1024, beta = 0.9,beta1= 0.9,beta2=0.999,epsilon=1e-8, num_epochs = 3000,print_cost = False):
   

    #layers_dims -- list containing the input size and each layer size, of length 
    #learning_rate -- learning rate of the gradient descent update rule
    #num_iterations -- number of iterations of the optimization loop
    

    #parameters -- parameters learnt by the model.
    

    seed = 10
    costs = []                         
    t = 0
    
 
    parameters = initialize_parameters_deep(layers_dims)

    v,s = initialize_adam(parameters)
    
    # Loop of gradient descent
    for i in range(num_epochs):

        seed=seed+1
        minibatches = random_mini_batches(X,Y, mini_batch_size,seed)
        for minibatch in minibatches:

          (minibatch_X,minibatch_Y) = minibatch


          AL, caches = L_model_forward(minibatch_X, parameters)

          cost = compute_cost(AL, minibatch_Y)

          grads = L_model_backward(AL, minibatch_Y, caches)

          parameters,v,s = update_parameters_with_adam(parameters, grads,v,s,t,learning_rate,beta1,beta2,epsilon)
    
    return parameters

In [0]:
#model train
layers_dims = [3, 30, 25, 25, 18] #  We can set the number of nodes in each layer and the total number of layers
parameters = L_layer_model(train_x, train_y, layers_dims, num_epochs = 200, print_cost = True)



Cost after epoch 0: nan
Cost after epoch 100: nan


In [0]:
#accuracy prediction and result
def predict(X, y, parameters):

    #p -- predictions for the given dataset re - converted to the original way after doing onehotencoding using the help of dictionary made at the beginning
 
    
    m = X.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)

    
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
      new_list = list(probas[:,i])
      maxm = new_list.index(max(new_list))
      probas[maxm,i]=1
    probas = (probas>=1).astype(int)
    
    prob = probas.T
    y = y.T
    incorrect = np.sum(y!=prob)/2
    accu = (m-incorrect)/m

    print("Accuracy: "  + str(accu))

    #reversing onehotencoding with help of dictionary
    nrow = y.shape[0]
    n= np.argmax(y, axis = 0)
    n_list = list(n)
    n_final = []
    for i in range(nrow):
     indi = n_list[i]
     n_dic = dict[indi]
     n_final.append(n_dic)
    n_final = np.array(n_final)
    new = np.reshape(n_final,(nrow,1))

    return None

In [0]:
pred_train = predict(train_x, train_y, parameters) #accuracy prediction

Accuracy: 0.058655461653098095




In [0]:
pred_test = predict(test_x, test_y, parameters)  #accuracu prediction

Accuracy: 0.032217678741916306


