## A: The value of S1 would be 2 since there are 2 features, and S3 would be 1 since the output is binary.

In [49]:
from collections import deque
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [14]:
def get_cost_w_reg(A3, Y, parameters, lambd):
    """Return the binary cross-entropy cost plus an L2 weight penalty.

    Args:
        A3: Output-layer activations (predicted probabilities), laid out
            like ``Y``.
        Y: Ground-truth labels; the sample count is read from axis 1.
        parameters: Dict of network parameters. Every entry whose key starts
            with "W" is treated as a weight matrix and penalized.
        lambd: L2 regularization strength.

    Returns:
        The cross-entropy summed over axis 1 and divided by the sample
        count (squeezed), plus ``lambd / (2 * m)`` times the sum of squared
        weights.
    """
    m = Y.shape[1]

    # Keep probabilities strictly inside (0, 1): a saturated output of
    # exactly 0 or 1 would otherwise produce 0 * log(0) = NaN.
    eps = 1e-15
    probs = np.clip(A3, eps, 1 - eps)

    log_likelihood = np.multiply(Y, np.log(probs)) \
        + np.multiply(1 - Y, np.log(1 - probs))
    cross_entropy_cost = np.squeeze(-np.sum(log_likelihood, axis=1) / m)

    # Penalize every weight matrix, not only W1 and W2, so the cost stays
    # consistent with gradients that add lambd * W / m for each layer.
    squared_weights = sum(
        np.sum(np.square(weights))
        for key, weights in parameters.items()
        if key.startswith("W")
    )
    L2_regularization_cost = lambd * squared_weights / (2 * m)

    return cross_entropy_cost + L2_regularization_cost

In [13]:
def init(layer_dims):
    """Create the initial weights and biases for every layer.

    Args:
        layer_dims: Sequence of layer sizes, input layer first.

    Returns:
        Dict with, for each layer ``k`` (1-based), ``"Wk"`` drawn from a
        standard normal scaled by 0.01 with shape
        ``(layer_dims[k], layer_dims[k-1])`` and ``"bk"`` as a zero column
        of shape ``(layer_dims[k], 1)``.
    """
    parameters = {}

    # Pair each layer's fan-in with its fan-out; layers are numbered from 1.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(size_pairs, start=1):
        parameters["W%d" % layer] = 0.01 * np.random.randn(fan_out, fan_in)
        parameters["b%d" % layer] = np.zeros((fan_out, 1))

    return parameters

def forward_prop_activation(A_prev, W, b, activation):
    """Run one layer forward: affine transform followed by an activation.

    Args:
        A_prev: Activations feeding this layer; multiplied on the left by W.
        W: Weight matrix of the layer.
        b: Bias added to ``np.dot(W, A_prev)``.
        activation: One of "relu", "sigmoid" or "tanh".

    Returns:
        Tuple ``(A, cache)`` where ``A`` is the activated output and
        ``cache`` is ``((A_prev, W, b), Z)`` for use in the backward pass.

    Raises:
        ValueError: If ``activation`` is not a supported name.
    """
    # The layer input is A_prev; the previous code read the not-yet-assigned
    # local A here, which raised UnboundLocalError on every call.
    Z = np.dot(W, A_prev) + b
    linear_cache = (A_prev, W, b)

    if activation == "relu":
        A = np.maximum(Z, 0)
    elif activation == "sigmoid":
        A = (1 + np.exp(-Z)) ** (-1)
    elif activation == "tanh":
        # np.tanh stays finite for large |Z|, where exp(Z) / exp(Z) would
        # overflow to inf / inf = NaN.
        A = np.tanh(Z)
    else:
        raise ValueError("Unsupported activation: %r" % (activation,))

    # Z is the activation cache for all three activations.
    cache = (linear_cache, Z)

    return A, cache


def forward_prop(X, parameters, activations):
    """Run the full forward pass through every layer.

    Args:
        X: Network input, passed to the first layer as its activations.
        parameters: Dict holding "W1", "b1", ..., "WL", "bL"; the layer
            count is taken as ``len(parameters) // 2``.
        activations: Deque of activation names, one per layer in layer
            order. It is consumed from the left, so pass a copy if the
            caller needs it afterwards.

    Returns:
        Tuple ``(AL, caches)``: the output of the last layer and the list
        of per-layer caches returned by ``forward_prop_activation``.

    Raises:
        ValueError: If ``parameters`` describes no layers.
    """
    L = len(parameters) // 2
    if L < 1:
        raise ValueError("parameters must describe at least one layer")

    caches = []
    A = X

    # A single loop over 1..L replaces the separate output-layer step that
    # relied on the leaked loop variable (undefined when L == 1).
    for l in range(1, L + 1):
        activation = activations.popleft()
        A, cache = forward_prop_activation(
            A, parameters["W" + str(l)], parameters["b" + str(l)], activation)
        caches.append(cache)

    return A, caches

In [19]:
def backprop(dZ, cache, lambd):
    """Backpropagate through the affine part of one layer.

    Args:
        dZ: Gradient of the cost with respect to this layer's
            pre-activation.
        cache: Tuple ``(A_prev, W, b)`` stored during the forward pass.
        lambd: L2 regularization strength.

    Returns:
        Tuple ``(dA_prev, dW, db)``. ``dW`` includes the L2 term
        ``lambd * W / m``, where ``m`` is ``A_prev.shape[1]``.
    """
    layer_input, weights, _ = cache
    m = layer_input.shape[1]
    scale = 1 / m

    grad_input = np.dot(weights.T, dZ)
    grad_bias = scale * np.sum(dZ, axis=1, keepdims=True)
    # Data term plus the derivative of the L2 penalty.
    grad_weights = scale * np.dot(dZ, layer_input.T) + (lambd * weights) / m

    return (grad_input, grad_weights, grad_bias)


def backprop_activation(dA, cache, activation, lambd):
    """Backpropagate through one layer's activation and affine transform.

    Args:
        dA: Gradient of the cost with respect to this layer's output.
        cache: Tuple ``(linear_cache, Z)`` stored during the forward pass.
        activation: One of "relu", "sigmoid" or "tanh".
        lambd: L2 regularization strength, forwarded to ``backprop``.

    Returns:
        Tuple ``(dA_prev, dW, db)`` as produced by ``backprop``.

    Raises:
        ValueError: If ``activation`` is not a supported name.
    """
    linear_cache, Z = cache

    if activation == "relu":
        # Gradient passes through only where the unit was active.
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
    elif activation == "sigmoid":
        s = np.power((1 + np.exp(-Z)), -1)
        dZ = dA * s * (1 - s)
    elif activation == "tanh":
        # d/dZ tanh(Z) = 1 - tanh(Z)^2. The previous code stored this in an
        # unused name and multiplied by Z instead of dA, leaving dZ unset.
        s = np.tanh(Z)
        dZ = dA * (1 - np.power(s, 2))
    else:
        raise ValueError("Unsupported activation: %r" % (activation,))

    dA_prev, dW, db = backprop(dZ, linear_cache, lambd)

    return (dA_prev, dW, db)

def get_backprop_gradients(AL, Y, caches, activations, lambd):
    """Compute gradients for every layer by walking the caches backwards.

    Args:
        AL: Output of the last layer (predicted probabilities).
        Y: Ground-truth labels; reshaped to ``AL.shape``.
        caches: Per-layer caches from the forward pass, first layer first.
        activations: Activation names, one per layer in layer order. They
            are consumed from the right with ``pop()``, so pass a copy if
            the caller needs them afterwards.
        lambd: L2 regularization strength.

    Returns:
        Dict with ``"dA{l-1}"``, ``"dW{l}"`` and ``"db{l}"`` for each layer
        ``l`` from ``len(caches)`` down to 1.
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Keep probabilities strictly inside (0, 1): a saturated output of
    # exactly 0 or 1 would divide by zero and spread NaN into every gradient.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    # Derivative of the cross-entropy cost with respect to AL.
    dA = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # The output layer is handled like any other: each step turns the
    # gradient at a layer's output into the gradient at its input.
    for l in reversed(range(L)):
        activation = activations.pop()
        dA, dW, db = backprop_activation(dA, caches[l], activation, lambd)
        grads["dA" + str(l)] = dA
        grads["dW" + str(l + 1)] = dW
        grads["db" + str(l + 1)] = db

    return grads
    
    
def backward_w_reg(X, Y, cache, lambd):
    """Backward pass for a two-layer ReLU -> sigmoid network with L2 penalty.

    Args:
        X: Network input; the sample count is read from axis 1.
        Y: Ground-truth labels, laid out like the output ``A2``.
        cache: Tuple ``(Z1, A1, W1, b1, Z2, A2, W2, b2)`` from the forward
            pass.
        lambd: L2 regularization strength.

    Returns:
        Dict with keys "dA2", "dZ2", "dW2", "db2", "dA1", "dZ1", "dW1" and
        "db1".
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2) = cache

    # dA2 was returned but never computed (NameError). It is the derivative
    # of the cross-entropy cost with respect to the output, with the output
    # clipped away from 0 and 1 to avoid dividing by zero.
    eps = 1e-15
    A2_safe = np.clip(A2, eps, 1 - eps)
    dA2 = -(np.divide(Y, A2_safe) - np.divide(1 - Y, 1 - A2_safe))

    # Sigmoid output combined with cross-entropy simplifies to A2 - Y.
    dZ2 = A2 - Y
    dW2 = 1. / m * np.dot(dZ2, A1.T) + (lambd * W2) / m
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)

    # ReLU derivative: pass the gradient only where the hidden unit fired.
    dZ1 = np.multiply(dA1, A1 > 0)
    dW1 = 1. / m * np.dot(dZ1, X.T) + (lambd * W1) / m
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

In [47]:
def update(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Args:
        parameters: Dict with "W1", "b1", ..., "WL", "bL"; the layer count
            is taken as ``len(parameters) // 2``. Entries are rebound in
            place to the updated arrays.
        grads: Dict with the matching "dW1", "db1", ..., "dWL", "dbL".
        learning_rate: Step size multiplying each gradient.

    Returns:
        The same ``parameters`` dict, after the update.
    """
    layer_count = len(parameters) // 2

    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            step = learning_rate * grads["d" + key]
            parameters[key] = parameters[key] - step

    return parameters


def train_model(X, Y, config):
    """Train the network with batch gradient descent and return its weights.

    Args:
        X: Training inputs, fed to ``forward_prop``.
        Y: Training labels, fed to the cost and gradient functions.
        config: Dict read for the keys "layers_dims", "num_iterations",
            "activations", "lambd" and "learning_rate". No other keys are
            consulted by this function.

    Returns:
        The parameter dict after the final update.
    """
    parameters = init(config["layers_dims"])

    for i in range(config["num_iterations"]):

        # The forward pass consumes activation names from the left and the
        # backward pass from the right, so each gets its own fresh deque.
        forward_acts  = deque(config["activations"].copy())
        backward_acts = deque(config["activations"].copy())

        AL, caches = forward_prop(X, parameters, forward_acts)
        cost       = get_cost_w_reg(AL, Y, parameters, config["lambd"])
        grads      = get_backprop_gradients(AL, Y, caches, backward_acts,
                                            config["lambd"])
        parameters = update(parameters, grads, config["learning_rate"])

        # Report the cost (computed before this step's update) every
        # 1000 iterations.
        if not i % 1000:
            print ("Cost after iteration %i: %f" %(i, cost))

    return parameters


def predict(X, y, parameters, activations):
    """Predict binary labels by thresholding the network output at 0.5.

    Args:
        X: Inputs, fed to ``forward_prop``.
        y: Unused; kept so existing callers keep working.
        parameters: Trained parameter dict, fed to ``forward_prop``.
        activations: Activation names, one per layer. A copy is consumed,
            so the caller's sequence is left untouched.

    Returns:
        Float array of shape ``(1, m)`` holding 1.0 where the first output
        row exceeds 0.5 and 0.0 elsewhere.
    """
    facts     = deque(activations.copy())
    probas, _ = forward_prop(X, parameters, facts)

    # Vectorized threshold on the first output row, replacing the
    # per-sample Python loop.
    return (probas[:1] > 0.5).astype(np.float64)


def load():
    """Read the homework CSV and split it into train and test arrays.

    Reads 'data/HW5.csv', holds out 20% of the rows as the test set (no
    seed is passed to the split), and extracts columns 'a' and 'b' as
    features and 'label' as the target.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of NumPy arrays with
        one row per sample.
    """
    feature_columns = ['a', 'b']
    label_columns   = ['label']

    frame                 = pd.read_csv('data/HW5.csv')
    train_part, test_part = train_test_split(frame, test_size = 0.2)

    def to_arrays(part):
        # Features and labels of one split, as separate arrays.
        return (np.array(part.loc[:, feature_columns]),
                np.array(part.loc[:, label_columns]))

    x_train, y_train = to_arrays(train_part)
    x_test, y_test   = to_arrays(test_part)

    return x_train, y_train, x_test, y_test

In [27]:
# Read the CSV and unpack the train/test feature and label arrays; the later
# cells transpose them before passing them to the network.
x_train, y_train, x_test, y_test = load()

## B. The accuracy for S = 2 is about 73.6%

In [61]:
# Experiment with S = 2 hidden units: 2 inputs -> 2 hidden (ReLU) -> 1 output
# (sigmoid). A stray character before "lambd" made the dict a syntax error.
# NOTE(review): `random_state` is not defined in the visible cells; confirm it
# is set earlier in the notebook.
config = {
    "activations": ["relu", "sigmoid"],
    "layers_dims" : [2, 2, 1],
    "learning_rate": 0.0075,
    "num_iterations" : 3000,
    "print_cost": True,
    "random_seed": random_state,
    "lambd": 0.5
}

# Arrays are transposed because the network code reads the sample count from
# axis 1; predictions are transposed back to compare with y_test.
parameters        = train_model(x_train.T, y_train.T, config)
y_test_pred       = predict(x_test.T, y_test.T, parameters, config.get("activations")).T
accuracy_score(y_test, y_test_pred)

Cost after iteration 0: 0.693154
Cost after iteration 50: 0.693137
Cost after iteration 100: 0.693122
Cost after iteration 150: 0.693104
Cost after iteration 200: 0.693073
Cost after iteration 250: 0.693006
Cost after iteration 300: 0.692856
Cost after iteration 350: 0.692502
Cost after iteration 400: 0.691650
Cost after iteration 450: 0.689604
Cost after iteration 500: 0.684798
Cost after iteration 550: 0.674145
Cost after iteration 600: 0.653417
Cost after iteration 650: 0.621882
Cost after iteration 700: 0.587810
Cost after iteration 750: 0.560301
Cost after iteration 800: 0.540220
Cost after iteration 850: 0.524147
Cost after iteration 900: 0.508574
Cost after iteration 950: 0.491013
Cost after iteration 1000: 0.471099
Cost after iteration 1050: 0.451062
Cost after iteration 1100: 0.433623
Cost after iteration 1150: 0.419801
Cost after iteration 1200: 0.409139
Cost after iteration 1250: 0.400994
Cost after iteration 1300: 0.394644
Cost after iteration 1350: 0.389544
Cost after iter

0.736

In [62]:
parameters

{'W1': array([[1.07303775, 0.14619285],
        [0.56309786, 0.48114712]]),
 'W2': array([[ 1.08843826, -0.74067773]]),
 'b1': array([[ 0.10942382],
        [-0.01702384]]),
 'b2': array([[-0.09888404]])}

## C. The accuracy for S = 10 is 100%

In [53]:
# Experiment with S = 10 hidden units; every other setting matches the S = 2 run.
config = {
    # One activation name per layer, in layer order.
    "activations": ["relu", "sigmoid"],
    "layers_dims" : [2, 10, 1], # input, hidden and output layer sizes
    "learning_rate": 0.0075,
    "num_iterations" : 3000,
    "print_cost": True,
    "random_seed": random_state,
    # L2 regularization strength.
    "lambd": 0.5
}

# Arrays are transposed before training and prediction; the predictions are
# transposed back so they line up with y_test for scoring.
parameters  = train_model(x_train.T, y_train.T, config)
y_test_pred = predict(x_test.T, y_test.T, parameters, config.get("activations")).T 
accuracy_score(y_test, y_test_pred)

Cost after iteration 0: 0.692926
Cost after iteration 50: 0.689212
Cost after iteration 100: 0.672043
Cost after iteration 150: 0.615933
Cost after iteration 200: 0.554442
Cost after iteration 250: 0.522741
Cost after iteration 300: 0.500189
Cost after iteration 350: 0.475016
Cost after iteration 400: 0.445164
Cost after iteration 450: 0.415436
Cost after iteration 500: 0.389899
Cost after iteration 550: 0.366299
Cost after iteration 600: 0.339104
Cost after iteration 650: 0.306336
Cost after iteration 700: 0.273213
Cost after iteration 750: 0.245517
Cost after iteration 800: 0.223940
Cost after iteration 850: 0.207078
Cost after iteration 900: 0.193555
Cost after iteration 950: 0.182426
Cost after iteration 1000: 0.173136
Cost after iteration 1050: 0.165184
Cost after iteration 1100: 0.158205
Cost after iteration 1150: 0.151980
Cost after iteration 1200: 0.146350
Cost after iteration 1250: 0.141219
Cost after iteration 1300: 0.136530
Cost after iteration 1350: 0.132191
Cost after iter

1.0

In [57]:
parameters

{'W1': array([[ 1.09113153,  0.17470247],
        [ 0.34042289,  0.30449958],
        [-0.02208754, -0.04524017],
        [ 0.54197801,  0.48489009],
        [ 0.11011674,  0.09852332],
        [-0.04629777, -0.09483148],
        [-0.22306181, -0.45705816],
        [-0.02155755, -0.04402689],
        [-0.02240468, -0.04588485],
        [-0.17769971, -0.36404718]]),
 'W2': array([[ 1.11054459, -0.45658311, -0.05027683, -0.72740834, -0.1477399 ,
         -0.10540846, -0.50873899, -0.04572437, -0.050336  , -0.40506409]]),
 'b1': array([[ 0.10991337],
        [-0.01121258],
        [ 0.00180702],
        [-0.01938075],
        [-0.00404729],
        [ 0.00380767],
        [ 0.01985845],
        [ 0.00077604],
        [ 0.0018217 ],
        [ 0.01519968]]),
 'b2': array([[1.37552988]])}