In [97]:
from collections import deque
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Parameter Intitializtion

In [98]:
def initialize_parameters(layer_dims, seed=42):

    parameters = {}
    L = len(layer_dims)
    np.random.seed(seed)
    
    for l in range(1, L):
        
        parameters['W' + str(l)] = \
        np.random.randn(layer_dims[l], 
                        layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = \
        np.zeros((layer_dims[l], 1))
        
    
    return parameters

In [99]:
def sigmoid(Z):

    A = (1 + np.exp(-Z))**(-1)
    cache = Z
    
    return A, cache

In [100]:
# TODO: figure out how the activation_cache works

def sigmoid_gradient(dA, cache):
    
    Z = cache
    s = np.power((1 + np.exp(-Z)),-1)
    dZ = dA * s * (1 - s)
        
    return dZ

In [101]:
def relu(Z):

    A = np.maximum(Z, 0)
    cache = Z
    return A, cache

In [102]:
def relu_gradient(dA, cache):

    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    
    return dZ


In [103]:
def tanh(Z):

    e_z = np.exp(Z)
    e_nz = np.exp(-Z)
    A = (e_z - e_nz)/(e_z + e_nz)
    
    return A, Z

In [104]:
def tanh_gradient(Z, cache):
    
    Z = cache
    
    e_z = np.exp(Z)
    e_nz = np.exp(-Z)
    s = (e_z - e_nz)/(e_z + e_nz)    
    A = Z * (1 - np.power(s, 2))

    return A, Z
        

## Forward propagation

Implementing forward propagation

In [105]:
def linear_forward(A, W, b):

    Z = np.dot(W, A) + b
    cache = (A, W, b)
    
    return Z, cache

In [106]:
def linear_activation_forward(A_prev, W, b, activation):

    Z, linear_cache =  linear_forward(A_prev, W, b)
    
    if activation == "relu":
        A, activation_cache = relu(Z)
        
    elif activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
        
    elif activation == "tanh":
        A, activation_cache = tanh(Z)
    
    else:
        raise(TypeError("Activation done not exist: {}".\
                        format(activation)))
        
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return (A, cache)
    

In [107]:
def L_model_forward(X, parameters, activations):

    caches = []
    A = X
    L = len(parameters) // 2 
    
    for l in range(1, L):
        A_prev = A
        activation = activations.popleft()
        
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], 
                                             parameters["b" + str(l)], 
                                             activation = activation)
        caches.append(cache)

    activation = activations.popleft()
    AL, cache = linear_activation_forward(A, parameters["W" + str(l + 1)], 
                                          parameters["b" + str(l + 1)], 
                                          activation = activation)
    caches.append(cache)
    
    return (AL, caches)


In [108]:
def compute_cost(AL, Y):

    
    m = Y.shape[1]
    a = np.multiply(Y, np.log(AL))
    b = np.multiply((1 - Y), np.log(1 - AL))
    cost = - 1 * np.sum(a + b, axis=1) / m
    
    cost = np.squeeze(cost)
    
    return cost


In [131]:
# GRADED FUNCTION: compute_cost_with_regularization

def compute_cost_w_reg(A3, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization. See formula (2) above.
    
    Arguments:
    A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model
    
    Returns:
    cost - value of the regularized loss function (formula (2))
    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    
    cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost
    
    ### START CODE HERE ### (approx. 1 line)
    L2_regularization_cost = lambd * (np.sum(np.square(W1)) + np.sum(np.square(W2))) / (2 * m)
    ### END CODER HERE ###
    
    cost = cross_entropy_cost + L2_regularization_cost
    
    return cost

## Backward propagation

Implementing backward propagation

In [186]:
def linear_backward(dZ, cache, lambd):

    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = (1/m) * np.dot(dZ, A_prev.T) + (lambd * W) / m
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    
    #dW1 = 1. / m * np.dot(dZ1, X.T) + (lambd * W1) / m

    return (dA_prev, dW, db)


In [176]:
def linear_activation_backward(dA, cache, activation, lambd):
    
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_gradient(dA, activation_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_gradient(dA, activation_cache)
        
    elif activation == "tanh":
        dZ = tanh_gradient(dA, activation_cache)
    
    dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    
    return (dA_prev, dW, db)

In [163]:
def L_model_backward(AL, Y, caches, activations, lambd):

    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    # Initialize the backpropagation
    
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
    # Lth layer gradients
    
    activation = activations.pop()
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)],\
                            grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation, lambd)
    
    # Loop from l=L-2 to l=0
    
    for l in reversed(range(L-1)):
        
        activation = activations.pop()
        current_cache = caches[l]
        
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)],\
                                                                    current_cache, activation, lambd)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        
    return grads
    

In [150]:
# GRADED FUNCTION: backward_propagation_with_regularization

def backward_w_reg(X, Y, cache, lambd):
    """
    Implements the backward propagation of our baseline model to which we added an L2 regularization.
    
    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation()
    lambd -- regularization hyperparameter, scalar
    
    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2) = cache
    
    #dZ3 = A3 - Y  
    #dW3 = 1. / m * np.dot(dZ3, A2.T) + (lambd * W3) / m
    #db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    #dA2 = np.dot(W3.T, dZ3)
    
    #dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dZ2 = A2 - Y
    dW2 = 1. / m * np.dot(dZ2, A1.T) + (lambd * W2) / m
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)
    
    dA1 = np.dot(W2.T, dZ2)
    
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T) + (lambd * W1) / m
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)
    
    gradients = {"dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1, 
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    
    return gradients

## Update parameters

Update parameters of the model using gradient descent.

In [151]:
def update_parameters(parameters, grads, learning_rate):

    L = len(parameters) // 2 
    
    for l in range(L):
        
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - \
                                        learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - \
                                        learning_rate * grads["db" + str(l + 1)]
        
    return parameters
    

In [177]:
def train_model(X, Y, config):
        
    activations    = config["activations"]
    layers_dims    = config["layers_dims"]
    learning_rate  = config["learning_rate"]
    num_iterations = config["num_iterations"]
    print_cost     = config["print_cost"]
    random_seed    = config["random_seed"]
    lambd          = config["lambd"]
    
    np.random.seed(random_seed)
    
    costs          = [] 
    
    parameters = initialize_parameters(layers_dims)
    
    for i in range(0, num_iterations):
        
        fact       = deque(activations.copy())
        AL, caches = L_model_forward(X, parameters, fact)          
        #cost      = compute_cost(AL, Y)
        cost       = compute_cost_w_reg(AL, Y, parameters, lambd)
        bact       = deque(activations.copy())
        grads      = L_model_backward(AL, Y, caches, bact, lambd)
        #grads     = backward_w_reg(AL, Y, caches, bact, lambd)
        parameters = update_parameters(parameters, grads, learning_rate)
        
        if print_cost and i % 50 == 0:  print ("Cost after iteration %i: %f" %(i, cost))
        if print_cost and i % 100 == 0: costs.append((i, cost))
            
    if print_cost: costs.append((i, cost))
             
    return parameters, costs
        

In [178]:
def predict(X, y, parameters, activations):
    
    m = X.shape[1]
    n = len(parameters) // 2
    p = np.zeros((1,m))
    
    # Forward propagation
    
    facts          = deque(activations.copy())
    probas, caches = L_model_forward(X, parameters, facts)
    
    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0
            
    print("Accuracy: "  + str(np.sum((p == y)/m)))
    return p
    

In [179]:
def report_model(meta_config:dict):
    """ Generate a model report based on meta_config file """
    
    config_template = meta_config.get("config_template", None)
    learning_range = meta_config.get("learning_range", None)
    learning_step = meta_config.get("learning_step", None)
    X_train = meta_config.get("X_train", None)
    y_train = meta_config.get("y_train", None)
    X_test = meta_config.get("X_test", None)
    y_test = meta_config.get("y_test", None)
    X_validate = meta_config.get("X_validate", None)
    y_validate = meta_config.get("y_validate", None) 
    
    learning_min, learning_max = learning_range
    
    learning_rates = np.arange(learning_min, learning_max, learning_step)
    reports = []
    report = {}
    
    i = 0
    for learning_rate in learning_rates:
        
        config = config_template.copy()
        config["learning_rate"] = learning_rate
        
        # Train the model
        
        parameters, costs = train_model(X_train, y_train, config)
        
        # Predict on training set
        train_score = predict(X_train, y_train, parameters, 
                              config.get("activations"))
        
        # Predict on test set
        test_score = predict(X_test, y_test, parameters, 
                             config.get("activations"))
        
        # Predict on validation set
        validation_score = predict(X_validate, y_validate, 
                                   parameters, config.get("activations"))
        
        model_report = report.copy()
        model_report["learning_rate"] = learning_rate
        model_report["costs"] = costs
        model_report["train_parameters"] = parameters
        model_report["train_score"] = train_score
        model_report["test_score"] = test_score
        model_report["validation_score"] = validation_score
        
        reports.append(model_report)
        
        if i % 100 == 0:
            print("Computing the {}th model".format(i))
            
        i += 1
            
    return reports
        
    

# Running the Model

In [180]:
#X_train, y_train

In [181]:
def load_data(filepath, y_label_value):
    
    
    x          = np.load(filepath).transpose()
    y          = np.full(x.shape[0], y_label_value)
    y          = y.reshape(y.shape[0],1)
    input_data = np.append(x, y, axis=1) 
    
    
    return input_data

In [182]:
def load():
    
    
    input_data    = pd.read_csv('data/HW5.csv')
    train, test   = train_test_split(input_data, test_size = 0.2)
    
    x_train       = np.array(train.loc[:, ['a', 'b']])
    y_train       = np.array(train.loc[:, ['label']])
    
    x_test        = np.array(test.loc[:, ['a', 'b']])
    y_test        = np.array(test.loc[:, ['label']])
    
    
    
    return x_train, y_train, x_test, y_test

In [183]:
x_train, y_train, x_test, y_test = load()

In [218]:
# Need to figure out config for the data you have available
random_state = 42
config = {
    "activations": ["relu", "sigmoid"],
    "layers_dims" : [2, 10, 1], #  2-layer model
    "learning_rate": 0.0075,
    "num_iterations" : 3000,
    "print_cost": True,
    "random_seed": random_state,
    "lambd": 500000
}

In [219]:
parameters, costs = train_model(x_train.T, y_train.T, config) 

Cost after iteration 0: 0.846367
Cost after iteration 50: 0.693116
Cost after iteration 100: 0.693091
Cost after iteration 150: 0.693069
Cost after iteration 200: 0.693052
Cost after iteration 250: 0.693037
Cost after iteration 300: 0.693025
Cost after iteration 350: 0.693015
Cost after iteration 400: 0.693007
Cost after iteration 450: 0.693000
Cost after iteration 500: 0.692994
Cost after iteration 550: 0.692990
Cost after iteration 600: 0.692986
Cost after iteration 650: 0.692982
Cost after iteration 700: 0.692980
Cost after iteration 750: 0.692977
Cost after iteration 800: 0.692976
Cost after iteration 850: 0.692974
Cost after iteration 900: 0.692973
Cost after iteration 950: 0.692972
Cost after iteration 1000: 0.692971
Cost after iteration 1050: 0.692970
Cost after iteration 1100: 0.692970
Cost after iteration 1150: 0.692969
Cost after iteration 1200: 0.692969
Cost after iteration 1250: 0.692968
Cost after iteration 1300: 0.692968
Cost after iteration 1350: 0.692968
Cost after iter

In [220]:
y_test_pred = predict(x_test.T, y_test.T, parameters, config.get("activations"))
y_test_pred = y_test_pred.T 
y_test_pred

Accuracy: 0.46199999999999997


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [221]:
y_test

array([[1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
    

In [222]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.462

In [125]:
meta_config = {
    "config_template" : config,
    "learning_range": (0.0005, 0.01),
    "learning_step": 0.0005,
    "X_train": x_train.T,
    "y_train": y_train.T,
    "X_test": x_test.T,
    "y_test": y_test.T,
    "X_validate": x_test.T,
    "y_validate": y_test.T,
}

In [94]:
reports_partb = report_model(meta_config)

Cost after iteration 0: 0.693152
Cost after iteration 50: 0.693148
Cost after iteration 100: 0.693145
Cost after iteration 150: 0.693142
Cost after iteration 200: 0.693139
Cost after iteration 250: 0.693136
Cost after iteration 300: 0.693133
Cost after iteration 350: 0.693130
Cost after iteration 400: 0.693127
Cost after iteration 450: 0.693124
Cost after iteration 500: 0.693122
Cost after iteration 550: 0.693119
Cost after iteration 600: 0.693116
Cost after iteration 650: 0.693114
Cost after iteration 700: 0.693111
Cost after iteration 750: 0.693108
Cost after iteration 800: 0.693106
Cost after iteration 850: 0.693103
Cost after iteration 900: 0.693101
Cost after iteration 950: 0.693098
Cost after iteration 1000: 0.693096
Cost after iteration 1050: 0.693093
Cost after iteration 1100: 0.693091
Cost after iteration 1150: 0.693088
Cost after iteration 1200: 0.693086
Cost after iteration 1250: 0.693084
Cost after iteration 1300: 0.693081
Cost after iteration 1350: 0.693079
Cost after iter

Cost after iteration 2500: 0.610541
Cost after iteration 2550: 0.601333
Cost after iteration 2600: 0.592435
Cost after iteration 2650: 0.583997
Cost after iteration 2700: 0.576117
Cost after iteration 2750: 0.568825
Cost after iteration 2800: 0.562108
Cost after iteration 2850: 0.555933
Cost after iteration 2900: 0.550223
Cost after iteration 2950: 0.544908
Cost after iteration 0: 0.693152
Cost after iteration 50: 0.693136
Cost after iteration 100: 0.693122
Cost after iteration 150: 0.693108
Cost after iteration 200: 0.693096
Cost after iteration 250: 0.693084
Cost after iteration 300: 0.693072
Cost after iteration 350: 0.693060
Cost after iteration 400: 0.693048
Cost after iteration 450: 0.693035
Cost after iteration 500: 0.693022
Cost after iteration 550: 0.693006
Cost after iteration 600: 0.692988
Cost after iteration 650: 0.692967
Cost after iteration 700: 0.692941
Cost after iteration 750: 0.692909
Cost after iteration 800: 0.692869
Cost after iteration 850: 0.692817
Cost after it

Cost after iteration 2100: 0.423815
Cost after iteration 2150: 0.417289
Cost after iteration 2200: 0.411604
Cost after iteration 2250: 0.406706
Cost after iteration 2300: 0.402456
Cost after iteration 2350: 0.398722
Cost after iteration 2400: 0.395413
Cost after iteration 2450: 0.392464
Cost after iteration 2500: 0.389827
Cost after iteration 2550: 0.387446
Cost after iteration 2600: 0.385295
Cost after iteration 2650: 0.383359
Cost after iteration 2700: 0.381594
Cost after iteration 2750: 0.379983
Cost after iteration 2800: 0.378525
Cost after iteration 2850: 0.377182
Cost after iteration 2900: 0.375940
Cost after iteration 2950: 0.374790
Cost after iteration 0: 0.693152
Cost after iteration 50: 0.693124
Cost after iteration 100: 0.693101
Cost after iteration 150: 0.693079
Cost after iteration 200: 0.693058
Cost after iteration 250: 0.693035
Cost after iteration 300: 0.693009
Cost after iteration 350: 0.692976
Cost after iteration 400: 0.692930
Cost after iteration 450: 0.692860
Cost 

Cost after iteration 1900: 0.377215
Cost after iteration 1950: 0.375383
Cost after iteration 2000: 0.373748
Cost after iteration 2050: 0.372298
Cost after iteration 2100: 0.370998
Cost after iteration 2150: 0.369830
Cost after iteration 2200: 0.368766
Cost after iteration 2250: 0.367791
Cost after iteration 2300: 0.366900
Cost after iteration 2350: 0.366080
Cost after iteration 2400: 0.365315
Cost after iteration 2450: 0.364604
Cost after iteration 2500: 0.363949
Cost after iteration 2550: 0.363337
Cost after iteration 2600: 0.362769
Cost after iteration 2650: 0.362245
Cost after iteration 2700: 0.361757
Cost after iteration 2750: 0.361297
Cost after iteration 2800: 0.360867
Cost after iteration 2850: 0.360463
Cost after iteration 2900: 0.360084
Cost after iteration 2950: 0.359724
Cost after iteration 0: 0.693152
Cost after iteration 50: 0.693113
Cost after iteration 100: 0.693081
Cost after iteration 150: 0.693051
Cost after iteration 200: 0.693016
Cost after iteration 250: 0.692968
C

Cost after iteration 1600: 0.370226
Cost after iteration 1650: 0.368783
Cost after iteration 1700: 0.367500
Cost after iteration 1750: 0.366359
Cost after iteration 1800: 0.365327
Cost after iteration 1850: 0.364390
Cost after iteration 1900: 0.363548
Cost after iteration 1950: 0.362778
Cost after iteration 2000: 0.362087
Cost after iteration 2050: 0.361455
Cost after iteration 2100: 0.360873
Cost after iteration 2150: 0.360341
Cost after iteration 2200: 0.359847
Cost after iteration 2250: 0.359392
Cost after iteration 2300: 0.358975
Cost after iteration 2350: 0.358582
Cost after iteration 2400: 0.358212
Cost after iteration 2450: 0.357867
Cost after iteration 2500: 0.357543
Cost after iteration 2550: 0.357240
Cost after iteration 2600: 0.356955
Cost after iteration 2650: 0.356683
Cost after iteration 2700: 0.356426
Cost after iteration 2750: 0.356183
Cost after iteration 2800: 0.355956
Cost after iteration 2850: 0.355744
Cost after iteration 2900: 0.355541
Cost after iteration 2950: 0

In [126]:
reports_partb

[{'costs': [(0, array(0.69315169)),
   (100, array(0.6931452)),
   (200, array(0.69313897)),
   (300, array(0.69313297)),
   (400, array(0.69312719)),
   (500, array(0.69312159)),
   (600, array(0.69311616)),
   (700, array(0.69311088)),
   (800, array(0.69310573)),
   (900, array(0.6931007)),
   (1000, array(0.69309576)),
   (1100, array(0.6930909)),
   (1200, array(0.69308611)),
   (1300, array(0.69308137)),
   (1400, array(0.69307666)),
   (1500, array(0.69307196)),
   (1600, array(0.69306726)),
   (1700, array(0.69306255)),
   (1800, array(0.69305779)),
   (1900, array(0.69305298)),
   (2000, array(0.69304808)),
   (2100, array(0.69304308)),
   (2200, array(0.69303794)),
   (2300, array(0.69303265)),
   (2400, array(0.69302716)),
   (2500, array(0.69302146)),
   (2600, array(0.69301549)),
   (2700, array(0.69300922)),
   (2800, array(0.69300259)),
   (2900, array(0.69299555)),
   (2999, array(0.69298812))],
  'learning_rate': 0.0005,
  'test_score': array([[0., 0., 0., 0., 0., 0., 

In [96]:
x_train.T

array([[ -4.50329446,  -3.45161423, -17.31231426, ..., -14.13973607,
        -18.95504258,   9.05813255],
       [ 17.87968042,  17.52549448,  16.4308011 , ..., -16.75511252,
         14.33449316,   3.16985326]])