# In [None]:
import numpy as np
import h5py 
import matplotlib.pyplot as plt
from scipy import ndimage
import os
# The dataset files are looked up in the "DATA" sub-folder of the directory
# the script/notebook was started from.
cwd = os.getcwd()
path = os.path.join(cwd, 'DATA')
def load_dataset():
    """
    Load the train and test splits from the two HDF5 files in `path`.

    Reads 'train_catvnoncat.h5' (datasets "train_set_x", "train_set_y") and
    'test_catvnoncat.h5' (datasets "test_set_x", "test_set_y").

    Returns:
    X_train, Y_train, X_test, Y_test - numpy arrays copied out of the files
    classes - list of class names indexed by label: ['non-cat', 'cat']
    """
    # Open each file in a `with` block so the handle is closed as soon as the
    # data has been copied into memory (previously the files were never closed).
    file_name = os.path.join(path, 'train_catvnoncat.h5')
    with h5py.File(file_name, "r") as train_dataset:
        X_train = np.array(train_dataset["train_set_x"][:])  # train set features
        Y_train = np.array(train_dataset["train_set_y"][:])  # train set labels

    file_name = os.path.join(path, 'test_catvnoncat.h5')
    with h5py.File(file_name, "r") as test_dataset:
        X_test = np.array(test_dataset["test_set_x"][:])  # test set features
        Y_test = np.array(test_dataset["test_set_y"][:])  # test set labels

    classes = ['non-cat', 'cat']

    return X_train, Y_train, X_test, Y_test, classes

# Load the data and preview one training example together with its label.
X_train, Y_train, X_test, Y_test, classes = load_dataset()
index = 11
print(classes[Y_train[index]])
plt.imshow(X_train[index])
plt.show()

# Report the raw array shapes and a few derived facts.
for _label, _value in (
    ('X_train.shape= ', X_train.shape),
    ('X_test.shape= ', X_test.shape),
    ('Y_train.shape= ', Y_train.shape),
    ('Y_test.shape= ', Y_test.shape),
    ('Number of training examples: ', X_train.shape[0]),
    ('Number of testing examples: ', X_test.shape[0]),
    ('Height/Width of each image: ', X_train.shape[1]),
    ('Each image is of size: ', X_train.shape[1:]),
):
    print(_label, _value)

# Flatten every example into one column: (features, number of examples).
X_train_flatten = X_train.reshape(len(X_train), -1).T
X_test_flatten = X_test.reshape(len(X_test), -1).T

# Labels become row vectors of shape (1, number of examples).
Y_train = np.array(Y_train).reshape(1, -1)
Y_test = np.array(Y_test).reshape(1, -1)

for _label, _value in (
    ('X_train_flatten shape: ', X_train_flatten.shape),
    ('Y_train shape: ', Y_train.shape),
    ('X_test_flatten shape: ', X_test_flatten.shape),
    ('Y_test shape: ', Y_test.shape),
):
    print(_label, _value)

# Divide by 255 to rescale the features.
X_train_scaled = X_train_flatten / 255.
X_test_scaled = X_test_flatten / 255.
def initialize_parameters(layer_dims):
    """
    Build the initial weights and biases for a fully connected network.

    layer_dims - list with the size of every layer, input layer included,
        e.g. [12288, 7, 1]

    Returns: dictionary {'W': {...}, 'b': {...}} whose inner dicts are keyed by
        layer number 1..len(layer_dims)-1:
        'W'[l] - random normal matrix of shape (layer_dims[l], layer_dims[l-1]),
                 scaled by sqrt(2 / layer_dims[l-1])
        'b'[l] - zero vector of shape (layer_dims[l], 1)
    """
    # Fixed seed: every call produces the same weights.
    np.random.seed(1)

    weights = {}
    biases = {}
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(size_pairs, start=1):
        scale = np.sqrt(2 / fan_in)
        weights[layer] = np.random.randn(fan_out, fan_in) * scale
        biases[layer] = np.zeros((fan_out, 1))

    return {'W': weights, 'b': biases}
# Sanity check for initialize_parameters(): print every layer's W and b.
layer_dims = [2, 3, 5, 1]
params = initialize_parameters(layer_dims)
for l in sorted(params['W']):
    print('W[{0}] =\n{1}\nb[{0}] =\n{2}\n'.format(l, params['W'][l], params['b'][l]))
def forward_propagation_step(A_prev, W, b, activation):
    """
    Forward pass through one layer: Z = W . A_prev + b, then the activation.

    A_prev - activations from previous layer: (size of previous layer, number of examples)
    W - weights matrix: array of shape (size of current layer, size of previous layer)
    b - bias vector, array of shape (size of the current layer, 1)
    activation - text string "sigmoid" or "relu"

    Returns:
    A - post-activation value
    cache - tuple (W, b, A_prev, Z) stored for computing the backward pass

    Raises:
    ValueError - if `activation` is neither "sigmoid" nor "relu"
    """
    # Reject unknown activations up front; previously `A` stayed unbound and
    # the function failed later with an unrelated UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got {!r}'.format(activation))

    Z = np.dot(W, A_prev) + b

    if activation == "sigmoid":
        A = sigmoid(Z)
    else:
        A = relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))

    # b is kept in the cache so the backward pass can check the shape of its gradient.
    cache = (W, b, A_prev, Z)
    return A, cache


def sigmoid(Z):
    """Return the element-wise logistic function 1 / (1 + exp(-Z))."""
    exp_neg = np.exp(-Z)
    return 1 / (1 + exp_neg)

def relu(Z):
    """Return the element-wise maximum of 0 and Z."""
    rectified = np.maximum(0, Z)
    return rectified
def compute_cost(A_last, Y):
    """
    Cross-entropy cost averaged over the examples.

    A_last - vector of predicted probabilities - activations of last layer L, shape (1, number of examples)
    Y - true label e.g. cat vs non-cat, shape (1, number of examples)
    Returns:
    cost - cross-entropy cost (zero-dimensional value)
    """

    assert (A_last.shape == Y.shape)

    # Keep probabilities strictly inside (0, 1): a saturated sigmoid returns
    # exactly 0.0 or 1.0, and log(0) would turn the cost into nan/inf.
    # The margin is chosen for float64 inputs; values inside it are unchanged.
    eps = 1e-15
    A_safe = np.clip(A_last, eps, 1 - eps)

    cost = -1 / Y.shape[1] * np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe))

    cost = np.squeeze(cost)

    assert (cost.shape == ())

    return cost
def init_backward_propagation(Y, A_last):
    """
    Gradient of the per-example cross-entropy loss w.r.t. the last activation.

    Y - true labels, same shape as A_last
    A_last - predicted probabilities of the last layer

    Returns:
    dL_dA_last - -(Y / A_last - (1 - Y) / (1 - A_last)), same shape as A_last
    """
    # No division by the number of examples here: backward_propagation_step
    # already averages dL_dW and dL_db over the examples, so dividing here as
    # well scaled every gradient by 1/m twice.
    # Clip to avoid 0/0 and x/0 when the sigmoid saturates (float64 margin).
    eps = 1e-15
    A_safe = np.clip(A_last, eps, 1 - eps)
    dL_dA_last = - (np.divide(Y, A_safe) - np.divide(1 - Y, 1 - A_safe))
    return dL_dA_last
def backward_propagation_step(dL_dA, cache, activation):
    """
    Backward pass through one layer (activation part, then linear part).

    dL_dA - activation gradient for current layer l
    cache - (W, b, A_prev, Z) stored for current layer l
    activation - string: "sigmoid" or "relu"

    Returns:
    dL_dA_prev - Gradient activation of the previous layer l-1, same shape as A_prev
    dL_dW - Gradient of W current layer l, same shape as W (averaged over examples)
    dL_db - Gradient of b (current layer l), same shape as b (averaged over examples)

    Raises:
    ValueError - if `activation` is neither "sigmoid" nor "relu"
    """
    W, b, A_prev, Z = cache

    # backward activation part:
    if activation == "relu":
        dg_dz = relu_backward(Z)
    elif activation == "sigmoid":
        dg_dz = sigmoid_backward(Z)
    else:
        # Previously `dg_dz` stayed unbound and the failure surfaced as an
        # unrelated UnboundLocalError on the next line.
        raise ValueError(
            'activation must be "sigmoid" or "relu", got {!r}'.format(activation))

    assert (dL_dA.shape == dg_dz.shape)
    dL_dZ = dL_dA * dg_dz

    # backward linear part:
    m = A_prev.shape[1]  # number of examples (columns)
    dL_dW = 1 / m * np.dot(dL_dZ, A_prev.T)
    dL_db = 1 / m * np.sum(dL_dZ, axis=1, keepdims=True)
    dL_dA_prev = np.dot(W.T, dL_dZ)

    assert (dL_dA_prev.shape == A_prev.shape)
    assert (dL_dW.shape == W.shape)
    assert (dL_db.shape == b.shape)

    return dL_dA_prev, dL_dW, dL_db
        

def relu_backward(Z):
    """Return the ReLU derivative at Z: 1 where Z > 0, otherwise 0 (same shape as Z)."""
    positive = Z > 0
    dg_dz = np.where(positive, 1, 0)
    assert dg_dz.shape == Z.shape
    return dg_dz



def sigmoid_backward(Z):
    """Return the sigmoid derivative at Z: sigmoid(Z) * (1 - sigmoid(Z)), same shape as Z."""
    s = sigmoid(Z)
    dg_dz = s * (1 - s)
    assert dg_dz.shape == Z.shape
    return dg_dz
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient descent step: value -= learning_rate * gradient.

    parameters - dictionary with keys 'W' and 'b', each a dict keyed by layer number
    grads - dictionary with keys 'W' and 'b', each a dict keyed by layer number
    learning_rate - step size of the update

    Returns: the same `parameters` dictionary; its arrays are modified in place
    """
    num_layers = len(parameters['W'])
    for layer in range(1, num_layers + 1):
        for name in ('W', 'b'):
            parameters[name][layer] -= learning_rate * grads[name][layer]
    return parameters
def two_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    '''
    Train a two-layer network (LINEAR -> RELU -> LINEAR -> SIGMOID) with
    gradient descent.

    X - input layer of shape (input size, number of examples)
    Y - output layer of shape (1,m)
    layers_dims - list of layers dims including input layer
    learning_rate - step size passed to update_parameters
    num_iterations - number of gradient descent iterations
    print_cost - if True, print the cost every 100 iterations and plot it at the end

    Returns: the trained parameters dictionary
    '''

    np.random.seed(1)
    costs = []      # cost recorded every 100 iterations
    m = X.shape[1]  # number of examples

    parameters = initialize_parameters(layers_dims)

    for i in range(num_iterations):
        # Forward pass through both layers.
        A1, cache1 = forward_propagation_step(
            X, parameters['W'][1], parameters['b'][1], activation='relu')
        A2, cache2 = forward_propagation_step(
            A1, parameters['W'][2], parameters['b'][2], activation='sigmoid')

        cost = compute_cost(A2, Y)

        # Backward pass: output layer first, then the hidden layer.
        dL_dA2 = init_backward_propagation(Y, A2)
        dL_dA1, dW2, db2 = backward_propagation_step(dL_dA2, cache2, activation='sigmoid')
        _, dW1, db1 = backward_propagation_step(dL_dA1, cache1, activation='relu')
        grads = {'W': {1: dW1, 2: dW2}, 'b': {1: db1, 2: db2}}

        parameters = update_parameters(parameters, grads, learning_rate)

        # Every 100th iteration: record the cost and optionally print it.
        if i % 100 == 0:
            if print_cost:
                print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            costs.append(cost)

    # Plot the recorded costs.
    if print_cost:
        plt.plot(np.squeeze(costs))
        plt.ylabel('Втрати')
        plt.xlabel('Ітерації (сотні)')
        plt.title("Швидкість навчання =" + str(learning_rate))
        plt.show()

    return parameters
# Layer sizes: input features, hidden units, output units.
n_x, n_h, n_y = X_train_scaled.shape[0], 7, Y_train.shape[0]
layers_dims = [n_x, n_h, n_y]

# Train the two-layer network on the scaled training set.
parameters = two_layer_model(
    X_train_scaled,
    Y_train,
    layers_dims,
    learning_rate=0.003,
    num_iterations=3000,
    print_cost=True,
)
def evaluate_two_layers(X, Y, parameters):
    """
    Compute the accuracy of the two-layer model on (X, Y).

    X - array to predict
    Y - true labels compared against the thresholded predictions
    parameters - parameters of the trained model

    Returns: fraction of entries where (predicted probability > 0.5) equals Y
    """
    # Forward propagation: hidden ReLU layer, then sigmoid output layer.
    hidden, _ = forward_propagation_step(
        X, parameters['W'][1], parameters['b'][1], activation='relu')
    probabilities, _ = forward_propagation_step(
        hidden, parameters['W'][2], parameters['b'][2], activation='sigmoid')

    hits = (probabilities > 0.5) == Y
    return np.mean(hits)

# Accuracy on the training set, then on the test set.
train_accuracy = evaluate_two_layers(X_train_scaled, Y_train, parameters)
print("Accuracy:: {:.3f}".format(train_accuracy))
test_accuracy = evaluate_two_layers(X_test_scaled, Y_test, parameters)
print("Accuracy:: {:.3f}".format(test_accuracy))

def forward_propagation_whole_process(X, parameters):
    """
    Forward pass for [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    X - data, array of shape (input size, number of examples)
    parameters - dict with keys 'W' and 'b'; each value is a dict with keys 1, 2, ..., L

    Returns:
    A_last - last activation value (y_pred), shape (1, number of examples)
    caches - dict keyed by layer number 1..L; caches[l] is the tuple
        (W, b, A_prev, Z) of layer l, i.e. exactly what
        backward_propagation_whole_process needs

    Raises:
    ValueError - if `parameters` contains no layers
    """
    L = len(parameters['W'])  # number of layers in the neural network
    if L == 0:
        raise ValueError("parameters must contain at least one layer")

    caches = {}
    A = X

    for l in range(1, L + 1):
        W = parameters['W'][l]
        b = parameters['b'][l]
        A_prev = A  # input of this layer; the backward pass needs it for dL_dW
        Z = np.dot(W, A_prev) + b
        if l < L:
            A = np.maximum(0, Z)  # hidden layers: RELU
        else:
            A = 1 / (1 + np.exp(-Z))  # output layer: SIGMOID
        caches[l] = (W, b, A_prev, Z)

    A_last = A
    assert (A_last.shape == (1, X.shape[1]))  # (1, m)

    return A_last, caches


def backward_propagation_whole_process(A_last, Y, caches):
    """
    Backward pass for [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID with the
    cross-entropy cost.

    A_last - probability vector, output (y_pred) of the forward propagation
    Y - true labels (0 if non-cat, 1 if cat)
    caches - dict keyed by layer number 1..L; each entry is (W, b, A_prev, Z)
    Returns: grads - dict with keys 'W' and 'b', each a dict with keys 1..L
        holding the gradients averaged over the examples
    """
    dL_dW = {}
    dL_db = {}

    L = len(caches)  # the number of layers
    Y = Y.reshape(A_last.shape)  # make sure Y is the same shape as A_last (y_pred)
    m = A_last.shape[1]  # number of samples

    # Output layer: for sigmoid + cross-entropy the product
    # dL/dA * sigmoid'(Z) simplifies to A_last - Y, which also avoids dividing
    # by probabilities that may be 0 or 1.
    dL_dZ = A_last - Y
    dL_dA = None

    for l in reversed(range(1, L + 1)):
        W, b, A_prev, Z = caches[l]

        if l < L:
            # Hidden layer: the RELU derivative is 1 where Z > 0, else 0.
            dL_dZ = dL_dA * (Z > 0)

        dL_dW[l] = np.dot(dL_dZ, A_prev.T) / m
        dL_db[l] = np.sum(dL_dZ, axis=1, keepdims=True) / m

        if l > 1:
            # Gradient handed to the previous layer (not needed for the input).
            dL_dA = np.dot(W.T, dL_dZ)

    grads = {"W": dL_dW, "b": dL_db}

    return grads
def model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, verbose = False):
    """
    Train an L-layer network [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID with
    gradient descent.

    X - data, array of shape (input size, number of examples)
    Y - true label vector (0 if non-cat, 1 if cat), of shape (1, number of examples)
    layers_dims - list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate - learning rate of the gradient descent update rule
    num_iterations - number of iterations of the optimization loop
    verbose - if True, it prints the cost every 100 steps and plots the cost curve

    Returns:
    parameters - parameters learnt by the model. They can then be used to predict.
    """
    print ('Training {}-layers neural network with layers dimensions: {}'.format (len(layers_dims)-1, layers_dims))
    np.random.seed(1)
    costs = []  # cost recorded every 100 iterations

    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(num_iterations):

        # Forward propagation: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.
        A_last, caches = forward_propagation_whole_process(X, parameters)

        # Compute cost
        cost = compute_cost(A_last, Y)

        # Backward propagation.
        grads = backward_propagation_whole_process(A_last, Y, caches)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Every 100th iteration: record the cost and optionally print it.
        if i % 100 == 0:
            if verbose:
                print ("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)

    # Plot the cost (same presentation as two_layer_model).
    if verbose:
        plt.plot(np.squeeze(costs))
        plt.ylabel('Втрати')
        plt.xlabel('Ітерації (сотні)')
        plt.title("Швидкість навчання =" + str(learning_rate))
        plt.show()

    return parameters
