In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
              
import torch

import torchvision
from torch.utils import data
from torchvision import transforms
from torch import nn
from d2l import torch as d2l

from DataPreprocessing import DataPreprocessing
from FootballDataset import FootballDataset

## Football predictor

### Problem: Predicting football results based on team performance

In this notebook, we build, train, validate, and test a Neural Network with PyTorch to predict the __target_label__ field (win, draw, lose) of the upcoming matches

1. <a href="#1">Read the dataset</a>
    * <a href='#11'> Select features to build the model </a>
2. <a href="#2">Data Processing</a>
    * <a href="#21">Data Preprocessing (cleaning)</a>
    * <a href="#22">Train - Validation - Test Datasets</a>
    * <a href="#23">Data processing with Pipeline and ColumnTransformer</a>
3. <a href="#3">Neural Network Training and Validation</a>
4. <a href="#4">Test the Neural Network</a>
5. <a href="#5">Improvement ideas</a>


__Rosters schema:__ 
- __id:__ (INT) Identifier of roster, which is unique for a player and match
- __goals:__ (INT) Number of goals scored by player in that match
- __shots:__ (INT) Number of shots to the goal for that player
- __own_goals:__ (INT) Number of own goals scored by player in that match
- __xG:__ (FLOAT) Expected goals for that player
- __time:__ (INT) Field time in minutes of player
- __player_id:__ (INT) Unique identifier for that player.
- __team_id:__(INT) Unique identifier of that team
- __position:__(INT) Position played by that player
- __player:__(STR) Name of the player
- __h_a:__(STR, ['h','a']) Home or away
- __yellow_card:__(INT,[0,1,2]) Number of yellow cards
- __red_card:__(INT,[0,1]) Number of red cards
- __roster_in:__ (INT) ID of roster that substituted this player
- __roster_out:__ (INT) ID of roster that left to give place to this player
- __key_passes:__ (INT) number of key passes
- __assists:__ (INT) number of assists to goals
- __xA:__ (FLOAT) expected assists to goals
- __xGChain:__ (FLOAT) expected goals chain
- __xGBuildup:__ (FLOAT) expected goals buildup
- __positionOrder:__ (INT) order in lineup position
- __date:__ (DATE) date of match
- __homeScore:__ (INT) Score of home team
- __awayScore:__ (INT) Score of away team
- __matchId:__ (INT) Unique identifier of the match

__Teams schema:__
- __matchId:__ (INT) Unique identifier of the match
- __teamId:__ (INT) Unique identifier of the team
- __h_a:__ (STR, ['h','a']) Home or away
- __xG:__ (FLOAT) Expected goals for the team
- __xGA:__ (FLOAT) Expected goals against
- __npxG:__ (FLOAT) Expected goals for the team (excluding penalties and own goals)
- __npxGA:__ (FLOAT) Expected goals against (excluding penalties and own goals)
- __deep:__ (FLOAT) Passes completed within an estimated 20 yards of goal (crosses excluded)
- __deep_allowed:__ (FLOAT) Allowed deep passes for the opposite team
- __scored:__ (INT) Goals scored
- __missed:__ (INT) Goals scored against
- __xpts:__ (FLOAT) Expected points
- __result:__ (STR, ['l','w','d']) Match result, win, draw, or loss
- __wins:__ (BOOLEAN) True if team wins
- __draws:__ (BOOLEAN) True if team draws
- __loses:__ (BOOLEAN) True if team loses
- __pts:__ (INT) Points gained for that team
- __npxGD:__ (FLOAT) Difference between expected goals for and against, excluding penalties and own goals.
- __ppda.att:__ (FLOAT) Passes per defensive action in the attack part of the field (PPDA metric is calculated by dividing the number of passes allowed by the defending team by the total number of defensive actions.)
- __ppda.def:__ (FLOAT) Passes per defensive action in the defensive part of the field.
- __ppda_allowed.att:__ (FLOAT) Opponent passes per defensive action in the attack part of the field.
- __ppda_allowed.def:__ (FLOAT) Opponent passes per defensive action in the defensive part of the field.

__Additional fields:__
- __home_points:__ (INT) Points in season before match for home team
- __away_points:__ (INT) Points in season before match for away team
- __scored_goals_season_h:__ (INT) Goals scored in season for home team
- __missed_goals_season_h:__ (INT) Goals missed in season for home team
- __scored_goals_season_a:__ (INT) Goals scored in season for away team
- __missed_goals_season_a:__ (INT) Goals missed in season for away team
- __league:__ (STR) League of the match
- __season:__ (INT) Season of the match
- __(TO BE INSERTED)n_points_h:__ (INT) Points earned in the last N encounters for home team
- __(TO BE INSERTED)n_points_a:__ (INT) Points earned in the last N encounters for away team
- __(TO BE INSERTED)top_assists_h:__ (FLOAT) Highest individual season assists.
- __(TO BE INSERTED)top_score_a:__ (FLOAT) Highest individual season score in squad
- __avg_ppda.att_h:__(FLOAT) Season average of ppda.att for host team
- __avg_ppda.def_h:__(FLOAT) Season average of ppda.def for host team
- __avg_ppda.att_a:__(FLOAT) Season average of ppda.att for visiting team
- __avg_ppda.def_a:__(FLOAT) Season average of ppda.def for visiting team
- __avg_ppda_allowed.att_h:__(FLOAT) Season average of ppda_allowed.att for host team
- __avg_ppda_allowed.def_h:__(FLOAT) Season average of ppda_allowed.def for host team
- __avg_ppda_allowed.att_a:__(FLOAT) Season average of ppda_allowed.att for visiting team
- __avg_ppda_allowed.def_a:__(FLOAT) Season average of ppda_allowed.def for visiting team 
- __avg_deep_h:__(FLOAT) Season average of deep passes for host team
- __avg_deep_a:__(FLOAT) Season average of deep passes for visitor team
- __avg_deep_allowed_h:__ (FLOAT) Season average of allowed deep passes for host team
- __avg_deep_allowed_a:__ (FLOAT) Season average of allowed deep passes for visitor team
... to be continued

##  <a name = '#1'>0 -  Read the data set </a>

In [None]:
# Step 0. Run preprocessing
# (disabled) Uncomment to instantiate DataPreprocessing before loading;
# presumably this regenerates the CSV read below -- TODO confirm.
#dataStream = DataPreprocessing()
# Load the preprocessed dataset into the DataFrame used by the rest of the notebook.
df = pd.read_csv('Data/Preprocessed/dataset.csv')

In [None]:
# Keep only the rows whose 'season' column equals 2021 (all columns retained).
df_2021 = df.loc[df['season'] == 2021,:]
def missing_pct(df):
    """Return the percentage of missing values per column, largest first.

    The result is a two-column DataFrame: 'Column' holds the column label and
    'Missing_Percentage (%)' holds the share of null entries rounded to two
    decimals. Rows are sorted in descending order of that share.
    """
    # Null count per column expressed as a percentage of the row count
    null_share = df.isnull().sum() * 100 / df.shape[0]
    return (
        pd.DataFrame(null_share)
        .round(2)
        # Move the column labels out of the index and give both columns names
        .reset_index()
        .rename(columns={'index': 'Column', 0: 'Missing_Percentage (%)'})
        .sort_values('Missing_Percentage (%)', ascending=False)
    )

# Display the per-column missing-value percentages for the 2021 subset.
missing_pct(df_2021)

In [None]:
# Step 1. If you need to update the dataset, uncomment below
# dataProcess = DataPreprocessing()

# Step 2. Choose params from schema below
# The string below is not executed as code; it only lists candidate column names for reference.
"""schema = ["h_a","xG","xGA","npxG","npxGA","deep","deep_allowed","scored","missed","xpts","result",
          "date","wins","draws","loses","pts","npxGD","ppda.att","ppda.def","ppda_allowed.att",
          "ppda_allowed.def","H team","team_id","h_a_boolean","league","datetime","season","matchId",
          "home_points","scored_goals_season_h","missed_goals_season_h","avg_ppda.att_h","avg_ppda.def_h",
          "avg_ppda_allowed.att_h","avg_ppda_allowed.def_a","avg_deep_h","avg_deep_allowed_h"]"""

# Numerical feature columns passed to load_data_football
# (the "_adv" suffix presumably denotes the opposing team -- TODO confirm).
num_params = ["season_points","scored_goals_season","missed_goals_season",
             "season_points_adv","scored_goals_season_adv","missed_goals_season_adv"]
# "avg_ppda.att","avg_ppda_allowed.def","avg_deep","avg_deep_allowed", "avg_ppda.att","avg_ppda_allowed.def","avg_deep","avg_deep_allowed"

# Categorical feature columns passed to load_data_football.
cat_params = ["h_a"]

# Target column(s) passed to load_data_football.
# NOTE(review): the network below has 3 outputs trained with CrossEntropyLoss; if "pts"
# takes the values 0/1/3 it must be remapped to class indices 0..2 somewhere
# (possibly inside FootballDataset) -- confirm.
target = ["pts"]#["wins","draws","loses"]

# Data loading
def load_data_football(batch_size, num_par, cat_par, target):  #@save
    """Build train and test DataLoaders over the football dataset.

    Args:
        batch_size: Number of samples per batch for both loaders.
        num_par: Numerical feature names forwarded to ``FootballDataset``.
        cat_par: Categorical feature names forwarded to ``FootballDataset``.
        target: Target column name(s) forwarded to ``FootballDataset``.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader reshuffles
        its samples on every pass; the test loader keeps a fixed order.
    """
    # Note the argument order: categorical names first, then numerical ones.
    # Optional filters used in earlier experiments: league = ["La liga"], season = [2015,2016]
    dataset = FootballDataset(cat_par, num_par, target)

    # 80/20 random split; the remainder goes to the test set so that the two
    # sizes always add up to len(dataset).
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainset, testset = data.random_split(dataset, [train_size, test_size])

    # Dataloaders
    return (data.DataLoader(trainset, batch_size=batch_size, shuffle=True),
            data.DataLoader(testset, batch_size=batch_size, shuffle=False))


In [None]:
# Build the train/test loaders with batches of 100 samples from the selected columns.
train_iter, test_iter = load_data_football(100, num_params, cat_params, target)

In [None]:
# Inspect the first training batch only: print shapes/dtypes of inputs and targets,
# then the input tensor itself. X and y stay bound after the loop.
for X, y in train_iter:
    print(X.shape, X.dtype, y.shape, y.dtype)
    print(X)
    break  # stop after the first batch

In [None]:
# PyTorch does not implicitly reshape the inputs. Thus we define the flatten
# layer to reshape the inputs before the linear layer in our network.
# The linear layer maps 8 input features to 3 output scores
# (presumably one per match outcome -- TODO confirm against the dataset encoding).
net = nn.Sequential(nn.Flatten(), nn.Linear(8, 3))

def init_weights(m):
    """Draw the weights of a linear layer from a normal distribution (std 0.01).

    Meant to be passed to ``Module.apply``; modules that are not linear layers
    are left untouched. Biases are not modified.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)

# The trailing semicolon suppresses the notebook's echo of the returned module.
net.apply(init_weights);

In [None]:
# Cross-entropy loss applied to the network's raw output scores.
loss = nn.CrossEntropyLoss()

In [None]:
def train(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model and plot loss/accuracy per epoch.

    Args:
        net: Model to train.
        train_iter: Iterable of training batches.
        test_iter: Iterable of evaluation batches.
        loss: Loss function handed to the per-epoch training helper.
        num_epochs: Number of passes over ``train_iter``.
        updater: Optimizer (or update function) handed to the training helper.

    Raises:
        AssertionError: If the final training loss is not below 0.5 or either
            final accuracy is not in (0.7, 1].
    """
    # The helpers live in the d2l package, which this notebook imports only
    # under the alias `d2l`; the bare names are not defined here.
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                            legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = d2l.train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = d2l.evaluate_accuracy(net, test_iter)
        # One point per epoch: (train loss, train acc, test acc)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    # Sanity thresholds on the metrics of the last epoch
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc

In [None]:
# Sanity check: run the current X through the network and show the output shape.
net(X).shape

In [None]:
# Plain SGD over all network parameters with learning rate 0.2.
trainer = torch.optim.SGD(net.parameters(), lr=0.2)
num_epochs = 30
# Runs the training loop shipped with d2l (not the local `train` function).
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

# Model 1 -  Simple Neural Network - Teams Schema

## 1 <a name="#11">Select features to build the model</a>
(<a href="#1">Go to Data Set </a>)

This time we build a model using a selected subset of the team features. That is, we build a model including __numerical__ and __categorical__ features (no text features are used).

In [None]:
# Set 1: the team's own metrics as inputs, goals scored as target
numerical_features = ['deep','ppda.att','ppda.def','npxG']#['deep','deep_allowed','ppda.att','ppda_allowed.att','ppda_allowed.def']
categorical_features = ['h_a']
model_features = numerical_features + categorical_features
model_target = ['scored']#['npxG','npxGA']
X = df.loc[:,model_features]
X = np.array(X.values).T # Each column is a data
Y = df.loc[:,model_target].T
Y = np.atleast_2d(np.array(Y.values))

# Set 2: the "allowed" metrics as inputs, goals conceded as target,
# with the home/away flag swapped
dic_switch = {'h':'a','a':'h'}
numerical_features2 = ['deep_allowed','ppda_allowed.att','ppda_allowed.def','npxGA']
categorical_features2 = ['h_a']
model_features2 = numerical_features2 + categorical_features2
model_target2 = ['missed']#['npxG','npxGA']
# Work on a real copy: a plain assignment would only alias `df`, and the
# home/away swap below would then silently flip 'h_a' in the shared DataFrame
# used by every later cell.
df_copy = df.copy()
df_copy['h_a'] = df_copy['h_a'].apply(lambda x: dic_switch[x])
X2 = df_copy.loc[:,model_features2]
X2 = np.array(X2.values).T # Each column is a data
Y2 = df_copy.loc[:,model_target2].T
Y2 = np.atleast_2d(np.array(Y2.values))

In [None]:
# Show the shapes of both target arrays.
print(Y.shape)
print(Y2.shape)

In [None]:
# Display X2 (only this last expression of the cell is echoed; X is not shown)
X2

## 2. Data-preprocessing

### 2.1 <a name = '#21'> Cleaning numerical features </a>

Let's examine the numerical features.

In [None]:
# Print a 10-bin histogram (in bin order, not by frequency) for every numerical feature
for feature_name in numerical_features:
    binned_counts = df[feature_name].value_counts(bins=10, sort=False)
    print(binned_counts)

__Outliers__. We have no outlier data in the numerical features considered.

__Missing Numerical Values__. Let's check missing values for these numerical features.

In [None]:
# Number of missing values in each numerical feature column.
print(df[numerical_features].isna().sum())

There are no missing values.

## 2.2  <a name = '#22'>Train - Validation - Test Datasets</a>
(<a href="#2">Go to Data Processing</a>)

We split our dataset into training (80%), validation (10%), and test (10%) subsets using sklearn's [train_test_split()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function twice.

In [None]:
# train_test_split is not imported anywhere else in the notebook, so bring it
# into scope here.
from sklearn.model_selection import train_test_split

# Train / hold-out split: 80% of the rows for training, 20% held out
train_data, test_data = train_test_split(df, test_size=0.2, shuffle=True, random_state=23)

# Validation-Test split: the held-out rows are halved (10% of the data each)
val_data, test_data = train_test_split(test_data, test_size=0.5, shuffle=True, random_state=23)

# Print the shapes of the Train - Validation - Test Datasets
print('Train - Validation - Test Datasets shapes: ', train_data.shape, val_data.shape, test_data.shape)

## 2.3 <a name = '#23' >Data processing with Pipeline and ColumnTransformer</a>


We can use the composite Pipeline of Day 2 to train and tune a model in sklearn, using its multi-output wrapper __MultiOutputRegressor__ and its neural network implementation __MLPRegressor__. However, sklearn is not a neural network framework: it lacks large-scale optimization techniques with GPU support and other neural-network-related utility functions. 
 
In a second instance, instead, we build four neural networks with __PyTorch__, one per output. While for classic, non-neural algorithms, PyTorch is not particularly useful, using an actual deep learning framework for neural network experimentation provides more flexibility and customization.

Choice of model and hosting platform aside, we can still reuse the collective ColumnTransformer from Day 2 to preprocess the data for neural network training, validation and test, ensuring that the transformations learned on the train data are performed accordingly on the training, validation and test datasets.

In [None]:
### COLUMN_TRANSFORMER ###
##########################

# These sklearn classes are not imported anywhere else in the notebook, so
# bring them into scope here.
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Preprocess the numerical features: fill gaps with the column mean, then
# rescale every feature to [0, 1]
numerical_processor = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', MinMaxScaler())
                                ])

# Preprocess the categorical features: replace gaps with the literal string
# 'missing', then one-hot encode
categorical_processor = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore')) # handle_unknown tells it to ignore (rather than throw an error for) any value that was not present in the initial training set.
                                ])
# Combine all data preprocessors from above (add more, if you choose to define more!)
# For each processor/step specify: a name, the actual process, and finally the features to be processed
data_processor = ColumnTransformer([
    ('numerical_processing', numerical_processor, numerical_features),
    ('categorical_processing', categorical_processor, categorical_features)
                                    ])

# Visualize the data processing pipeline
set_config(display='diagram')
data_processor




In [None]:
### DATA PROCESSING ###
#######################

# Separate every subset into its input columns and its target values
X_train, y_train = train_data[model_features], train_data[model_target].values
X_val, y_val = val_data[model_features], val_data[model_target].values
X_test, y_test = test_data[model_features], test_data[model_target].values

print('Datasets shapes before processing: ', X_train.shape, X_val.shape, X_test.shape)

# Learn the transformations on the training inputs only, then apply the
# already-fitted processor to the validation and test inputs
X_train = data_processor.fit_transform(X_train)#.toarray()
X_val, X_test = data_processor.transform(X_val), data_processor.transform(X_test)

print('Datasets shapes after processing: ', X_train.shape, X_val.shape, X_test.shape)

## 3.1 Multi Output Regressor


In [None]:
# These estimators are not imported anywhere else in the notebook, so bring
# them into scope here.
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

# Fit one linear regression per target column on the processed training data
multi = MultiOutputRegressor(LinearRegression()).fit(X_train, y_train)

# Spot check: prediction vs. ground truth for the training row at position 2
print(f'Example: Multi: Predict = {multi.predict(X_train[[2]])[0][0]:.2f}, Actual = {y_train[2][0]:.2f}')

## 3.2 MLP Regressor

In [None]:
from sklearn.neural_network import MLPRegressor

# Two hidden layers (100 and 50 units) with L2 penalty alpha=0.01; the seed
# makes the fit reproducible
MLP = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    alpha=0.01,
    max_iter=50000,
    random_state=1,
)
# The regressor expects a 1-D target, hence the ravel()
MLP.fit(X_train, y_train.ravel())

train_r2 = MLP.score(X_train, y_train)
print(f'The Rˆ2 for the training set is = {train_r2:.3f}')
test_r2 = MLP.score(X_test, y_test)
print(f'The Rˆ2 for the test set is = {test_r2:.3f}')

In [None]:
# Set 1
# Use the processor exactly as it was fitted on the training data. Re-fitting
# it here would rescale the inputs differently from what MLP was trained on.
X = df[model_features]
X = data_processor.transform(X)
Y = df[model_target].values

# Set 2: present the "allowed" metrics under the names of the team's own
# metrics (and swap home/away) so the same model can score the opposing side
columns_ = ['deep_allowed','ppda_allowed.att','ppda_allowed.def','npxGA','h_a']
dic_switch = {'h':'a','a':'h'}
cols_rename = {'deep_allowed': 'deep','ppda_allowed.att':'ppda.att',
              'ppda_allowed.def':'ppda.def','npxGA':'npxG','h_a':'h_a','missed':'scored'}
# Take an explicit copy so the edits below never touch (or warn about) `df`
df_copy = df[list(cols_rename)].copy()
df_copy['h_a'] = df_copy['h_a'].apply(lambda x: dic_switch[x])
df_copy = df_copy.rename(columns=cols_rename)

X2 = df_copy[model_features]
X2 = data_processor.transform(X2)  # same fitted scaling/encoding as Set 1
Y2 = df_copy[model_target].values


In [None]:
# Collect MLP predictions and ground truth for both feature sets side by side.
# NOTE(review): the labels say xG/xGA, but Y/Y2 are built from `model_target`
# (set to 'scored' in the cells above) -- confirm the intended naming.
dic = {'Predicted xG': MLP.predict(X),
       'Actual xG': Y.flatten(),
       'Predicted xGA': MLP.predict(X2),
       'Actual xGA': Y2.flatten()}
df_output = pd.DataFrame(dic)

def result(G1,G2):
    """Classify a match from two goal values.

    Returns 'w' when G1 exceeds G2 by more than 0.5, 'l' when G1 trails G2 by
    more than 0.5, and 'd' for anything in between (boundaries included).
    """
    margin = G1 - G2
    if margin > 0.5:
        return 'w'
    return 'l' if margin < -0.5 else 'd'


# Turn each pair of goal values into a w/d/l label, for predictions and ground truth alike
df_output['Predicted xResult'] = [
    result(goals_for, goals_against)
    for goals_for, goals_against in zip(df_output['Predicted xG'], df_output['Predicted xGA'])
]
df_output['Actual xResult'] = [
    result(goals_for, goals_against)
    for goals_for, goals_against in zip(df_output['Actual xG'], df_output['Actual xGA'])
]
# Share of rows (in percent) where the predicted label equals the actual one
label_matches = df_output['Predicted xResult'] == df_output['Actual xResult']
perc = label_matches.sum()/len(df_output)*100
print(f'The result accuracy is {perc:.2f} %')

In [None]:
# Display the prediction/ground-truth comparison table.
df_output

In [None]:
# Number of rows in the comparison table.
len(df_output)

In [None]:
# Score of the fitted MLP on the processed training data.
MLP.score(X_train, y_train)


## ML Model with PyTorch
Here we implement the training with pytorch

## 2 - ML model

In [None]:
## ML parameters
# Layer sizes for the NumPy two-layer network defined below, which treats X as
# (features, examples) and Y as (outputs, examples).
# NOTE(review): the first "Set 1" cell builds X/Y in that transposed layout, but the
# later cell that rebuilds X through data_processor leaves the examples on axis 0 --
# confirm which X/Y are live here, otherwise n_x/n_y become the number of examples.
n_x = X.shape[0]
n_y = Y.shape[0]
n_h = 7  # hidden-layer width

In [None]:
def sigmoid(Z):
    """Apply the logistic function element-wise.

    Returns the activation together with a cache ``(activation, Z)`` that the
    backward pass can reuse.
    """
    activation = 1 / (1 + np.exp(-Z))
    return activation, (activation, Z)

def relu(Z):
    """Apply the rectified linear unit element-wise.

    Returns the activation together with a cache ``(activation, Z)`` that the
    backward pass can reuse.
    """
    activation = np.maximum(0, Z)
    return activation, (activation, Z)

def linear_forward(A, W, b):
    """
    Compute the affine part of a layer: Z = W . A + b.

    Arguments:
    A -- activations of the previous layer (or the input data), shape (size of previous layer, number of examples)
    W -- weight matrix, shape (size of current layer, size of previous layer)
    b -- bias vector, shape (size of current layer, 1)

    Returns:
    Z -- pre-activation values, shape (size of current layer, number of examples)
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    pre_activation = np.dot(W, A) + b

    # One row per unit of this layer, one column per example
    expected_shape = (W.shape[0], A.shape[1])
    assert(pre_activation.shape == expected_shape)

    return pre_activation, (A, W, b)

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Reject unknown names up front; otherwise neither branch would run and the
    # function would fail later with an unrelated unbound-variable error.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            f'activation must be "sigmoid" or "relu", got {activation!r}'
        )

    # The linear step is identical for both activations
    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

In [None]:
def sigmoid_backward(dA, cache):
    """Back-propagate through a sigmoid: dZ = dA * A * (1 - A).

    ``cache`` is the (activation, Z) pair; only the activation is used.
    """
    activation = cache[0]
    scaled = dA * activation
    return scaled * (1 - activation)

def relu_backward(dA, cache):
    """Back-propagate through a ReLU: the gradient passes only where the activation is positive.

    ``cache`` is the (activation, Z) pair; only the activation is used.
    """
    activation = cache[0]
    # 1 where the unit was active, 0 elsewhere
    active_mask = np.int64(activation > 0)
    return np.multiply(dA, active_mask)

def linear_backward(dZ, cache):
    """Back-propagate through the affine part of a layer.

    ``cache`` is the (A_prev, W, b) tuple. Returns (dA_prev, dW, db), where dW
    and db are averaged over the examples (columns of A_prev) and every
    gradient has the shape of the quantity it belongs to.
    """
    A_prev, W, b = cache
    inv_m = 1 / A_prev.shape[1]  # one over the number of examples

    dW = inv_m * np.dot(dZ, A_prev.T)
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    # Each gradient must match the shape of its counterpart
    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db
    
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Without this branch an unknown name would surface as an unrelated
        # unbound-variable error at the return statement.
        raise ValueError(
            f'activation must be "sigmoid" or "relu", got {activation!r}'
        )

    # The linear step is identical for both activations
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

In [None]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Create the starting parameters of a two-layer network.

    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary with
                    W1 -- small random weight matrix of shape (n_h, n_x)
                    b1 -- zero bias vector of shape (n_h, 1)
                    W2 -- small random weight matrix of shape (n_y, n_h)
                    b2 -- zero bias vector of shape (n_y, 1)
    """
    weight_scale = 0.01
    # Listed in layer order so the random draws happen for W1 first, then W2
    shapes = {"W1": (n_h, n_x),
              "b1": (n_h, 1),
              "W2": (n_y, n_h),
              "b2": (n_y, 1)}

    parameters = {}
    for name, shape in shapes.items():
        if name.startswith("W"):
            parameters[name] = np.random.randn(*shape) * weight_scale
        else:
            parameters[name] = np.zeros(shape)
        assert (parameters[name].shape == shape)

    return parameters

In [None]:
def forward_propagation(X, parameters):
    """
    Run a forward pass through the two-layer network (tanh hidden layer, sigmoid output).

    Argument:
    X -- input data of size (n_x, m)
    parameters -- python dictionary containing your parameters (output of initialization function)

    Returns:
    A2 -- The sigmoid output of the second activation, of shape (n_y, m)
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2"
    """
    # Retrieve each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Implement Forward Propagation to calculate A2 (probabilities)
    Z1 = np.dot(W1,X) + b1
    A1 = np.tanh(Z1)
    Z2 = np.dot(W2,A1) + b2
    # sigmoid returns (activation, cache); only the activation is needed here.
    # Binding the whole tuple to A2 would break the shape check below.
    A2, _ = sigmoid(Z2)

    # One row per output unit, one column per example
    assert(A2.shape == (W2.shape[0], X.shape[1]))

    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}

    return A2, cache

In [None]:
def compute_cost(A2, Y):
    """
    Compute the mean binary cross-entropy between predictions and labels.

    Arguments:
    A2 -- The sigmoid output of the second activation, of shape (1, number of examples)
    Y -- "true" labels vector of shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost as a Python float
    """
    m = Y.shape[1]  # number of examples

    # Contribution of the positive labels and of the negative labels
    positive_term = np.squeeze(np.dot(np.log(A2), Y.T))
    negative_term = np.squeeze(np.dot((1 - Y), np.log(1 - A2).T))
    logprobs = -positive_term - negative_term

    # Average over the examples and collapse to a plain float
    # (e.g. turns [[17]] into 17)
    cost = float(np.squeeze(1/m * logprobs))
    assert(isinstance(cost, float))

    return cost

In [None]:
# GRADED FUNCTION: backward_propagation

def backward_propagation(parameters, cache, X, Y):
    """
    Compute the gradients of the two-layer network (tanh hidden layer, sigmoid output).

    Arguments:
    parameters -- python dictionary containing our parameters; only "W2" is read
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2"; only "A1" and "A2" are read
    X -- input data of shape (n_x, number of examples)
    Y -- "true" labels vector of shape (1, number of examples)

    Returns:
    grads -- python dictionary with the gradients "dW1", "db1", "dW2" and "db2"
    """
    m = X.shape[1]  # number of examples

    # W1 is not needed by any of the gradient formulas, so it is not looked up.
    W2 = parameters["W2"]

    A1 = cache["A1"]
    A2 = cache["A2"]

    # Output layer: for sigmoid + cross-entropy the error term is A2 - Y
    dZ2 = A2 - Y
    dW2 = 1/m*np.dot(dZ2,A1.T)
    db2 = 1/m*np.sum(dZ2,axis=1,keepdims=True)
    # Hidden layer: (1 - A1^2) is the derivative of tanh
    dZ1 = np.dot(W2.T,dZ2)*(1-np.power(A1,2))
    dW1 = 1/m*np.dot(dZ1,X.T)
    db1 = 1/m*np.sum(dZ1,axis = 1, keepdims = True)

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}

    return grads

In [None]:
# GRADED FUNCTION: update_parameters

def update_parameters(parameters, grads, learning_rate = 1.2):
    """
    Apply one gradient-descent step to every parameter.

    Arguments:
    parameters -- python dictionary containing "W1", "b1", "W2" and "b2"
    grads -- python dictionary containing "dW1", "db1", "dW2" and "db2"
    learning_rate -- step size of the update

    Returns:
    parameters -- a new python dictionary with the updated "W1", "b1", "W2" and "b2"
    """
    updated = {}
    for name in ("W1", "b1", "W2", "b2"):
        # Each gradient is stored under the parameter name prefixed with "d"
        updated[name] = parameters[name] - learning_rate * grads["d" + name]
    return updated

In [None]:
def two_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """
    Train a two-layer network (LINEAR->RELU->LINEAR->SIGMOID) with batch gradient descent.

    Arguments:
    X -- input data, of shape (n_x, number of examples)
    Y -- true "label" vector, of shape (1, number of examples)
    layers_dims -- dimensions of the layers (n_x, n_h, n_y)
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- If set to True, the cost is printed every 100 iterations

    Returns:
    parameters -- a dictionary containing W1, W2, b1, and b2

    Side effects: reseeds NumPy's global random generator with 1 and shows a
    plot of the cost recorded every 100 iterations.
    """
    np.random.seed(1)
    cost_history = []  # cost sampled every 100 iterations
    n_x, n_h, n_y = layers_dims

    parameters = initialize_parameters(n_x, n_h, n_y)

    for step in range(num_iterations):
        # Forward pass: LINEAR -> RELU -> LINEAR -> SIGMOID
        hidden, hidden_cache = linear_activation_forward(X, parameters["W1"], parameters["b1"], 'relu')
        output, output_cache = linear_activation_forward(hidden, parameters["W2"], parameters["b2"], 'sigmoid')

        cost = compute_cost(output, Y)

        # Derivative of the cross-entropy cost with respect to the network output
        d_output = - (np.divide(Y, output) - np.divide(1 - Y, 1 - output))

        # Backward pass, output layer first; the gradient with respect to the
        # input data is not needed
        d_hidden, dW2, db2 = linear_activation_backward(d_output, output_cache, 'sigmoid')
        _, dW1, db1 = linear_activation_backward(d_hidden, hidden_cache, 'relu')

        step_grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2}
        parameters = update_parameters(parameters, step_grads, learning_rate)

        # Report and record the cost every 100 iterations
        if step % 100 == 0:
            if print_cost:
                print("Cost after iteration {}: {}".format(step, np.squeeze(cost)))
            cost_history.append(cost)

    # Plot the recorded costs
    plt.plot(np.squeeze(cost_history))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

In [None]:
# Train the NumPy two-layer network on the current X/Y with the layer sizes chosen above.
# NOTE(review): two_layer_model uses a sigmoid output with a cross-entropy cost, which
# assumes labels in [0, 1] -- confirm Y satisfies that before relying on the result.
layers_dims = (n_x,n_h,n_y)
params = two_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False)

## Predict