# Analysis on Score with Players and on More/Less 1.5 and 2.5 Goals

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## DB Connection

In [2]:
cnx = sqlite3.connect('../data/database.sqlite')

## Retrieve Match and Player DF

In [3]:
match_df = pd.read_sql_query("SELECT * from Match", cnx)

In [4]:
player_df = pd.read_sql_query("SELECT * from Player_Attributes", cnx)

## Clean Match and Player DF

In [5]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Columns: 115 entries, id to BSA
dtypes: float64(96), int64(9), object(10)
memory usage: 22.8+ MB


In [6]:
match_df.iloc[:, :50].columns

Index(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1', 'home_player_X2',
       'home_player_X3', 'home_player_X4', 'home_player_X5', 'home_player_X6',
       'home_player_X7', 'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2', 'away_player_X3',
       'away_player_X4', 'away_player_X5', 'away_player_X6', 'away_player_X7',
       'away_player_X8', 'away_player_X9', 'away_player_X10',
       'away_player_X11', 'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6', 'home_player_Y7',
       'home_player_Y8', 'home_player_Y9', 'home_player_Y10',
       'home_player_Y11', 'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
       'away_player_Y4', 'away_player_Y5', 'away_player_Y6'],
      dtype='object')

In [7]:
match_df.iloc[:, 51:].columns

Index(['away_player_Y8', 'away_player_Y9', 'away_player_Y10',
       'away_player_Y11', 'home_player_1', 'home_player_2', 'home_player_3',
       'home_player_4', 'home_player_5', 'home_player_6', 'home_player_7',
       'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11',
       'away_player_1', 'away_player_2', 'away_player_3', 'away_player_4',
       'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8',
       'away_player_9', 'away_player_10', 'away_player_11', 'goal', 'shoton',
       'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession',
       'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
       'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH',
       'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD',
       'BSA'],
      dtype='object')

In [8]:
match_df = match_df[['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_1', 'home_player_2', 'home_player_3',
       'home_player_4', 'home_player_5', 'home_player_6', 'home_player_7',
       'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11',
       'away_player_1', 'away_player_2', 'away_player_3', 'away_player_4',
       'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8',
       'away_player_9', 'away_player_10', 'away_player_11']]

In [9]:
match_df

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,,,,,,,,,,
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,,,,,,,,,,
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,,,,,,,,,,
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,,,,,,,,,,
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,...,563066.0,8800.0,67304.0,158253.0,133126.0,186524.0,93223.0,121115.0,232110.0,289732.0
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,...,114792.0,150007.0,178119.0,27232.0,570830.0,260708.0,201704.0,36382.0,34082.0,95257.0
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,...,67349.0,202663.0,32597.0,114794.0,188114.0,25840.0,482200.0,95230.0,451335.0,275122.0
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,...,121080.0,197757.0,260964.0,231614.0,113235.0,41116.0,462608.0,42262.0,92252.0,194532.0


In [10]:
#Define function to add the number of goals column
def number_goals(s1, s2):
    return s1+s2

In [11]:
match_df["nb_goals"] = pd.Series(dtype="int64")
match_df["nb_goals"] = match_df[["home_team_goal", "away_team_goal"]].apply(lambda x: number_goals(match_df["home_team_goal"], match_df["away_team_goal"]))

In [12]:
## Add the score column base on the home_team_goal and away_team_goal
match_df["score"] = pd.Series(dtype="int64")

for ix, row in match_df.iterrows():
    if row["home_team_goal"] > row["away_team_goal"]:
        match_df.loc[ix, "score"] = 0 ## O is home victory
    elif row["home_team_goal"] < row["away_team_goal"]:
        match_df.loc[ix, "score"] = 2 ## 2 is away victory
    else:
        match_df.loc[ix, "score"] = 1 ## 1 is a draw

In [13]:
match_df[["home_team_goal", "away_team_goal", "nb_goals", "score"]].head()

Unnamed: 0,home_team_goal,away_team_goal,nb_goals,score
0,1,1,2,1.0
1,0,0,0,1.0
2,0,3,3,2.0
3,5,0,5,0.0
4,1,3,4,2.0


In [14]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                25979 non-null  int64  
 1   country_id        25979 non-null  int64  
 2   league_id         25979 non-null  int64  
 3   season            25979 non-null  object 
 4   stage             25979 non-null  int64  
 5   date              25979 non-null  object 
 6   match_api_id      25979 non-null  int64  
 7   home_team_api_id  25979 non-null  int64  
 8   away_team_api_id  25979 non-null  int64  
 9   home_team_goal    25979 non-null  int64  
 10  away_team_goal    25979 non-null  int64  
 11  home_player_1     24755 non-null  float64
 12  home_player_2     24664 non-null  float64
 13  home_player_3     24698 non-null  float64
 14  home_player_4     24656 non-null  float64
 15  home_player_5     24663 non-null  float64
 16  home_player_6     24654 non-null  float6

In [15]:
for i in range (1,12):
    match_df = match_df[match_df["home_player_"+str(i)].notna()]
    match_df = match_df[match_df["away_player_"+str(i)].notna()]

In [16]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21374 entries, 145 to 25978
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                21374 non-null  int64  
 1   country_id        21374 non-null  int64  
 2   league_id         21374 non-null  int64  
 3   season            21374 non-null  object 
 4   stage             21374 non-null  int64  
 5   date              21374 non-null  object 
 6   match_api_id      21374 non-null  int64  
 7   home_team_api_id  21374 non-null  int64  
 8   away_team_api_id  21374 non-null  int64  
 9   home_team_goal    21374 non-null  int64  
 10  away_team_goal    21374 non-null  int64  
 11  home_player_1     21374 non-null  float64
 12  home_player_2     21374 non-null  float64
 13  home_player_3     21374 non-null  float64
 14  home_player_4     21374 non-null  float64
 15  home_player_5     21374 non-null  float64
 16  home_player_6     21374 non-null  floa

In [17]:
player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  int64  
 1   player_fifa_api_id   183978 non-null  int64  
 2   player_api_id        183978 non-null  int64  
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

In [18]:
## Define function to return the overall rating of each player on the peach

def players_rating(s):
    if player_df[player_df["player_api_id"] == s].empty:
        return 0
    return int(player_df[player_df["player_api_id"] == s]["overall_rating"])

In [19]:
player_df = player_df.sort_values(by=["date", "overall_rating"], ascending=False).drop_duplicates("player_api_id")

In [20]:
player_df = player_df.sort_values(by="overall_rating", ascending=False)

In [21]:
for i in range(1,12):
    match_df["home_player_"+str(i)] = match_df["home_player_"+str(i)].apply(lambda s: int(s))
    match_df["away_player_"+str(i)] = match_df["away_player_"+str(i)].apply(lambda s: int(s))

### Adding Column home/away_player_rate_n

In [22]:
for i in range(1,12):
    match_df["home_player_rate"+str(i)] = match_df["home_player_"+str(i)].apply(players_rating) 
for i in range(1,12):
    match_df["away_player_rate"+str(i)] = match_df["away_player_"+str(i)].apply(players_rating)

### Change dtype of score's column

In [23]:
match_df["score"] = match_df["score"].astype('int64')

### Change dtype of Data in `match_df`

In [24]:
match_df["date"] = pd.to_datetime(match_df["date"])

### Create More/Less 1.5 Goals and More/Less 2.5 Goals columns

In [25]:
match_df["more_less_1.5_goals"] = match_df["nb_goals"].apply(lambda x: 0 if x<2 else 1)

In [26]:
match_df["more_less_2.5_goals"] = match_df["nb_goals"].apply(lambda x: 0 if x<3 else 1)

---

## Analyse Match DF

In [27]:
match_df.columns

Index(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11',
       'nb_goals', 'score', 'home_player_rate1', 'home_player_rate2',
       'home_player_rate3', 'home_player_rate4', 'home_player_rate5',
       'home_player_rate6', 'home_player_rate7', 'home_player_rate8',
       'home_player_rate9', 'home_player_rate10', 'home_player_rate11',
       'away_player_rate1', 'away_player_rate2', 'away_player_rate3',
       'away_player_rate4', 'away_player_rate5', 'away_player_

In [29]:
match_df[['score', 'nb_goals', 'more_less_1.5_goals',
       'more_less_2.5_goals','home_player_rate1', 'home_player_rate2',
       'home_player_rate3', 'home_player_rate4', 'home_player_rate5',
       'home_player_rate6', 'home_player_rate7', 'home_player_rate8',
       'home_player_rate9', 'home_player_rate10', 'home_player_rate11',
       'away_player_rate1', 'away_player_rate2', 'away_player_rate3',
       'away_player_rate4', 'away_player_rate5', 'away_player_rate6',
       'away_player_rate7', 'away_player_rate8', 'away_player_rate9',
       'away_player_rate10', 'away_player_rate11']].corr()

Unnamed: 0,score,nb_goals,more_less_1.5_goals,more_less_2.5_goals,home_player_rate1,home_player_rate2,home_player_rate3,home_player_rate4,home_player_rate5,home_player_rate6,...,away_player_rate2,away_player_rate3,away_player_rate4,away_player_rate5,away_player_rate6,away_player_rate7,away_player_rate8,away_player_rate9,away_player_rate10,away_player_rate11
score,1.0,-0.051135,-0.033537,-0.046012,-0.159051,-0.16136,-0.164457,-0.174332,-0.159563,-0.168452,...,0.135228,0.147215,0.14939,0.147108,0.153638,0.152971,0.152245,0.148708,0.154601,0.146564
nb_goals,-0.051135,1.0,0.689244,0.80433,0.009692,0.034476,0.03832,0.033798,0.036227,0.04586,...,-0.001137,-0.014831,-0.010422,-0.01448,-0.000213,0.001784,0.001027,0.016274,0.021356,0.009356
more_less_1.5_goals,-0.033537,0.689244,1.0,0.5833,-0.000456,0.012935,0.018095,0.017915,0.014699,0.021736,...,-0.008273,-0.017767,-0.017654,-0.021292,-0.007367,-0.006247,-0.005947,0.005218,0.008579,0.005002
more_less_2.5_goals,-0.046012,0.80433,0.5833,1.0,0.010741,0.026866,0.03038,0.03094,0.024046,0.03938,...,0.00139,-0.013365,-0.006464,-0.012036,-0.001691,0.000222,0.0017,0.014236,0.016639,0.006715
home_player_rate1,-0.159051,0.009692,-0.000456,0.010741,1.0,0.612088,0.63485,0.622579,0.587848,0.60931,...,0.371087,0.377031,0.384346,0.347004,0.351997,0.351916,0.353365,0.346226,0.344875,0.358299
home_player_rate2,-0.16136,0.034476,0.012935,0.026866,0.612088,1.0,0.639476,0.63401,0.607686,0.640327,...,0.380088,0.390681,0.394959,0.356579,0.362777,0.363802,0.361288,0.350897,0.360084,0.36834
home_player_rate3,-0.164457,0.03832,0.018095,0.03038,0.63485,0.639476,1.0,0.643797,0.615043,0.643297,...,0.388916,0.402155,0.396624,0.359705,0.371117,0.364786,0.370064,0.358275,0.354451,0.373292
home_player_rate4,-0.174332,0.033798,0.017915,0.03094,0.622579,0.63401,0.643797,1.0,0.595479,0.641987,...,0.392429,0.395675,0.392859,0.359037,0.369824,0.369047,0.364035,0.352468,0.359828,0.37094
home_player_rate5,-0.159563,0.036227,0.014699,0.024046,0.587848,0.607686,0.615043,0.595479,1.0,0.608265,...,0.35623,0.363936,0.364235,0.326492,0.344594,0.33764,0.344657,0.333086,0.331915,0.344902
home_player_rate6,-0.168452,0.04586,0.021736,0.03938,0.60931,0.640327,0.643297,0.641987,0.608265,1.0,...,0.365222,0.378871,0.37498,0.340886,0.350507,0.35038,0.34793,0.333761,0.336412,0.349495


---

## Modelling without PreProcessing (Scaling)

### Importing Modelling libraries

In [30]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping


In [32]:
X_init = match_df[['home_player_rate1', 'home_player_rate2',
       'home_player_rate3', 'home_player_rate4', 'home_player_rate5',
       'home_player_rate6', 'home_player_rate7', 'home_player_rate8',
       'home_player_rate9', 'home_player_rate10', 'home_player_rate11',
       'away_player_rate1', 'away_player_rate2', 'away_player_rate3',
       'away_player_rate4', 'away_player_rate5', 'away_player_rate6',
       'away_player_rate7', 'away_player_rate8', 'away_player_rate9',
       'away_player_rate10', 'away_player_rate11']]

### Predict Score (1 | N | 2)

In [43]:
y = match_df['score']

In [62]:
y.value_counts()

0    9810
2    6166
1    5398
Name: score, dtype: int64

In [70]:
X_score = X_init

In [71]:
y_score_cat = y_cat = to_categorical(y, dtype='int64')

In [72]:
X_score_train, X_score_test, y_score_train, y_score_test = train_test_split(X_score, y_score_cat, test_size=0.3, random_state=42)

In [73]:
X_score_train.shape, X_score_test.shape, y_score_train.shape, y_score_test.shape

((14961, 22), (6413, 22), (14961, 3), (6413, 3))

In [74]:
#Init model

model_score = Sequential()
model_score.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_score.add(layers.Dense(2, activation='relu')) # ReLU activation is the prefered default activation fonction
model_score.add(layers.Dense(3, activation='softmax')) # Number of neurones correspond to the number of categorical values to classify

In [75]:
#Compile model
model_score.compile(loss='categorical_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [76]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 


model_score.fit(X_score_train,                ## doing the fit on the train data
          y_score_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=16,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Restoring model weights from the end of the best epoch.
Epoch 00037: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b60dee90>

In [77]:
#Evaluate the model
model_score.evaluate(X_score_test, y_score_test, verbose=1)  ## Evaluate the model with the test set



[0.9797433614730835, 0.5240916609764099]

### Predict More/Less 1.5 Goals

In [104]:
X_ml_1_5 = X_init

In [105]:
y_ml_1_5 = match_df["more_less_1.5_goals"] 

In [106]:
y_ml_1_5.value_counts()

1    16147
0     5227
Name: more_less_1.5_goals, dtype: int64

In [107]:
X_ml_1_5_train, X_ml_1_5_test, y_ml_1_5_train, y_ml_1_5_test = train_test_split(X_ml_1_5, y_ml_1_5, test_size=0.3, random_state=42)

In [108]:
X_ml_1_5_train.shape, X_ml_1_5_test.shape, y_ml_1_5_train.shape, y_ml_1_5_test.shape

((14961, 22), (6413, 22), (14961,), (6413,))

In [109]:
#Init model

model_ml_1_5 = Sequential()
model_ml_1_5.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_ml_1_5.add(layers.Dense(2, activation='relu')) # ReLU activation is the prefered default activation fonction
model_ml_1_5.add(layers.Dense(1, activation='sigmoid')) # Number of neurones correspond to the number of categorical values to classify

In [110]:
#Compile model
model_ml_1_5.compile(loss='binary_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [111]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 


model_ml_1_5.fit(X_ml_1_5_train,                ## doing the fit on the train data
          y_ml_1_5_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=16,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b8538950>

In [112]:
#Evaluate the model
model_ml_1_5.evaluate(X_ml_1_5_test, y_ml_1_5_test, verbose=1)  ## Evaluate the model with the test set



[0.5531734824180603, 0.7584593892097473]

### Predict More/Less 2.5 Goals

In [87]:
X_ml_2_5 = X_init

In [88]:
y_ml_2_5 = match_df["more_less_2.5_goals"] 

In [89]:
y_ml_2_5.value_counts()

1    10953
0    10421
Name: more_less_2.5_goals, dtype: int64

In [90]:
X_ml_2_5_train, X_ml_2_5_test, y_ml_2_5_train, y_ml_2_5_test = train_test_split(X_ml_2_5, y_ml_2_5, test_size=0.3, random_state=42)

In [91]:
X_ml_2_5_train.shape, X_ml_2_5_test.shape, y_ml_2_5_train.shape, y_ml_2_5_test.shape

((14961, 22), (6413, 22), (14961,), (6413,))

In [100]:
#Init model

model_ml_2_5 = Sequential()
model_ml_2_5.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_ml_2_5.add(layers.Dense(3, activation='relu')) # ReLU activation is the prefered default activation fonction
model_ml_2_5.add(layers.Dense(1, activation='sigmoid')) # Number of neurones correspond to the number of categorical values to classify

In [101]:
#Compile model
model_ml_2_5.compile(loss='binary_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [102]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 


model_ml_2_5.fit(X_ml_2_5_train,                ## doing the fit on the train data
          y_ml_2_5_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=32,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b83df890>

In [103]:
#Evaluate the model
model_ml_2_5.evaluate(X_ml_2_5_test, y_ml_2_5_test, verbose=1)  ## Evaluate the model with the test set



[0.692613422870636, 0.516606867313385]

## Modelling with Preprocessing (Scaling)

### Predict Score (1 | N | 2)

In [113]:
Xs_score = X_init

In [114]:
# Instanciate Robust Scaler
r_scaler = RobustScaler()

# Fit scaler to feature
r_scaler.fit(X_init)

# Scale
Xs_score = r_scaler.transform(Xs_score)

In [120]:
y.value_counts()

0    9810
2    6166
1    5398
Name: score, dtype: int64

In [121]:
Xs_score_train, Xs_score_test, y_score_train, y_score_test = train_test_split(Xs_score, y_score_cat, test_size=0.3, random_state=42)

In [122]:
Xs_score_train.shape, Xs_score_test.shape, y_score_train.shape, y_score_test.shape

((14961, 22), (6413, 22), (14961, 3), (6413, 3))

In [123]:
#Init model

model_s_score = Sequential()
model_s_score.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_s_score.add(layers.Dense(2, activation='relu')) # ReLU activation is the prefered default activation fonction
model_s_score.add(layers.Dense(3, activation='softmax')) # Number of neurones correspond to the number of categorical values to classify

In [124]:
#Compile model
model_s_score.compile(loss='categorical_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [126]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 

model_s_score.fit(Xs_score_train,                ## doing the fit on the train data
          y_score_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=16,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Restoring model weights from the end of the best epoch.
Epoch 00040: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b8720e90>

In [127]:
#Evaluate the model
model_s_score.evaluate(Xs_score_test, y_score_test, verbose=1)  ## Evaluate the model with the test set



[0.9744535088539124, 0.5251832008361816]

### Predict More/Less 1.5 Goals

In [128]:
Xs_ml_1_5 = X_init

In [129]:
y_ml_1_5 = match_df["more_less_1.5_goals"] 

In [130]:
y_ml_1_5.value_counts()

1    16147
0     5227
Name: more_less_1.5_goals, dtype: int64

In [131]:
Xs_ml_1_5_train, Xs_ml_1_5_test, y_ml_1_5_train, y_ml_1_5_test = train_test_split(Xs_ml_1_5, y_ml_1_5, test_size=0.3, random_state=42)

In [132]:
Xs_ml_1_5_train.shape, Xs_ml_1_5_test.shape, y_ml_1_5_train.shape, y_ml_1_5_test.shape

((14961, 22), (6413, 22), (14961,), (6413,))

In [133]:
#Init model

model_s_ml_1_5 = Sequential()
model_s_ml_1_5.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_s_ml_1_5.add(layers.Dense(2, activation='relu')) # ReLU activation is the prefered default activation fonction
model_s_ml_1_5.add(layers.Dense(1, activation='sigmoid')) # Number of neurones correspond to the number of categorical values to classify

In [134]:
#Compile model
model_s_ml_1_5.compile(loss='binary_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [135]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 

model_s_ml_1_5.fit(Xs_ml_1_5_train,                ## doing the fit on the train data
          y_ml_1_5_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=16,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b8843610>

In [136]:
#Evaluate the model
model_s_ml_1_5.evaluate(Xs_ml_1_5_test, y_ml_1_5_test, verbose=1)  ## Evaluate the model with the test set



[0.5532316565513611, 0.7584593892097473]

### Predict More/Less 2.5 Goals

In [137]:
Xs_ml_2_5 = X_init

In [138]:
y_ml_2_5 = match_df["more_less_2.5_goals"] 

In [142]:
y_ml_2_5.value_counts()

1    10953
0    10421
Name: more_less_2.5_goals, dtype: int64

In [143]:
Xs_ml_2_5_train, Xs_ml_2_5_test, y_ml_2_5_train, y_ml_1_5_test = train_test_split(Xs_ml_2_5, y_ml_2_5, test_size=0.3, random_state=42)

In [144]:
Xs_ml_2_5_train.shape, Xs_ml_2_5_test.shape, y_ml_2_5_train.shape, y_ml_2_5_test.shape

((14961, 22), (6413, 22), (14961,), (6413,))

In [145]:
#Init model

model_s_ml_2_5 = Sequential()
model_s_ml_2_5.add(layers.Dense(5, activation='relu', input_dim=22)) # put the dimensions of X.shape for x1,x2,x3 
model_s_ml_2_5.add(layers.Dense(2, activation='relu')) # ReLU activation is the prefered default activation fonction
model_s_ml_2_5.add(layers.Dense(1, activation='sigmoid')) # Number of neurones correspond to the number of categorical values to classify

In [146]:
#Compile model
model_s_ml_2_5.compile(loss='binary_crossentropy',   # The loss is calculated with the categorical_crossentropy parameter
              optimizer='adam',
              metrics=['accuracy'])

In [147]:
#Fit model
es = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)  ## define the number of patience (retries before stopping the iteration epohcs) 

model_s_ml_2_5.fit(Xs_ml_2_5_train,                ## doing the fit on the train data
          y_ml_2_5_train,                ## doing the fit on the train data
          validation_split=0.3,   ## Validation set (here split the 30% of the train data)
          epochs=100,             ## Number of epochs to iterate (the EarlyStopping should stop before arriving at the end if find optimum acu)
          batch_size=16,          ## Number of batch size. Slice the data to adjust weights
          callbacks = [es])       ## Calling EarlyStopping

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f62b89f5950>

In [148]:
#Evaluate the model
model_s_ml_2_5.evaluate(Xs_ml_2_5_test, y_ml_2_5_test, verbose=1)  ## Evaluate the model with the test set



[0.6929056644439697, 0.516606867313385]