# Football Result Predictor Model

## Dataset

In [1]:
#Import dependencies
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from IPython.display import display
import seaborn as sns
import datetime as dt
import os
import sys

In [2]:
# Fetch Dataset - Get Relevant Stats : Premier League Data since Season 2005-06 (15 Full Seasons)
# Dataset from www.football-data.co.uk
dfList = []
loc = "/Users/shafiquemahen/Documents/Final Project/Datasets/"
season1 = pd.read_csv(loc + "2005-06.csv")
season2 = pd.read_csv(loc + "2006-07.csv")
season3 = pd.read_csv(loc + "2007-08.csv")
season4 = pd.read_csv(loc + "2008-09.csv")
season5 = pd.read_csv(loc + "2009-10.csv")
season6 = pd.read_csv(loc + "2010-11.csv")
season7 = pd.read_csv(loc + "2011-12.csv")
season8 = pd.read_csv(loc + "2012-13.csv")
season9 = pd.read_csv(loc + "2013-14.csv")
season10 = pd.read_csv(loc + "2014-15.csv")
season11 = pd.read_csv(loc + "2015-16.csv")
season12 = pd.read_csv(loc + "2016-17.csv")
season13 = pd.read_csv(loc + "2017-18.csv")
season14 = pd.read_csv(loc + "2018-19.csv")
season15 = pd.read_csv(loc + "2019-20.csv")

df = pd.concat([season1, season2, season3, season4, season5, season6, season7, season8, season9, season10, season11, season12, season13, season14, season15])

df = df[['HomeTeam','AwayTeam','FTHG','FTAG','FTR', 'HS', 'AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','HTHG','HTAG','HTR']]
dfList.append(df)

#Legend
#HomeTeam - Home Team
#AwayTeam - Away Team
#FTHG - Full Time Home Goals
#FTAG - Full Time Away Goals
#FTR - Full Time Result
#HS - Home Shots
#AS - Away Shots
#HST - Home Shots on Target
#AST - Away Shots on Target
#HF - Home Fouls
#AF - Away Fouls
#HC - Home Corners
#AC - Away Corners
#HY - Home Yellow Cards
#AY - Away Yellow Cards
#HR - Home Red Cards
#AR - Away Red Cards
#HTHG - Half Time Home Goals
#HTAG - Half Time Away Goals
#HTR - Half Time Result

eplData = dfList[0].append(dfList[1:])
#Get rid of NaN values
eplData = eplData.dropna()
display(eplData)


Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR
0,Aston Villa,Bolton,2.0,2.0,D,3.0,13.0,2.0,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,2.0,2.0,D
1,Everton,Man United,0.0,2.0,A,10.0,12.0,5.0,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,0.0,1.0,A
2,Fulham,Birmingham,0.0,0.0,D,15.0,7.0,7.0,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,0.0,0.0,D
3,Man City,West Brom,0.0,0.0,D,15.0,13.0,8.0,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,0.0,0.0,D
4,Middlesbrough,Liverpool,0.0,0.0,D,4.0,16.0,2.0,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Leicester,Man United,0.0,2.0,A,14.0,7.0,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,D
376,Man City,Norwich,5.0,0.0,H,31.0,5.0,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,H
377,Newcastle,Liverpool,1.0,3.0,A,3.0,14.0,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,1.0,1.0,D
378,Southampton,Sheffield United,3.0,1.0,H,13.0,5.0,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,A


## Preparing Dataset

In [3]:
#Convert Result Objects to Integers
eplData.loc[eplData['FTR'] == "D", 'FTR'] = 0
eplData.loc[eplData['FTR'] == "H", 'FTR'] = 1
eplData.loc[eplData['FTR'] == "A", 'FTR'] = -1

eplData.loc[eplData['HTR'] == "D", 'HTR'] = 0
eplData.loc[eplData['HTR'] == "H", 'HTR'] = 1
eplData.loc[eplData['HTR'] == "A", 'HTR'] = -1

#Assign each team a unique value
enumeratedTeams = list(enumerate(eplData.HomeTeam.unique()))

for value in eplData.HomeTeam.unique():
    new_value = [x for (x, y) in enumeratedTeams if y is value][0]
    eplData.loc[eplData['HomeTeam'] == value, 'HomeTeam'] = new_value
    eplData.loc[eplData['AwayTeam'] == value, 'AwayTeam'] = new_value
    
display(eplData)
print(enumeratedTeams)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR
0,0,18,2.0,2.0,0,3.0,13.0,2.0,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,2.0,2.0,0
1,1,14,0.0,2.0,-1,10.0,12.0,5.0,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,0.0,1.0,-1
2,2,10,0.0,0.0,0,15.0,7.0,7.0,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,0.0,0.0,0
3,3,17,0.0,0.0,0,15.0,13.0,8.0,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,0.0,0.0,0
4,4,13,0.0,0.0,0,4.0,16.0,2.0,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,35,14,0.0,2.0,-1,14.0,7.0,3.0,3.0,12.0,11.0,3.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0
376,3,31,5.0,0.0,1,31.0,5.0,10.0,4.0,7.0,4.0,9.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1
377,15,13,1.0,3.0,-1,3.0,14.0,2.0,6.0,11.0,5.0,2.0,4.0,1.0,0.0,0.0,0.0,1.0,1.0,0
378,32,21,3.0,1.0,1,13.0,5.0,4.0,3.0,9.0,16.0,9.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1


[(0, 'Aston Villa'), (1, 'Everton'), (2, 'Fulham'), (3, 'Man City'), (4, 'Middlesbrough'), (5, 'Portsmouth'), (6, 'Sunderland'), (7, 'West Ham'), (8, 'Arsenal'), (9, 'Wigan'), (10, 'Birmingham'), (11, 'Blackburn'), (12, 'Charlton'), (13, 'Liverpool'), (14, 'Man United'), (15, 'Newcastle'), (16, 'Tottenham'), (17, 'West Brom'), (18, 'Bolton'), (19, 'Chelsea'), (20, 'Reading'), (21, 'Sheffield United'), (22, 'Watford'), (23, 'Derby'), (24, 'Hull'), (25, 'Stoke'), (26, 'Wolves'), (27, 'Burnley'), (28, 'Blackpool'), (29, 'QPR'), (30, 'Swansea'), (31, 'Norwich'), (32, 'Southampton'), (33, 'Crystal Palace'), (34, 'Cardiff'), (35, 'Leicester'), (36, 'Bournemouth'), (37, 'Brighton'), (38, 'Huddersfield')]


## Creating a Dummy Classifier

In [4]:
#Separate Results and Prediction Data
X_dummy = eplData.drop(['FTR'],1)
X_dummy = normalize(X_dummy)
y_all = eplData['FTR']
y_all = y_all.astype(int)
display(y_all)
#Create a Dummy Classifier that uses the most frequent result as it's prediction
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_dummy, y_all)
dummy_clf.predict(X_dummy)
dummy_score = dummy_clf.score(X_dummy, y_all)
print(dummy_clf.predict(X_dummy))
print("Dummy Classifier Score:", dummy_score*100, "%")


0      0
1     -1
2      0
3      0
4      0
      ..
375   -1
376    1
377   -1
378    1
379    0
Name: FTR, Length: 5700, dtype: int64

[1 1 1 ... 1 1 1]
Dummy Classifier Score: 46.54385964912281 %


## Experiment 1 - Model that predicts only using Shots and Goal data

In [5]:
#Create Dataset that only has Shots and Goal data
X_exp1 = eplData.drop(['FTHG','FTAG','FTR','HF','AF','HC','AC','HY','AY','HR','AR','HTR'],1)

# Normalize data.
#cols = [['FTHG','FTAG', 'HS', 'AS','HST','AST','HTHG','HTAG']]
cols = [['HS', 'AS','HST','AST','HTHG','HTAG']]

for col in cols:
    X_exp1[col] = normalize(X_exp1[col])

#Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_exp1, y_all, 
                                                    test_size = 0.2,
                                                    random_state = 5,
                                                    stratify = y_all)

display(X_train)
display(X_test)
display(y_train)
display(y_test)

# Get Models
clf_1 = LogisticRegression(multi_class='multinomial',random_state = 42, solver='lbfgs', max_iter=3000)
clf_2 = SVC(random_state = 419, kernel='rbf')
clf_3 = KNeighborsClassifier(n_neighbors = 67) # n_neighbors = Sqrt of training data = Sqrt of 5700*0.8
clf_4 = RandomForestClassifier(max_depth=5, random_state=0)
clf_5 = DecisionTreeClassifier(random_state=8)

# Train and Test Models
clf_1.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_1, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_1, X_test, y_test, cv=5)
print("Logistic Regression Training Set (Cross Validation Score):", scores_train.mean())
print("Logistic Regression Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_2.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_2, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_2, X_test, y_test, cv=5)
print("SVC Training Set (Cross Validation Score):", scores_train.mean())
print("SVC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_3.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_3, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_3, X_test, y_test, cv=5)
print("KNN Training Set (Cross Validation Score):", scores_train.mean())
print("KNN Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_4.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_4, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_4, X_test, y_test, cv=5)
print("RFC Training Set (Cross Validation Score):", scores_train.mean())
print("RFC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_5.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_5, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_5, X_test, y_test, cv=5)
print("DTC Training Set (Cross Validation Score):", scores_train.mean())
print("DTC Test Set (Cross Validation Score):", scores_test.mean())
print("")

Unnamed: 0,HomeTeam,AwayTeam,HS,AS,HST,AST,HTHG,HTAG
15,30,27,0.607457,0.728948,0.303728,0.060746,0.060746,0.000000
332,15,8,0.450035,0.843816,0.225018,0.168763,0.056254,0.056254
184,6,14,0.435194,0.739830,0.130558,0.478714,0.000000,0.130558
113,15,5,0.696526,0.497519,0.348263,0.348263,0.049752,0.149256
114,15,36,0.749350,0.582828,0.249783,0.166522,0.083261,0.041631
...,...,...,...,...,...,...,...,...
234,25,6,0.768922,0.341743,0.512615,0.170872,0.000000,0.000000
98,13,16,0.767836,0.402200,0.475327,0.146254,0.000000,0.036564
339,2,26,0.861586,0.391630,0.313304,0.078326,0.000000,0.000000
70,0,16,0.687118,0.572598,0.343559,0.286299,0.000000,0.000000


Unnamed: 0,HomeTeam,AwayTeam,HS,AS,HST,AST,HTHG,HTAG
104,15,22,0.526316,0.842105,0.105263,0.052632,0.000000,0.000000
73,35,1,0.388973,0.826568,0.097243,0.388973,0.048622,0.048622
366,19,15,0.654031,0.566827,0.305215,0.392419,0.043602,0.043602
34,5,4,0.827516,0.275839,0.472866,0.118217,0.000000,0.039406
71,19,34,0.797452,0.518344,0.279108,0.119618,0.039873,0.039873
...,...,...,...,...,...,...,...,...
66,15,14,0.644386,0.594818,0.148704,0.446113,0.000000,0.099136
303,17,13,0.627054,0.479512,0.516398,0.331970,0.000000,0.000000
356,8,19,0.613396,0.654289,0.286251,0.327144,0.000000,0.081786
379,26,6,0.706207,0.554877,0.353103,0.252217,0.050443,0.050443


15     1
332    1
184   -1
113   -1
114    1
      ..
234   -1
98     1
339    0
70     0
64     1
Name: FTR, Length: 4560, dtype: int64

104    1
73    -1
366    0
34     1
71     1
      ..
66    -1
303    1
356   -1
379    1
227   -1
Name: FTR, Length: 1140, dtype: int64

Logistic Regression Training Set (Cross Validation Score): 0.6324561403508773
Logistic Regression Test Set (Cross Validation Score): 0.6087719298245615

SVC Training Set (Cross Validation Score): 0.48267543859649126
SVC Test Set (Cross Validation Score): 0.48245614035087714

KNN Training Set (Cross Validation Score): 0.4894736842105263
KNN Test Set (Cross Validation Score): 0.4640350877192982

RFC Training Set (Cross Validation Score): 0.6368421052631579
RFC Test Set (Cross Validation Score): 0.6307017543859649

DTC Training Set (Cross Validation Score): 0.5429824561403509
DTC Test Set (Cross Validation Score): 0.5377192982456139



## Experiment 2 - Model that predicts using more indepth Gameplay data

In [6]:
#Create Dataset that has all Gameplay data
X_exp2 = eplData.drop(['FTHG','FTAG','FTR'],1)

# Normalize data
cols = [['HS', 'AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','HTHG','HTAG']]

for col in cols:
    X_exp2[col] = normalize(X_exp2[col])

#Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_exp2, y_all, 
                                                    test_size = 0.2,
                                                    random_state = 5,
                                                    stratify = y_all)

display(X_train)
display(X_test)
display(y_train)
display(y_test)

# Get Models
clf_1 = LogisticRegression(multi_class='multinomial',random_state = 42, solver='lbfgs', max_iter=3000)
clf_2 = SVC(random_state = 419, kernel='rbf')
clf_3 = KNeighborsClassifier(n_neighbors = 67) # n_neighbors = Sqrt of training data = Sqrt of 5700*0.8
clf_4 = RandomForestClassifier(max_depth=5, random_state=0)
clf_5 = DecisionTreeClassifier(random_state=8)

# Train and Test Models
clf_1.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_1, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_1, X_test, y_test, cv=5)
print("Logistic Regression Training Set (Cross Validation Score):", scores_train.mean())
print("Logistic Regression Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_2.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_2, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_2, X_test, y_test, cv=5)
print("SVC Training Set (Cross Validation Score):", scores_train.mean())
print("SVC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_3.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_3, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_3, X_test, y_test, cv=5)
print("KNN Training Set (Cross Validation Score):", scores_train.mean())
print("KNN Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_4.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_4, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_4, X_test, y_test, cv=5)
print("RFC Training Set (Cross Validation Score):", scores_train.mean())
print("RFC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_5.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_5, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_5, X_test, y_test, cv=5)
print("DTC Training Set (Cross Validation Score):", scores_train.mean())
print("DTC Test Set (Cross Validation Score):", scores_test.mean())
print("")

Unnamed: 0,HomeTeam,AwayTeam,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR
15,30,27,0.391031,0.469237,0.195515,0.039103,0.547443,0.508340,0.078206,0.117309,0.078206,0.039103,0.000000,0.000000,0.039103,0.000000,1
332,15,8,0.341743,0.640768,0.170872,0.128154,0.469897,0.384461,0.085436,0.213589,0.042718,0.000000,0.000000,0.000000,0.042718,0.042718,0
184,6,14,0.333704,0.567297,0.100111,0.367075,0.467186,0.400445,0.133482,0.100111,0.066741,0.033370,0.000000,0.000000,0.000000,0.100111,-1
113,15,5,0.465891,0.332779,0.232945,0.232945,0.499169,0.499169,0.133112,0.166390,0.066556,0.066556,0.000000,0.000000,0.033278,0.099834,-1
114,15,36,0.589294,0.458339,0.196431,0.130954,0.294647,0.360124,0.229170,0.327385,0.065477,0.032739,0.000000,0.000000,0.065477,0.032739,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,25,6,0.468521,0.208232,0.312348,0.104116,0.624695,0.312348,0.260290,0.260290,0.000000,0.052058,0.052058,0.000000,0.000000,0.000000,0
98,13,16,0.650870,0.340932,0.402919,0.123975,0.278944,0.340932,0.247950,0.092981,0.092981,0.092981,0.000000,0.000000,0.000000,0.030994,-1
339,2,26,0.506316,0.230144,0.184115,0.046029,0.414259,0.506316,0.230144,0.414259,0.046029,0.000000,0.000000,0.000000,0.000000,0.000000,0
70,0,16,0.487869,0.406558,0.243935,0.203279,0.528525,0.447214,0.081312,0.081312,0.000000,0.040656,0.000000,0.040656,0.000000,0.000000,0


Unnamed: 0,HomeTeam,AwayTeam,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR
104,15,22,0.354218,0.566749,0.070844,0.035422,0.425062,0.389640,0.283375,0.318796,0.035422,0.177109,0.000000,0.00000,0.000000,0.000000,0
73,35,1,0.291343,0.619103,0.072836,0.291343,0.364179,0.400596,0.072836,0.364179,0.072836,0.036418,0.036418,0.00000,0.036418,0.036418,0
366,19,15,0.458778,0.397607,0.214096,0.275267,0.519948,0.397607,0.214096,0.122341,0.122341,0.061170,0.000000,0.00000,0.030585,0.030585,0
34,5,4,0.760750,0.253583,0.434714,0.108679,0.289809,0.217357,0.108679,0.072452,0.072452,0.036226,0.000000,0.00000,0.000000,0.036226,-1
71,19,34,0.659380,0.428597,0.230783,0.098907,0.395628,0.164845,0.197814,0.296721,0.032969,0.065938,0.000000,0.00000,0.032969,0.032969,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,15,14,0.431420,0.398234,0.099558,0.298675,0.497792,0.398234,0.265489,0.199117,0.132745,0.132745,0.000000,0.00000,0.000000,0.066372,-1
303,17,13,0.513036,0.392322,0.422500,0.271607,0.301786,0.452679,0.000000,0.150893,0.060357,0.090536,0.000000,0.00000,0.000000,0.000000,0
356,8,19,0.443484,0.473050,0.206959,0.236525,0.561747,0.354787,0.177394,0.059131,0.029566,0.000000,0.000000,0.00000,0.000000,0.059131,-1
379,26,6,0.466408,0.366463,0.233204,0.166574,0.366463,0.632982,0.133259,0.033315,0.033315,0.066630,0.000000,0.06663,0.033315,0.033315,0


15     1
332    1
184   -1
113   -1
114    1
      ..
234   -1
98     1
339    0
70     0
64     1
Name: FTR, Length: 4560, dtype: int64

104    1
73    -1
366    0
34     1
71     1
      ..
66    -1
303    1
356   -1
379    1
227   -1
Name: FTR, Length: 1140, dtype: int64

Logistic Regression Training Set (Cross Validation Score): 0.655921052631579
Logistic Regression Test Set (Cross Validation Score): 0.6385964912280702

SVC Training Set (Cross Validation Score): 0.6175438596491227
SVC Test Set (Cross Validation Score): 0.5192982456140351

KNN Training Set (Cross Validation Score): 0.5366228070175438
KNN Test Set (Cross Validation Score): 0.4754385964912281

RFC Training Set (Cross Validation Score): 0.6521929824561404
RFC Test Set (Cross Validation Score): 0.6403508771929826

DTC Training Set (Cross Validation Score): 0.5714912280701754
DTC Test Set (Cross Validation Score): 0.5552631578947368



## Experiment 3 - Model that incorporates Team Strength based on Appearances

In [7]:
##Create a new Table with each Team and their Appearance Count
table = pd.DataFrame(columns=('Team','HomeApps','AwayApps'))
table.Team = df.HomeTeam.unique()
#Home Apps should equal Away Apps (19 games each per season)
table.HomeApps = eplData.HomeTeam.value_counts()
table.AwayApps = eplData.AwayTeam.value_counts()

f_HomeApps = []
f_AwayApps = []
##FIX ERROR WITH HUDDERSFIELD SWAPPING WITH NAN VALUE MANUALLY
table.loc[36,'Team'] = 'Huddersfield'
table = table[:-1]

for index,row in df.iterrows():
    f_HomeApps.append(table[table['Team'] == row['HomeTeam']]['HomeApps'].values)
    f_AwayApps.append(table[table['Team'] == row['AwayTeam']]['AwayApps'].values)
    
df['HomeApps'] = f_HomeApps
df['AwayApps'] = f_AwayApps


print(table)

#Convert Result Objects to Integers
df.loc[df['FTR'] == "D", 'FTR'] = 0
df.loc[df['FTR'] == "H", 'FTR'] = 1
df.loc[df['FTR'] == "A", 'FTR'] = -1

df.loc[df['HTR'] == "D", 'HTR'] = 0
df.loc[df['HTR'] == "H", 'HTR'] = 1
df.loc[df['HTR'] == "A", 'HTR'] = -1

#print(df)

#Team Strength now represented with Appearance Count so team name values redundant
X_exp3 = df.drop(['HomeTeam','AwayTeam','FTHG','FTAG','FTR'],1)

#Drop NaN row manually
X_exp3 = X_exp3.drop([380])

# Normalize data
cols = [['HS', 'AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','HTHG','HTAG','HomeApps','AwayApps']]
for col in cols:
    X_exp3[col] = normalize(X_exp3[col])

#Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_exp3, y_all, 
                                                    test_size = 0.2,
                                                    random_state = 5,
                                                    stratify = y_all)

display(X_train)
display(X_test)
display(y_train)
display(y_test)

# Get Models
clf_1 = LogisticRegression(multi_class='multinomial',random_state = 42, solver='lbfgs', max_iter=3000)
clf_2 = SVC(random_state = 419, kernel='rbf')
clf_3 = KNeighborsClassifier(n_neighbors = 67) # n_neighbors = Sqrt of training data = Sqrt of 5700*0.8
clf_4 = RandomForestClassifier(max_depth=5, random_state=0)
clf_5 = DecisionTreeClassifier(random_state=8)

# Train and Test Models
clf_1.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_1, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_1, X_test, y_test, cv=5)
print("Logistic Regression Training Set (Cross Validation Score):", scores_train.mean())
print("Logistic Regression Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_2.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_2, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_2, X_test, y_test, cv=5)
print("SVC Training Set (Cross Validation Score):", scores_train.mean())
print("SVC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_3.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_3, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_3, X_test, y_test, cv=5)
print("KNN Training Set (Cross Validation Score):", scores_train.mean())
print("KNN Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_4.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_4, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_4, X_test, y_test, cv=5)
print("RFC Training Set (Cross Validation Score):", scores_train.mean())
print("RFC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_5.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_5, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_5, X_test, y_test, cv=5)
print("DTC Training Set (Cross Validation Score):", scores_train.mean())
print("DTC Test Set (Cross Validation Score):", scores_test.mean())
print("")



    

                Team  HomeApps  AwayApps
0        Aston Villa     228.0     228.0
1            Everton     285.0     285.0
2             Fulham     190.0     190.0
3           Man City     285.0     285.0
4      Middlesbrough      95.0      95.0
5         Portsmouth      95.0      95.0
6         Sunderland     209.0     209.0
7           West Ham     266.0     266.0
8            Arsenal     285.0     285.0
9              Wigan     152.0     152.0
10        Birmingham      76.0      76.0
11         Blackburn     133.0     133.0
12          Charlton      38.0      38.0
13         Liverpool     285.0     285.0
14        Man United     285.0     285.0
15         Newcastle     247.0     247.0
16         Tottenham     285.0     285.0
17         West Brom     190.0     190.0
18            Bolton     133.0     133.0
19           Chelsea     285.0     285.0
20           Reading      57.0      57.0
21  Sheffield United      38.0      38.0
22           Watford     114.0     114.0
23             D

Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR,HomeApps,AwayApps
15,0.056488,0.067786,0.028244,0.005649,0.079083,0.073435,0.011298,0.016946,0.011298,0.005649,0.000000,0.000000,0.005649,0.000000,1,0.751293,0.643965
332,0.021172,0.039697,0.010586,0.007939,0.029111,0.023818,0.005293,0.013232,0.002646,0.000000,0.000000,0.000000,0.002646,0.002646,0,0.653673,0.754238
184,0.028194,0.047929,0.008458,0.031013,0.039471,0.033832,0.011277,0.008458,0.005639,0.002819,0.000000,0.000000,0.000000,0.008458,-1,0.589249,0.803522
113,0.052564,0.037546,0.026282,0.026282,0.056319,0.056319,0.015018,0.018773,0.007509,0.007509,0.000000,0.000000,0.003755,0.011264,-1,0.927386,0.356687
114,0.070498,0.054832,0.023499,0.015666,0.035249,0.043082,0.027416,0.039166,0.007833,0.003917,0.000000,0.000000,0.007833,0.003917,1,0.967393,0.223245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.031790,0.014129,0.021193,0.007064,0.042387,0.021193,0.017661,0.017661,0.000000,0.003532,0.003532,0.000000,0.000000,0.000000,0,0.671123,0.738235
98,0.051936,0.027205,0.032151,0.009893,0.022258,0.027205,0.019785,0.007419,0.007419,0.007419,0.000000,0.000000,0.000000,0.002473,-1,0.704852,0.704852
339,0.051514,0.023415,0.018732,0.004683,0.042148,0.051514,0.023415,0.042148,0.004683,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.889786,0.444893
70,0.032804,0.027337,0.016402,0.013668,0.035538,0.030071,0.005467,0.005467,0.000000,0.002734,0.000000,0.002734,0.000000,0.000000,0,0.623281,0.779102


Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HTHG,HTAG,HTR,HomeApps,AwayApps
104,0.036563,0.058501,0.007313,0.003656,0.043876,0.040219,0.029251,0.032907,0.003656,0.018282,0.000000,0.000000,0.000000,0.000000,0,0.903109,0.416820
73,0.025959,0.055163,0.006490,0.025959,0.032449,0.035693,0.006490,0.032449,0.006490,0.003245,0.003245,0.000000,0.003245,0.003245,0,0.369914,0.924784
366,0.039624,0.034341,0.018491,0.023775,0.044908,0.034341,0.018491,0.010567,0.010567,0.005283,0.000000,0.000000,0.002642,0.002642,0,0.752865,0.652483
34,0.153109,0.051036,0.087491,0.021873,0.058327,0.043746,0.021873,0.014582,0.014582,0.007291,0.000000,0.000000,0.000000,0.007291,-1,0.692638,0.692638
71,0.069176,0.044964,0.024212,0.010376,0.041506,0.017294,0.020753,0.031129,0.003459,0.006918,0.000000,0.000000,0.003459,0.003459,0,0.985758,0.131434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0.034361,0.031717,0.007929,0.023788,0.039647,0.031717,0.021145,0.015859,0.010572,0.010572,0.000000,0.000000,0.000000,0.005286,-1,0.652850,0.753288
303,0.049400,0.037777,0.040683,0.026153,0.029059,0.043589,0.000000,0.014530,0.005812,0.008718,0.000000,0.000000,0.000000,0.000000,0,0.552123,0.828184
356,0.037086,0.039558,0.017307,0.019779,0.046975,0.029669,0.014834,0.004945,0.002472,0.000000,0.000000,0.000000,0.000000,0.004945,-1,0.704630,0.704630
379,0.060467,0.047510,0.030233,0.021595,0.047510,0.082062,0.017276,0.004319,0.004319,0.008638,0.000000,0.008638,0.004319,0.004319,0,0.410311,0.902684


15     1
332    1
184   -1
113   -1
114    1
      ..
234   -1
98     1
339    0
70     0
64     1
Name: FTR, Length: 4560, dtype: int64

104    1
73    -1
366    0
34     1
71     1
      ..
66    -1
303    1
356   -1
379    1
227   -1
Name: FTR, Length: 1140, dtype: int64

Logistic Regression Training Set (Cross Validation Score): 0.6346491228070176
Logistic Regression Test Set (Cross Validation Score): 0.6210526315789474

SVC Training Set (Cross Validation Score): 0.6361842105263158
SVC Test Set (Cross Validation Score): 0.6149122807017544

KNN Training Set (Cross Validation Score): 0.6271929824561403
KNN Test Set (Cross Validation Score): 0.6105263157894736

RFC Training Set (Cross Validation Score): 0.6447368421052632
RFC Test Set (Cross Validation Score): 0.6254385964912281

DTC Training Set (Cross Validation Score): 0.5432017543859649
DTC Test Set (Cross Validation Score): 0.5271929824561403



## Experiment 4 - Model that incorporates Team Strength through gameplay statistics

In [8]:
# Using table from previous experiment
table = table.assign(HomeStrength = 0, HomeWeakness = 0, AwayStrength = 0, AwayWeakness = 0)


#Work out the average goals scored / conceded over 15 seasons
avgHomeGoalsScored = eplData.FTHG.sum() / (380*15)
avgAwayGoalsScored = eplData.FTAG.sum() / (380*15)
avgHomeGoalsConceded = avgAwayGoalsScored
avgAwayGoalsConceded = avgHomeGoalsScored
print(avgHomeGoalsScored)
print(avgAwayGoalsScored)

grouped_home = eplData.groupby('HomeTeam')
grouped_away = eplData.groupby('AwayTeam')

#Code to check every fixture for every team
#for key, item in grouped_home:
#    print(grouped_home.get_group(key), "\n\n")

# 19 Home Games / 19 Away games each season for 15 seasons = 285 Home Games / 285 Away Games
table.HomeStrength = (grouped_home.FTHG.sum().values / table.HomeApps) / avgHomeGoalsScored
table.AwayStrength = (grouped_away.FTAG.sum().values / table.AwayApps) / avgAwayGoalsScored
table.HomeWeakness = (grouped_home.FTAG.sum().values / table.HomeApps) / avgHomeGoalsConceded
table.AwayWeakness = (grouped_away.FTHG.sum().values / table.AwayApps) / avgAwayGoalsConceded

f_HomeStrength = []
f_HomeWeakness = []
f_AwayStrength = []
f_AwayWeakness = []

# Append new metrics to original dataframe
for index,row in df.iterrows():
    f_HomeStrength.append(table[table['Team'] == row['HomeTeam']]['HomeStrength'].values)
    f_HomeWeakness.append(table[table['Team'] == row['HomeTeam']]['HomeWeakness'].values)
    f_AwayStrength.append(table[table['Team'] == row['AwayTeam']]['AwayStrength'].values)
    f_AwayWeakness.append(table[table['Team'] == row['AwayTeam']]['AwayWeakness'].values)

    
df['HomeStrength'] = f_HomeStrength
df['HomeWeakness'] = f_HomeWeakness
df['AwayStrength'] = f_AwayStrength
df['AwayWeakness'] = f_AwayWeakness

df

# Drop unneeded features
X_exp4 = df.drop(['HomeTeam','AwayTeam','FTHG','FTAG','FTR'],1)

#Drop NaN row manually
X_exp4 = X_exp4.drop([380])

# Normalize data
cols = [['HS', 'AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','HTHG','HTAG','HomeApps','AwayApps']]
for col in cols:
    X_exp4[col] = normalize(X_exp4[col])

#Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_exp4, y_all, 
                                                    test_size = 0.2,
                                                    random_state = 5,
                                                    stratify = y_all)

display(X_train)
display(X_test)
display(y_train)
display(y_test)

# Get Models
clf_1 = LogisticRegression(multi_class='multinomial',random_state = 42, solver='lbfgs', max_iter=3000)
clf_2 = SVC(random_state = 419, kernel='rbf')
clf_3 = KNeighborsClassifier(n_neighbors = 67) # n_neighbors = Sqrt of training data = Sqrt of 5700*0.8
clf_4 = RandomForestClassifier(max_depth=5, random_state=0)
clf_5 = DecisionTreeClassifier(random_state=8)

# Train and Test Models
clf_1.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_1, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_1, X_test, y_test, cv=5)
print("Logistic Regression Training Set (Cross Validation Score):", scores_train.mean())
print("Logistic Regression Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_2.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_2, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_2, X_test, y_test, cv=5)
print("SVC Training Set (Cross Validation Score):", scores_train.mean())
print("SVC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_3.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_3, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_3, X_test, y_test, cv=5)
print("KNN Training Set (Cross Validation Score):", scores_train.mean())
print("KNN Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_4.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_4, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_4, X_test, y_test, cv=5)
print("RFC Training Set (Cross Validation Score):", scores_train.mean())
print("RFC Test Set (Cross Validation Score):", scores_test.mean())
print("")

clf_5.fit(X_train, y_train)
# Calculate CV scores
scores_train = cross_val_score(clf_5, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_5, X_test, y_test, cv=5)
print("DTC Training Set (Cross Validation Score):", scores_train.mean())
print("DTC Test Set (Cross Validation Score):", scores_test.mean())
print("")


1.5375438596491229
1.1475438596491228


Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,...,AR,HTHG,HTAG,HTR,HomeApps,AwayApps,HomeStrength,HomeWeakness,AwayStrength,AwayWeakness
15,0.056488,0.067786,0.028244,0.005649,0.079083,0.073435,0.011298,0.016946,0.011298,0.005649,...,0.000000,0.005649,0.000000,1,0.751293,0.643965,[0.8753341592227946],[1.1138532771310634],[0.7949854762268767],[1.163852122318576]
332,0.021172,0.039697,0.010586,0.007939,0.029111,0.023818,0.005293,0.013232,0.002646,0.000000,...,0.000000,0.002646,0.002646,0,0.653673,0.754238,[0.8952708633219815],[1.0831089106582152],[1.3851093105029813],[0.8512094933820173]
184,0.028194,0.047929,0.008458,0.031013,0.039471,0.033832,0.011277,0.008458,0.005639,0.002819,...,0.000000,0.000000,0.008458,-1,0.589249,0.803522,[0.7312974565370731],[1.138274659143028],[1.4218009478672986],[0.6846188954815152]
113,0.052564,0.037546,0.026282,0.026282,0.056319,0.056319,0.015018,0.018773,0.007509,0.007509,...,0.000000,0.003755,0.011264,-1,0.927386,0.356687,[0.8952708633219815],[1.0831089106582152],[0.7613514753095858],[1.0474669100867182]
114,0.070498,0.054832,0.023499,0.015666,0.035249,0.043082,0.027416,0.039166,0.007833,0.003917,...,0.000000,0.007833,0.003917,1,0.967393,0.223245,[0.8952708633219815],[1.0831089106582152],[0.687968200580951],[1.004107713372889]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.031790,0.014129,0.021193,0.007064,0.042387,0.021193,0.017661,0.017661,0.000000,0.003532,...,0.000000,0.000000,0.000000,0,0.671123,0.738235,[0.848927430397079],[0.9769148448249503],[0.7880363024836347],[1.135844985685241]
98,0.051936,0.027205,0.032151,0.009893,0.022258,0.027205,0.019785,0.007419,0.007419,0.007419,...,0.000000,0.000000,0.002473,-1,0.704852,0.704852,[1.3806481058877225],[0.6604494725577129],[1.2903225806451613],[0.8922866271109082]
339,0.051514,0.023415,0.018732,0.004683,0.042148,0.051514,0.023415,0.042148,0.004683,0.000000,...,0.000000,0.000000,0.000000,0,0.889786,0.444893,[0.9105431309904153],[1.164959486317077],[0.9081180247668552],[1.0611592879963487]
70,0.032804,0.027337,0.016402,0.013668,0.035538,0.030071,0.005467,0.005467,0.000000,0.002734,...,0.002734,0.000000,0.000000,0,0.623281,0.779102,[0.7844591510725696],[1.0854609386943894],[1.2903225806451613],[0.8922866271109082]


Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,...,AR,HTHG,HTAG,HTR,HomeApps,AwayApps,HomeStrength,HomeWeakness,AwayStrength,AwayWeakness
104,0.036563,0.058501,0.007313,0.003656,0.043876,0.040219,0.029251,0.032907,0.003656,0.018282,...,0.000000,0.000000,0.000000,0,0.903109,0.416820,[0.8952708633219815],[1.0831089106582152],[0.7796972939917444],[1.169557279780922]
73,0.025959,0.055163,0.006490,0.025959,0.032449,0.035693,0.006490,0.032449,0.006490,0.003245,...,0.000000,0.003245,0.003245,0,0.369914,0.924784,[1.0155180282975809],[0.9478672985781992],[0.9570402079192785],[0.8557736193518941]
366,0.039624,0.034341,0.018491,0.023775,0.044908,0.034341,0.018491,0.010567,0.010567,0.005283,...,0.000000,0.002642,0.002642,0,0.752865,0.652483,[1.3852122318575992],[0.6818529276868981],[0.8220337986428798],[1.1111891303584593]
34,0.153109,0.051036,0.087491,0.021873,0.058327,0.043746,0.021873,0.014582,0.014582,0.007291,...,0.000000,0.000000,0.007291,-1,0.692638,0.692638,[0.8146964856230031],[1.0457116648830453],[0.6421036538755542],[1.0269283432222729]
71,0.069176,0.044964,0.024212,0.010376,0.041506,0.017294,0.020753,0.031129,0.003459,0.006918,...,0.000000,0.003459,0.003459,0,0.985758,0.131434,[1.3852122318575992],[0.6818529276868981],[0.5733068338174592],[1.1980830670926517]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0.034361,0.031717,0.007929,0.023788,0.039647,0.031717,0.021145,0.015859,0.010572,0.010572,...,0.000000,0.000000,0.005286,-1,0.652850,0.753288,[0.8952708633219815],[1.0831089106582152],[1.4218009478672986],[0.6846188954815152]
303,0.049400,0.037777,0.040683,0.026153,0.029059,0.043589,0.000000,0.014530,0.005812,0.008718,...,0.000000,0.000000,0.000000,0,0.552123,0.828184,[0.8420812414422638],[1.2108240330224738],[1.3545329460327167],[0.766773162939297]
356,0.037086,0.039558,0.017307,0.019779,0.046975,0.029669,0.014834,0.004945,0.002472,0.000000,...,0.000000,0.000000,0.004945,-1,0.704630,0.704630,[1.3806481058877225],[0.7277174743922948],[1.3942822198440608],[0.6823368324965768]
379,0.060467,0.047510,0.030233,0.021595,0.047510,0.082062,0.017276,0.004319,0.004319,0.008638,...,0.008638,0.004319,0.004319,0,0.410311,0.902684,[0.8010041077133729],[1.2383427610457116],[0.7880363024836347],[1.135844985685241]


15     1
332    1
184   -1
113   -1
114    1
      ..
234   -1
98     1
339    0
70     0
64     1
Name: FTR, Length: 4560, dtype: int64

104    1
73    -1
366    0
34     1
71     1
      ..
66    -1
303    1
356   -1
379    1
227   -1
Name: FTR, Length: 1140, dtype: int64

Logistic Regression Training Set (Cross Validation Score): 0.6432017543859649
Logistic Regression Test Set (Cross Validation Score): 0.6350877192982457

SVC Training Set (Cross Validation Score): 0.6379385964912281
SVC Test Set (Cross Validation Score): 0.6096491228070176

KNN Training Set (Cross Validation Score): 0.6366228070175438
KNN Test Set (Cross Validation Score): 0.6324561403508773

RFC Training Set (Cross Validation Score): 0.6473684210526317
RFC Test Set (Cross Validation Score): 0.6263157894736843

DTC Training Set (Cross Validation Score): 0.5581140350877193
DTC Test Set (Cross Validation Score): 0.5508771929824562



## Experiment 5 - Optimisation

In [12]:
## Optimising KNN
from sklearn.model_selection import GridSearchCV

# Defining parameter range 
param_grid = [{'weights': ['uniform', 'distance'],  
              'n_neighbors': [23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]

clf_3_opt = GridSearchCV(clf_3, param_grid, refit = True, verbose = 1, n_jobs=-1) 
   
# Fitting the model for grid search 
clf_3_opt.fit(X_train, y_train) 
 
# Print best parameters
print(clf_3_opt.best_params_)

# Calculate CV scores
scores_train = cross_val_score(clf_3_opt, X_train, y_train, cv=5)
scores_test = cross_val_score(clf_3_opt, X_test, y_test, cv=5)
print("KNN Training Set (Cross Validation Score):", scores_train.mean())
print("KNN Test Set (Cross Validation Score):", scores_test.mean())
print("")



Fitting 5 folds for each of 112 candidates, totalling 560 fits
{'algorithm': 'auto', 'n_neighbors': 67, 'weights': 'uniform'}
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
KNN Training Set (Cross Validation Score): 0.6346491228070176
KNN Test Set (Cross Validation Score): 0.6324561403508773

