In [1]:
%matplotlib inline
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm


In [2]:
# Load the data
# Statistics of each game for 10 seasons
file_path = './epl_game_data/epl-allseasons-matchstats.csv'
raw_df = pd.read_csv(file_path)
raw_df


Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeYellowCards,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards
0,2010/11,2010-08-14,M Dean,Aston Villa,West Ham,HomeWin,HomeWin,3,2,23,...,1,0,0,0,12,2,7,15,2,0
1,2010/11,2010-08-14,P Dowd,Blackburn,Everton,HomeWin,HomeWin,1,1,7,...,2,0,0,0,17,12,3,14,1,0
2,2010/11,2010-08-14,S Attwell,Bolton,Fulham,Draw,Draw,0,0,13,...,1,0,0,0,12,7,8,13,3,0
3,2010/11,2010-08-14,M Clattenburg,Chelsea,West Brom,HomeWin,HomeWin,6,2,18,...,1,0,0,0,10,4,1,10,0,0
4,2010/11,2010-08-14,A Taylor,Sunderland,Birmingham,Draw,HomeWin,2,1,6,...,3,1,2,0,13,7,6,10,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2019/20,2020-07-26,M Atkinson,Leicester,Man Utd,AwayWin,Draw,0,0,14,...,1,1,2,0,7,3,3,11,4,0
3796,2019/20,2020-07-26,C Pawson,Man City,Norwich,HomeWin,HomeWin,5,2,31,...,1,0,0,0,5,4,0,4,1,0
3797,2019/20,2020-07-26,A Taylor,Newcastle,Liverpool,AwayWin,Draw,1,1,3,...,1,0,3,1,14,6,4,5,0,0
3798,2019/20,2020-07-26,P Bankes,Southampton,Sheffield Utd,HomeWin,AwayWin,3,0,13,...,0,0,1,1,5,3,1,16,1,0


In [3]:
# Check the column names
print(list(raw_df.columns))

['Season', 'Date', 'Referee', 'HomeTeam', 'AwayTeam', 'FullTime', 'Halftime', 'HomeGoals', 'HomeGoalsHalfTime', 'HomeShots', 'HomeShotsOnTarget', 'HomeCorners', 'HomeFouls', 'HomeYellowCards', 'HomeRedCards', 'AwayGoals', 'AwayGoalsHalfTime', 'AwayShots', 'AwayShotsOnTarget', 'AwayCorners', 'AwayFouls', 'AwayYellowCards', 'AwayRedCards']


In [4]:
# Inspect the data types
raw_df.dtypes

Season               object
Date                 object
Referee              object
HomeTeam             object
AwayTeam             object
FullTime             object
Halftime             object
HomeGoals             int64
HomeGoalsHalfTime     int64
HomeShots             int64
HomeShotsOnTarget     int64
HomeCorners           int64
HomeFouls             int64
HomeYellowCards       int64
HomeRedCards          int64
AwayGoals             int64
AwayGoalsHalfTime     int64
AwayShots             int64
AwayShotsOnTarget     int64
AwayCorners           int64
AwayFouls             int64
AwayYellowCards       int64
AwayRedCards          int64
dtype: object

In [5]:
#Function to transform FTR into numeric data type
def transformResult(row):
    if(row.FullTime == 'HomeWin'):
        return 1
    elif(row.FullTime == 'AwayWin'):
        return -1
    else:
        return 0

raw_df['Result'] = raw_df.apply(lambda row: transformResult(row),axis=1)
raw_df

Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards,Result
0,2010/11,2010-08-14,M Dean,Aston Villa,West Ham,HomeWin,HomeWin,3,2,23,...,0,0,0,12,2,7,15,2,0,1
1,2010/11,2010-08-14,P Dowd,Blackburn,Everton,HomeWin,HomeWin,1,1,7,...,0,0,0,17,12,3,14,1,0,1
2,2010/11,2010-08-14,S Attwell,Bolton,Fulham,Draw,Draw,0,0,13,...,0,0,0,12,7,8,13,3,0,0
3,2010/11,2010-08-14,M Clattenburg,Chelsea,West Brom,HomeWin,HomeWin,6,2,18,...,0,0,0,10,4,1,10,0,0,1
4,2010/11,2010-08-14,A Taylor,Sunderland,Birmingham,Draw,HomeWin,2,1,6,...,1,2,0,13,7,6,10,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2019/20,2020-07-26,M Atkinson,Leicester,Man Utd,AwayWin,Draw,0,0,14,...,1,2,0,7,3,3,11,4,0,-1
3796,2019/20,2020-07-26,C Pawson,Man City,Norwich,HomeWin,HomeWin,5,2,31,...,0,0,0,5,4,0,4,1,0,1
3797,2019/20,2020-07-26,A Taylor,Newcastle,Liverpool,AwayWin,Draw,1,1,3,...,0,3,1,14,6,4,5,0,0,-1
3798,2019/20,2020-07-26,P Bankes,Southampton,Sheffield Utd,HomeWin,AwayWin,3,0,13,...,0,1,1,5,3,1,16,1,0,1


In [6]:
# Break up the df so each season becomes an individual dataframe.
# Seasons: '2010/11' | '2011/12' | '2012/13' | '2013/14' | '2014/15' | '2015/16' | '2016/17' | '2017/18' | '2018/19'
#          '2019/20'

df_2010 = raw_df[(raw_df['Season'] == '2010/11')].reset_index(drop=True)
df_2011 = raw_df[(raw_df['Season'] == '2011/12')].reset_index(drop=True)
df_2012 = raw_df[(raw_df['Season'] == '2012/13')].reset_index(drop=True)
df_2013 = raw_df[(raw_df['Season'] == '2013/14')].reset_index(drop=True)
df_2014 = raw_df[(raw_df['Season'] == '2014/15')].reset_index(drop=True)
df_2015 = raw_df[(raw_df['Season'] == '2015/16')].reset_index(drop=True)
df_2016 = raw_df[(raw_df['Season'] == '2016/17')].reset_index(drop=True)
df_2017 = raw_df[(raw_df['Season'] == '2017/18')].reset_index(drop=True)
df_2018 = raw_df[(raw_df['Season'] == '2018/19')].reset_index(drop=True)
df_2019 = raw_df[(raw_df['Season'] == '2019/20')].reset_index(drop=True)

seasons = [df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019]


In [7]:
df_2019.head()

Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards,Result
0,2019/20,2019-08-09,M Oliver,Liverpool,Norwich,HomeWin,HomeWin,4,4,15,...,0,1,0,12,5,2,9,2,0,1
1,2019/20,2019-08-10,M Dean,West Ham,Man City,AwayWin,AwayWin,0,0,5,...,0,5,1,14,9,1,13,2,0,-1
2,2019/20,2019-08-10,K Friend,Bournemouth,Sheffield Utd,Draw,Draw,1,0,13,...,0,1,0,8,3,4,19,1,0,0
3,2019/20,2019-08-10,G Scott,Burnley,Southampton,HomeWin,Draw,3,0,10,...,0,0,0,11,3,7,12,0,0,1
4,2019/20,2019-08-10,J Moss,Crystal Palace,Everton,Draw,Draw,0,0,6,...,0,0,0,10,3,2,14,1,1,0


In [8]:
# Get each team's offensive/defensive capabilities per season.
def get_features_per_season(data_now):
    table_new = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))
    
    # Calculate each team's avg home and away goals per season - 38 Games in a season
    # times the number of seasons 38*10 = 380.0
    avg_home_scored = data_now.HomeGoals.sum() / 380.0
    avg_away_scored = data_now.AwayGoals.sum() / 380.0

    avg_home_conceded = avg_away_scored
    avg_away_conceded = avg_home_scored
    
    
    res_home = data_now.groupby('HomeTeam')
    res_away = data_now.groupby('AwayTeam')
    
    table_new.Team = list(res_home.groups.keys())
    table_new.HGS = res_home.HomeGoals.sum().values
    table_new.HGC = res_home.AwayGoals.sum().values
    table_new.AGS = res_away.AwayGoals.sum().values
    table_new.AGC = res_away.HomeGoals.sum().values
    
    #19 Home matches for each team each season
    table_new.HAS = (table_new.HGS / 19.0) / avg_home_scored
    table_new.AAS = (table_new.AGS / 19.0) / avg_away_scored
    table_new.HDS = (table_new.HGC / 19.0) / avg_home_conceded
    table_new.ADS = (table_new.AGC / 19.0) / avg_away_conceded
    return table_new

get_features_per_season(raw_df)

Unnamed: 0,Team,HGS,AGS,HAS,AAS,HGC,AGC,HDS,ADS
0,Arsenal,398,304,1.349381,1.340979,167,269,0.736656,0.912019
1,Aston Villa,145,125,0.491609,0.55139,191,251,0.842523,0.850992
2,Birmingham,19,18,0.064418,0.0794,22,36,0.097045,0.122055
3,Blackburn,48,46,0.162739,0.202911,49,88,0.216145,0.298356
4,Blackpool,30,25,0.101712,0.110278,37,41,0.163211,0.139007
5,Bolton,57,41,0.193253,0.180856,63,70,0.2779,0.237328
6,Bournemouth,136,105,0.461095,0.463167,148,182,0.652845,0.617054
7,Brighton,63,45,0.213596,0.1985,80,88,0.352889,0.298356
8,Burnley,104,87,0.352602,0.383767,113,152,0.498456,0.515342
9,Cardiff,41,25,0.139007,0.110278,73,70,0.322011,0.237328


In [9]:
map_features = map(get_features_per_season, seasons) 
list_tables = list(map_features)


In [10]:
# Calculate for the most recent season.
import math
feature_table = raw_df.iloc[:,:24]
feature_table = feature_table[['HomeTeam','AwayTeam','FullTime','HomeShotsOnTarget','AwayShotsOnTarget',
                               'HomeCorners','AwayCorners', 'HomeFouls', 'AwayFouls','HomeGoalsHalfTime',
                               'AwayGoalsHalfTime', 'Result']]
def construct_X_per_match(table, match):
        HAS = table[table['Team'] == match['HomeTeam']]['HAS'].values[0]
        HDS = table[table['Team'] == match['HomeTeam']]['HDS'].values[0]
        AAS = table[table['Team'] == match['AwayTeam']]['AAS'].values[0]
        ADS = table[table['Team'] == match['AwayTeam']]['AAS'].values[0]
        return HAS, HDS, AAS, ADS

In [11]:
# 'HomeTeam','AwayTeam','FullTime','HomeShotsOnTarget','AwayShotsOnTarget',
# 'HomeCorners','AwayCorners', 'HomeFouls', 'AwayFouls','HomeGoalsHalfTime',
# 'AwayGoalsHalfTime'
feature_table = feature_table[['HomeTeam','AwayTeam','FullTime','HomeShotsOnTarget','AwayShotsOnTarget',
                               'HomeCorners','AwayCorners', 'HomeFouls', 'AwayFouls','HomeGoalsHalfTime',
                               'AwayGoalsHalfTime', 'Result']]

# Home Attacking Strength(HAS), Home Defensive Strength(HDS), Away Attacking Strength(AAS),
# Away Defensive Strength(ADS)
f_HAS = []
f_HDS = []
f_AAS = []
f_ADS = []
for index,row in feature_table.iterrows():
    table_ix = math.floor(index/380)
    table = list_tables[table_ix]
    f_HAS.append(table[table['Team'] == row['HomeTeam']]['HAS'].values[0])
    f_HDS.append(table[table['Team'] == row['HomeTeam']]['HDS'].values[0])
    f_AAS.append(table[table['Team'] == row['AwayTeam']]['AAS'].values[0])
    f_ADS.append(table[table['Team'] == row['AwayTeam']]['ADS'].values[0])

feature_table['HomeAttackingStrength'] = f_HAS
feature_table['HomeDefensiveStrength'] = f_HDS
feature_table['AwayAttackingStrength'] = f_AAS
feature_table['AwayDefensiveStrength'] = f_ADS
feature_table

Unnamed: 0,HomeTeam,AwayTeam,FullTime,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeGoalsHalfTime,AwayGoalsHalfTime,Result,HomeAttackingStrength,HomeDefensiveStrength,AwayAttackingStrength,AwayDefensiveStrength
0,Aston Villa,West Ham,HomeWin,11,2,16,7,15,15,2,0,1,0.842788,0.852018,0.852018,1.264182
1,Blackburn,Everton,HomeWin,2,12,1,3,19,14,1,0,1,0.713128,0.717489,0.896861,0.713128
2,Bolton,Fulham,Draw,9,7,4,8,12,13,0,0,0,1.102107,1.076233,0.852018,0.648298
3,Chelsea,West Brom,HomeWin,13,4,3,1,10,10,2,0,1,1.264182,0.582960,1.165919,1.329011
4,Sunderland,Birmingham,Draw,2,7,3,6,13,10,1,0,0,0.810373,1.210762,0.807175,1.166937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,Leicester,Man Utd,AwayWin,3,3,3,3,12,11,0,0,-1,1.215278,0.742358,1.135371,0.659722
3796,Man City,Norwich,HomeWin,10,4,9,0,7,4,2,0,1,1.979167,0.567686,0.305677,1.319444
3797,Newcastle,Liverpool,AwayWin,2,6,2,4,11,5,1,1,-1,0.694444,0.917031,1.441048,0.590278
3798,Southampton,Sheffield Utd,HomeWin,4,3,9,1,9,16,0,1,1,0.729167,1.528384,0.655022,0.833333


In [12]:
# Set up the dataframe we will use to run our model on - we will need to take out the String Columns
# Home Team, Away Team, and Full Time.
rm_col_list = ['HomeTeam','AwayTeam', 'FullTime']
col_list = feature_table.columns.values.tolist()
col_list = [x for x in col_list if x not in rm_col_list]

to_use = feature_table[col_list]
X = [x for x in col_list if x != 'Result']
Y = [x for x in col_list if x == 'Result']

# Adjust the dataframe so the Results are at the end.
to_use = to_use[['HomeShotsOnTarget','AwayShotsOnTarget','HomeCorners','AwayCorners','HomeFouls','AwayFouls',
                 'HomeGoalsHalfTime','AwayGoalsHalfTime','HomeAttackingStrength','HomeDefensiveStrength',
                 'AwayAttackingStrength','AwayDefensiveStrength','Result']]
to_use

Unnamed: 0,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeGoalsHalfTime,AwayGoalsHalfTime,HomeAttackingStrength,HomeDefensiveStrength,AwayAttackingStrength,AwayDefensiveStrength,Result
0,11,2,16,7,15,15,2,0,0.842788,0.852018,0.852018,1.264182,1
1,2,12,1,3,19,14,1,0,0.713128,0.717489,0.896861,0.713128,1
2,9,7,4,8,12,13,0,0,1.102107,1.076233,0.852018,0.648298,0
3,13,4,3,1,10,10,2,0,1.264182,0.582960,1.165919,1.329011,1
4,2,7,3,6,13,10,1,0,0.810373,1.210762,0.807175,1.166937,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,3,3,3,3,12,11,0,0,1.215278,0.742358,1.135371,0.659722,-1
3796,10,4,9,0,7,4,2,0,1.979167,0.567686,0.305677,1.319444,1
3797,2,6,2,4,11,5,1,1,0.694444,0.917031,1.441048,0.590278,-1
3798,4,3,9,1,9,16,0,1,0.729167,1.528384,0.655022,0.833333,1


In [13]:
# Set up the X and y for our model and instantiate and fit our model.
X = to_use.drop('Result', axis=1)
Y = to_use['Result']

model = RandomForestClassifier()
model.fit(X, Y)
model.score(X, Y)

1.0

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model1 = RandomForestClassifier()
model1.fit(X_train, Y_train)

RandomForestClassifier()

In [15]:
predicted = model1.predict(X_test)
predicted

array([-1,  1,  1,  1,  1,  1,  1,  0, -1, -1, -1, -1,  1,  0,  1,  0,  1,
       -1,  1,  1, -1, -1, -1,  1,  0, -1,  0, -1,  0,  1,  1,  1, -1,  0,
        1, -1,  0, -1, -1,  1,  1,  0, -1,  1,  0,  1, -1,  1,  0,  1,  1,
        1, -1,  1, -1,  0,  0, -1,  1, -1,  0,  1,  1,  1, -1,  1,  0, -1,
        1,  1,  0, -1,  1,  1,  1,  1,  0,  0, -1,  1,  1, -1,  1,  1, -1,
        1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  0,  0, -1, -1, -1, -1,
        0, -1, -1,  1, -1,  1, -1,  1,  1,  1,  0, -1,  1,  1,  1, -1,  0,
       -1, -1, -1,  1,  1, -1, -1,  1,  1,  0,  1,  0,  0,  1,  0,  1,  1,
       -1,  1, -1, -1,  0,  0, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  0,  1, -1, -1,  0, -1,  1,  0,  1,  1,  1, -1,  1,  1,  1, -1,
        1,  1,  1, -1,  1,  1,  0,  1,  1,  0,  1, -1, -1, -1, -1,  1,  1,
        0,  0,  1,  1, -1,  1,  1,  1,  1,  1,  0,  1,  0,  0,  1, -1, -1,
       -1, -1,  1,  0,  1,  1, -1,  1,  1, -1,  1,  0, -1,  1,  1,  1,  1,
        0,  0, -1,  1,  1

In [16]:
acc_score = accuracy_score(Y_test, predicted)
print("Accuracy score: " + str(acc_score))
scores = cross_val_score(model1, X_train, Y_train, cv=10)
print("Cross val scores")
print(scores)

Accuracy score: 0.6368421052631579
Cross val scores
[0.6875     0.66447368 0.64144737 0.68092105 0.61513158 0.63486842
 0.65131579 0.63815789 0.66447368 0.59868421]


In [17]:
score_train = model1.score(X_train, Y_train)
score_test = model1.score(X_test, Y_test)

print("Training set accuracy: ", '%.3f'%(score_train))
print("Test set accuracy: ", '%.3f'%(score_test))

Training set accuracy:  1.000
Test set accuracy:  0.637


In [18]:
predicted = model1.predict(X)
predicted
predictedDF = pd.DataFrame(predicted, columns=['Predicted'])
predictedDF.head()

Unnamed: 0,Predicted
0,1
1,1
2,0
3,1
4,0


In [19]:
to_use = to_use.merge(predictedDF, left_index=True, right_index=True)
feature_table = feature_table.merge(predictedDF, left_index=True, right_index=True)
feature_table

Unnamed: 0,HomeTeam,AwayTeam,FullTime,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeGoalsHalfTime,AwayGoalsHalfTime,Result,HomeAttackingStrength,HomeDefensiveStrength,AwayAttackingStrength,AwayDefensiveStrength,Predicted
0,Aston Villa,West Ham,HomeWin,11,2,16,7,15,15,2,0,1,0.842788,0.852018,0.852018,1.264182,1
1,Blackburn,Everton,HomeWin,2,12,1,3,19,14,1,0,1,0.713128,0.717489,0.896861,0.713128,1
2,Bolton,Fulham,Draw,9,7,4,8,12,13,0,0,0,1.102107,1.076233,0.852018,0.648298,0
3,Chelsea,West Brom,HomeWin,13,4,3,1,10,10,2,0,1,1.264182,0.582960,1.165919,1.329011,1
4,Sunderland,Birmingham,Draw,2,7,3,6,13,10,1,0,0,0.810373,1.210762,0.807175,1.166937,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,Leicester,Man Utd,AwayWin,3,3,3,3,12,11,0,0,-1,1.215278,0.742358,1.135371,0.659722,-1
3796,Man City,Norwich,HomeWin,10,4,9,0,7,4,2,0,1,1.979167,0.567686,0.305677,1.319444,1
3797,Newcastle,Liverpool,AwayWin,2,6,2,4,11,5,1,1,-1,0.694444,0.917031,1.441048,0.590278,-1
3798,Southampton,Sheffield Utd,HomeWin,4,3,9,1,9,16,0,1,1,0.729167,1.528384,0.655022,0.833333,1
