In [1]:
%matplotlib inline
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm

In [2]:
# Load the data
# Statistics of each game for 10 seasons
file_path = './epl_game_data/epl-allseasons-matchstats.csv'
raw_df = pd.read_csv(file_path)
raw_df

Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeYellowCards,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards
0,2010/11,2010-08-14,M Dean,Aston Villa,West Ham,HomeWin,HomeWin,3,2,23,...,1,0,0,0,12,2,7,15,2,0
1,2010/11,2010-08-14,P Dowd,Blackburn,Everton,HomeWin,HomeWin,1,1,7,...,2,0,0,0,17,12,3,14,1,0
2,2010/11,2010-08-14,S Attwell,Bolton,Fulham,Draw,Draw,0,0,13,...,1,0,0,0,12,7,8,13,3,0
3,2010/11,2010-08-14,M Clattenburg,Chelsea,West Brom,HomeWin,HomeWin,6,2,18,...,1,0,0,0,10,4,1,10,0,0
4,2010/11,2010-08-14,A Taylor,Sunderland,Birmingham,Draw,HomeWin,2,1,6,...,3,1,2,0,13,7,6,10,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2019/20,2020-07-26,M Atkinson,Leicester,Man Utd,AwayWin,Draw,0,0,14,...,1,1,2,0,7,3,3,11,4,0
3796,2019/20,2020-07-26,C Pawson,Man City,Norwich,HomeWin,HomeWin,5,2,31,...,1,0,0,0,5,4,0,4,1,0
3797,2019/20,2020-07-26,A Taylor,Newcastle,Liverpool,AwayWin,Draw,1,1,3,...,1,0,3,1,14,6,4,5,0,0
3798,2019/20,2020-07-26,P Bankes,Southampton,Sheffield Utd,HomeWin,AwayWin,3,0,13,...,0,0,1,1,5,3,1,16,1,0


In [3]:
# Function to transform Full Time Results (FTR) into numeric data type
def transformResult(row):
    if(row.FullTime == 'HomeWin'):
        return 1
    elif(row.FullTime == 'AwayWin'):
        return -1
    else:
        return 0

raw_df['Result'] = raw_df.apply(lambda row: transformResult(row),axis=1)

# Check that it worked.
raw_df

Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards,Result
0,2010/11,2010-08-14,M Dean,Aston Villa,West Ham,HomeWin,HomeWin,3,2,23,...,0,0,0,12,2,7,15,2,0,1
1,2010/11,2010-08-14,P Dowd,Blackburn,Everton,HomeWin,HomeWin,1,1,7,...,0,0,0,17,12,3,14,1,0,1
2,2010/11,2010-08-14,S Attwell,Bolton,Fulham,Draw,Draw,0,0,13,...,0,0,0,12,7,8,13,3,0,0
3,2010/11,2010-08-14,M Clattenburg,Chelsea,West Brom,HomeWin,HomeWin,6,2,18,...,0,0,0,10,4,1,10,0,0,1
4,2010/11,2010-08-14,A Taylor,Sunderland,Birmingham,Draw,HomeWin,2,1,6,...,1,2,0,13,7,6,10,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2019/20,2020-07-26,M Atkinson,Leicester,Man Utd,AwayWin,Draw,0,0,14,...,1,2,0,7,3,3,11,4,0,-1
3796,2019/20,2020-07-26,C Pawson,Man City,Norwich,HomeWin,HomeWin,5,2,31,...,0,0,0,5,4,0,4,1,0,1
3797,2019/20,2020-07-26,A Taylor,Newcastle,Liverpool,AwayWin,Draw,1,1,3,...,0,3,1,14,6,4,5,0,0,-1
3798,2019/20,2020-07-26,P Bankes,Southampton,Sheffield Utd,HomeWin,AwayWin,3,0,13,...,0,1,1,5,3,1,16,1,0,1


In [6]:
# Break up the df so each season becomes an individual dataframe.
# Seasons: '2010/11' | '2011/12' | '2012/13' | '2013/14' | '2014/15' | '2015/16' | '2016/17' | '2017/18' | '2018/19'
#          '2019/20'

df_2010 = raw_df[(raw_df['Season'] == '2010/11')].reset_index(drop=True)
df_2011 = raw_df[(raw_df['Season'] == '2011/12')].reset_index(drop=True)
df_2012 = raw_df[(raw_df['Season'] == '2012/13')].reset_index(drop=True)
df_2013 = raw_df[(raw_df['Season'] == '2013/14')].reset_index(drop=True)
df_2014 = raw_df[(raw_df['Season'] == '2014/15')].reset_index(drop=True)
df_2015 = raw_df[(raw_df['Season'] == '2015/16')].reset_index(drop=True)
df_2016 = raw_df[(raw_df['Season'] == '2016/17')].reset_index(drop=True)
df_2017 = raw_df[(raw_df['Season'] == '2017/18')].reset_index(drop=True)
df_2018 = raw_df[(raw_df['Season'] == '2018/19')].reset_index(drop=True)
df_2019 = raw_df[(raw_df['Season'] == '2019/20')].reset_index(drop=True)

# Create an array of dataframes where each DF is a season.
seasons = [df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019]

# Since we will be using the most reason season to test, we want to take a look at the dataframe.
df_2019.head(10)

Unnamed: 0,Season,Date,Referee,HomeTeam,AwayTeam,FullTime,Halftime,HomeGoals,HomeGoalsHalfTime,HomeShots,...,HomeRedCards,AwayGoals,AwayGoalsHalfTime,AwayShots,AwayShotsOnTarget,AwayCorners,AwayFouls,AwayYellowCards,AwayRedCards,Result
0,2019/20,2019-08-09,M Oliver,Liverpool,Norwich,HomeWin,HomeWin,4,4,15,...,0,1,0,12,5,2,9,2,0,1
1,2019/20,2019-08-10,M Dean,West Ham,Man City,AwayWin,AwayWin,0,0,5,...,0,5,1,14,9,1,13,2,0,-1
2,2019/20,2019-08-10,K Friend,Bournemouth,Sheffield Utd,Draw,Draw,1,0,13,...,0,1,0,8,3,4,19,1,0,0
3,2019/20,2019-08-10,G Scott,Burnley,Southampton,HomeWin,Draw,3,0,10,...,0,0,0,11,3,7,12,0,0,1
4,2019/20,2019-08-10,J Moss,Crystal Palace,Everton,Draw,Draw,0,0,6,...,0,0,0,10,3,2,14,1,1,0
5,2019/20,2019-08-10,C Pawson,Watford,Brighton,AwayWin,AwayWin,0,0,11,...,0,3,1,5,3,2,11,1,0,-1
6,2019/20,2019-08-10,C Kavanagh,Tottenham,Aston Villa,HomeWin,AwayWin,3,0,31,...,0,1,1,7,4,0,9,0,0,1
7,2019/20,2019-08-11,A Marriner,Leicester,Wolves,Draw,Draw,0,0,15,...,0,0,0,8,2,3,13,2,0,0
8,2019/20,2019-08-11,M Atkinson,Newcastle,Arsenal,AwayWin,Draw,0,0,9,...,0,1,0,8,2,3,7,3,0,-1
9,2019/20,2019-08-11,A Taylor,Man Utd,Chelsea,HomeWin,HomeWin,4,1,11,...,0,0,0,18,7,5,13,4,0,1


In [35]:
# Get each team's offensive/defensive capabilities per season.
def get_features_per_season(data_now):
    table_new = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))
    
    home_data = data_now.groupby('HomeTeam')
    away_data = data_now.groupby('AwayTeam')
    
    table_new.Team = list(home_data.groups.keys())
    table_new.HGS = home_data.HomeGoals.sum().values
    table_new.HGC = home_data.AwayGoals.sum().values
    table_new.AGS = away_data.AwayGoals.sum().values
    table_new.AGC = away_data.HomeGoals.sum().values
    
    #19 home matches and 19 away matches for each team each season
    table_new.HAS = (table_new.HGS / 19.0)  # avg_home_scored
    table_new.AAS = (table_new.AGS / 19.0)  # avg_away_scored
    table_new.HDS = (table_new.HGC / 19.0)  # avg_home_conceded
    table_new.ADS = (table_new.AGC / 19.0)  # avg_away_conceded
    return table_new

# create a team dataframe using our function to calculate strengths and averages for home and away.
team_data = get_features_per_season(df_2019)
# Set the team as the index (this is used to merge dataframes in the next step).
team_data.set_index('Team', inplace=True)
# Create an empty team dataframe to use at the end, in order to get a final Premier League table.
last_season_table = pd.DataFrame(index=team_data.index)
# Scale the data.
scaler = MinMaxScaler()
# Fit the data to the scaler and show the final form.
team_data = pd.DataFrame(scaler.fit_transform(team_data), columns=team_data.columns, index=team_data.index)
team_data

Unnamed: 0_level_0,HGS,AGS,HAS,AAS,HGC,AGC,HDS,ADS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arsenal,0.5,0.342105,0.5,0.342105,0.458333,0.333333,0.458333,0.333333
Aston Villa,0.166667,0.315789,0.166667,0.315789,0.708333,0.952381,0.708333,0.952381
Bournemouth,0.166667,0.289474,0.166667,0.289474,0.708333,0.857143,0.708333,0.857143
Brighton,0.119048,0.315789,0.119048,0.315789,0.583333,0.47619,0.583333,0.47619
Burnley,0.214286,0.315789,0.214286,0.315789,0.416667,0.47619,0.416667,0.47619
Chelsea,0.357143,0.842105,0.357143,0.842105,0.125,1.0,0.125,1.0
Crystal Palace,0.0,0.236842,0.0,0.236842,0.291667,0.619048,0.291667,0.619048
Everton,0.214286,0.342105,0.214286,0.342105,0.333333,0.857143,0.333333,0.857143
Leicester,0.47619,0.657895,0.47619,0.657895,0.166667,0.333333,0.166667,0.333333
Liverpool,0.880952,0.684211,0.880952,0.684211,0.125,0.0,0.125,0.0


In [36]:
# Create a function that ties the HomeAttackingStrength(HAS), AwayAttackingStrenght(AAS),
# HomeDefensiveStrength(HDS), AwayDefensiveStrength(ADS) to each game for the 2019/20 season.
h_a_cols = ['HomeTeam', 'AwayTeam']
data_prep = df_2019[h_a_cols]

# Add in the Strengths by Home and Away teams.
data_prep = data_prep.merge(team_data['HAS'], left_on='HomeTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['HDS'], left_on='HomeTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['AAS'], left_on='AwayTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['ADS'], left_on='AwayTeam', right_index=True, how='left')

# Add in the Total Scored and Conceded.
data_prep = data_prep.merge(team_data['HGS'], left_on='HomeTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['HGC'], left_on='HomeTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['AGS'], left_on='AwayTeam', right_index=True, how='left')
data_prep = data_prep.merge(team_data['AGC'], left_on='AwayTeam', right_index=True, how='left')

# Add in the Results
data_prep = data_prep.merge(df_2019['Result'], left_index=True, right_index=True)

data_prep

Unnamed: 0,HomeTeam,AwayTeam,HAS,HDS,AAS,ADS,HGS,HGC,AGS,AGC,Result
0,Liverpool,Norwich,0.880952,0.125000,0.000000,1.000000,0.880952,0.125000,0.000000,1.000000,1
1,West Ham,Man City,0.357143,0.833333,1.000000,0.238095,0.357143,0.833333,1.000000,0.238095,-1
2,Bournemouth,Sheffield Utd,0.166667,0.708333,0.210526,0.333333,0.166667,0.708333,0.210526,0.333333,0
3,Burnley,Southampton,0.214286,0.416667,0.605263,0.380952,0.214286,0.416667,0.605263,0.380952,1
4,Crystal Palace,Everton,0.000000,0.291667,0.342105,0.857143,0.000000,0.291667,0.342105,0.857143,0
...,...,...,...,...,...,...,...,...,...,...,...
375,Leicester,Man Utd,0.476190,0.166667,0.500000,0.095238,0.476190,0.166667,0.500000,0.095238,-1
376,Man City,Norwich,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1
377,Newcastle,Liverpool,0.119048,0.333333,0.684211,0.000000,0.119048,0.333333,0.684211,0.000000,-1
378,Southampton,Sheffield Utd,0.142857,0.916667,0.210526,0.333333,0.142857,0.916667,0.210526,0.333333,1


In [37]:
# Check that there are no null values so we can ensure we are using a complete dataset.
data_prep.isnull().sum()

HomeTeam    0
AwayTeam    0
HAS         0
HDS         0
AAS         0
ADS         0
HGS         0
HGC         0
AGS         0
AGC         0
Result      0
dtype: int64

In [38]:
# Turn our game dataframe into a numerical dataframe by dropping the string columns.
rm_col_list = ['HomeTeam','AwayTeam']
col_list = data_prep.columns.values.tolist()
col_list = [x for x in col_list if x not in rm_col_list]

to_use = data_prep[col_list]
X = [x for x in col_list if x != 'Result']
Y = [x for x in col_list if x == 'Result']

to_use

Unnamed: 0,HAS,HDS,AAS,ADS,HGS,HGC,AGS,AGC,Result
0,0.880952,0.125000,0.000000,1.000000,0.880952,0.125000,0.000000,1.000000,1
1,0.357143,0.833333,1.000000,0.238095,0.357143,0.833333,1.000000,0.238095,-1
2,0.166667,0.708333,0.210526,0.333333,0.166667,0.708333,0.210526,0.333333,0
3,0.214286,0.416667,0.605263,0.380952,0.214286,0.416667,0.605263,0.380952,1
4,0.000000,0.291667,0.342105,0.857143,0.000000,0.291667,0.342105,0.857143,0
...,...,...,...,...,...,...,...,...,...
375,0.476190,0.166667,0.500000,0.095238,0.476190,0.166667,0.500000,0.095238,-1
376,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1
377,0.119048,0.333333,0.684211,0.000000,0.119048,0.333333,0.684211,0.000000,-1
378,0.142857,0.916667,0.210526,0.333333,0.142857,0.916667,0.210526,0.333333,1


In [39]:
# Seperate our data into X and Y sections and instantiate, fit and get a score of 
# the first model
X_data = to_use.drop('Result', axis=1)
Y_data = to_use['Result']

model = RandomForestClassifier()
model.fit(X_data, Y_data)
model.score(X_data, Y_data)

0.95

In [40]:
# Seperate our data into training and testing and instantiate our second model.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

model1 = RandomForestClassifier()
model1.fit(X_train, Y_train)

RandomForestClassifier()

In [41]:
# Get the predicted scores into from our model.
predicted = model1.predict(X_test)
predicted

array([ 1,  1,  1,  1,  1,  1,  0,  1, -1,  1,  1,  1, -1,  1,  1, -1,  1,
        0,  1,  1, -1, -1,  1,  1, -1, -1,  0, -1, -1,  1,  1,  1,  0,  1,
        1,  1,  1,  0, -1,  1,  0,  1, -1,  1,  0,  1,  1, -1, -1, -1,  0,
        1, -1,  0,  0,  0,  0,  1,  1,  0,  1,  0,  1,  1,  1,  1, -1,  0,
       -1, -1, -1,  0,  1, -1,  1, -1])

In [42]:
# Get the training and test accuracy/variance scores of our models and assess. 
acc_score = accuracy_score(Y_test, predicted)
print("Accuracy score: " + str(acc_score))
scores = cross_val_score(model1, X_train, Y_train, cv=10)
print("Cross val scores")
print(scores)

Accuracy score: 0.3815789473684211
Cross val scores
[0.5483871  0.64516129 0.48387097 0.35483871 0.6        0.4
 0.53333333 0.43333333 0.5        0.56666667]


In [43]:

score_train = model1.score(X_train, Y_train)
score_test = model1.score(X_test, Y_test)

print("Training set accuracy: ", '%.3f'%(score_train))
print("Test set accuracy: ", '%.3f'%(score_test))

Training set accuracy:  0.964
Test set accuracy:  0.382


In [44]:
# Turn the predicted array into a dataframe.
predicted = model1.predict(X_data)
predicted
predictedDF = pd.DataFrame(predicted, columns=['Predicted'])
predictedDF.head()

Unnamed: 0,Predicted
0,1
1,-1
2,0
3,0
4,0


In [45]:
# Merge the games dataframe with actual and predicted results.
to_use = to_use.merge(predictedDF, left_index=True, right_index=True)
fixture_list = data_prep.merge(predictedDF, left_index=True, right_index=True)
fixture_list

Unnamed: 0,HomeTeam,AwayTeam,HAS,HDS,AAS,ADS,HGS,HGC,AGS,AGC,Result,Predicted
0,Liverpool,Norwich,0.880952,0.125000,0.000000,1.000000,0.880952,0.125000,0.000000,1.000000,1,1
1,West Ham,Man City,0.357143,0.833333,1.000000,0.238095,0.357143,0.833333,1.000000,0.238095,-1,-1
2,Bournemouth,Sheffield Utd,0.166667,0.708333,0.210526,0.333333,0.166667,0.708333,0.210526,0.333333,0,0
3,Burnley,Southampton,0.214286,0.416667,0.605263,0.380952,0.214286,0.416667,0.605263,0.380952,1,0
4,Crystal Palace,Everton,0.000000,0.291667,0.342105,0.857143,0.000000,0.291667,0.342105,0.857143,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
375,Leicester,Man Utd,0.476190,0.166667,0.500000,0.095238,0.476190,0.166667,0.500000,0.095238,-1,-1
376,Man City,Norwich,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,1.000000,1,1
377,Newcastle,Liverpool,0.119048,0.333333,0.684211,0.000000,0.119048,0.333333,0.684211,0.000000,-1,-1
378,Southampton,Sheffield Utd,0.142857,0.916667,0.210526,0.333333,0.142857,0.916667,0.210526,0.333333,1,1


In [48]:
# Calculate points based off of wins, draws, and losses.
last_season_table['Actual Points'] = 0 
last_season_table['Predicted Points'] = 0 

def make_actual_table(x, ht, at, which):
    if x == 1:
        last_season_table.loc[ht, which] += 3
    elif x == 0:
        last_season_table.loc[ht, which] += 1
        last_season_table.loc[at, which] += 1
    elif x == -1:
        last_season_table.loc[at, which] += 3

fixture_list.apply(lambda x: make_actual_table(x.Result, x.HomeTeam, x.AwayTeam, 'Actual Points'), axis=1)
fixture_list.apply(lambda x: make_actual_table(x.Predicted, x.HomeTeam, x.AwayTeam, 'Predicted Points'), axis=1)


0      None
1      None
2      None
3      None
4      None
       ... 
375    None
376    None
377    None
378    None
379    None
Length: 380, dtype: object

In [49]:
# 19/20 Season actual table
actual_index = last_season_table.sort_values(by='Actual Points', ascending=0).index
predicted_index = last_season_table.sort_values(by='Predicted Points', ascending=0).index

last_season_table.sort_values(by='Actual Points', ascending=0)

Unnamed: 0_level_0,Actual Points,Predicted Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Liverpool,99,99
Man City,81,78
Man Utd,66,74
Chelsea,66,74
Leicester,62,67
Tottenham,59,56
Wolves,59,60
Arsenal,56,68
Burnley,54,38
Sheffield Utd,54,55
