In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PoissonRegressor
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [4]:
matches = pd.read_csv('matches.csv', index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,int,tkl+int,err,succ,succ%,crdy,fls,won%,season,team
0,8/5/2022,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,...,9,38,0,13,76.5,2,11,58.3,2022,Arsenal
1,8/13/2022,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,...,8,16,0,11,64.7,1,15,57.6,2022,Arsenal
2,8/20/2022,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,...,10,24,1,12,44.4,1,10,53.8,2022,Arsenal
3,8/27/2022,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,...,7,22,1,13,44.8,2,6,33.3,2022,Arsenal
4,8/31/2022,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,...,8,18,0,12,52.2,2,10,50.0,2022,Arsenal


# Data Exploration

In [282]:
matches.shape

(3332, 25)

In [283]:
matches["team"].value_counts()

Everton                     167
Newcastle United            167
Tottenham Hotspur           167
West Ham United             167
Wolverhampton Wanderers     167
Leicester City              167
Southampton                 167
Crystal Palace              166
Manchester City             166
Arsenal                     166
Chelsea                     166
Brighton and Hove Albion    166
Liverpool                   166
Manchester United           166
Burnley                     152
Aston Villa                 129
Watford                     114
Bournemouth                  91
Fulham                       91
Leeds United                 90
Norwich City                 76
Sheffield United             76
Brentford                    53
West Bromwich Albion         38
Cardiff City                 38
Huddersfield Town            38
Nottingham Forest            15
Name: team, dtype: int64

In [284]:
matches["round"].value_counts()

Matchweek 1     100
Matchweek 10    100
Matchweek 2     100
Matchweek 15    100
Matchweek 14    100
Matchweek 13    100
Matchweek 11    100
Matchweek 16    100
Matchweek 9     100
Matchweek 6     100
Matchweek 5     100
Matchweek 4     100
Matchweek 3     100
Matchweek 12     98
Matchweek 8      94
Matchweek 34     80
Matchweek 29     80
Matchweek 31     80
Matchweek 32     80
Matchweek 30     80
Matchweek 36     80
Matchweek 35     80
Matchweek 33     80
Matchweek 37     80
Matchweek 27     80
Matchweek 28     80
Matchweek 20     80
Matchweek 26     80
Matchweek 25     80
Matchweek 24     80
Matchweek 23     80
Matchweek 22     80
Matchweek 21     80
Matchweek 19     80
Matchweek 18     80
Matchweek 17     80
Matchweek 7      80
Matchweek 38     80
Name: round, dtype: int64

In [14]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
gls             float64
sh              float64
sot             float64
dist            float64
sota            float64
save%           float64
cs              float64
psxg            float64
cmp             float64
cmp.1           float64
cmp.2           float64
cmp.3           float64
cmp%            float64
cmp%.1          float64
cmp%.2          float64
cmp%.3          float64
prgdist         float64
ast             float64
ppa             float64
prog            float64
sca             float64
gca             float64
tklw            

In [39]:
matches.columns.values

array(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf',
       'ga', 'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain',
       'formation', 'referee', 'match report', 'notes', 'gls', 'sh',
       'sot', 'dist', 'sota', 'save%', 'cs', 'psxg', 'cmp', 'cmp.1',
       'cmp.2', 'cmp.3', 'cmp%', 'cmp%.1', 'cmp%.2', 'cmp%.3', 'prgdist',
       'ast', 'ppa', 'prog', 'sca', 'gca', 'tklw', 'int', 'tkl+int',
       'err', 'succ', 'succ%', 'crdy', 'fls', 'won%', 'season', 'team'],
      dtype=object)

# Data Preparation

In [60]:
#Creating date as a datetime variable
matches["date"] = pd.to_datetime(matches["date"])

#Creating venue codes as a categorical variable
matches["venue_code"] = matches["venue"].astype("category").cat.codes

#Creating opponent codes as a categorical variable
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

#Creating team codes as a categorical variable
matches["team_code"] = matches["team"].astype("category").cat.codes

#Keeping the hour as integer value
matches["hour"] = matches["time"].str.replace(":.+","", regex = True).astype("int")

#Creating day codes as a categorical variable
matches["day_code"] = matches["date"].dt.dayofweek

#Creating numerical variable for team's form
form_dict = {'W':1, 'D':0, 'L':-1}
matches["form"] = matches["result"].replace(form_dict)

#Creating a target variable
matches["target"] = (matches["result"] == "W").astype("int")
#matches["target"] = matches["result"].astype("category").cat.codes

#Creating a team to score variable
matches["tts"] = np.where(matches["gf"] > 0, 1, 0)
matches["tts"] = matches["tts"].astype("category")

#Creating a team to concede variable
matches["ttc"] = np.where(matches["ga"] > 0, 1, 0)
matches["ttc"] = matches["ttc"].astype("category")

#Creating a both team to score variable
matches["btts"] = np.where(((matches["ga"] > 0) & (matches["gf"] > 0)), 1, 0)
matches["btts"] = matches["btts"].astype("category")

#Creating a goal above 2.5 variable
matches["above_2.5"] = np.where((matches["ga"] + matches["gf"]) > 2, 1, 0)
matches["above_2.5"] = matches["above_2.5"].astype("category")

#Creating a goal above 3.5 variable
matches["above_3.5"] = np.where((matches["ga"] + matches["gf"]) > 3, 1, 0)
matches["above_3.5"] = matches["above_3.5"].astype("category")

In [61]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,team_code,hour,day_code,form,target,tts,ttc,btts,above_2.5,above_3.5
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,0,20,4,1.0,1,1,0,0,0,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,0,15,5,1.0,1,1,1,1,1,1
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,0,17,5,1.0,1,1,0,0,1,0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,0,17,5,1.0,1,1,1,1,1,0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,0,19,2,1.0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0.0,4.0,Tottenham,...,11,12,5,-1.0,0,0,1,0,1,1
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Watford,...,11,15,5,-1.0,0,1,1,1,1,0
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0.0,5.0,Liverpool,...,11,20,4,-1.0,0,0,1,0,1,1
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1.0,1.0,Manchester Utd,...,11,14,6,0.0,0,1,1,1,0,0


In [164]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
gls                    float64
sh                     float64
sot                    float64
dist                   float64
season                   int64
team                    object
venue_code                int8
opp_code                  int8
hour                     int32
day_code                 int64
target                   int32
form                   float64
dtype: object

# Modelling Using General Match Data

## Model training

In [62]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = matches[matches["season"] != 2022]
test = matches[matches["season"] == 2022]
predictors = ["venue_code", "opp_code", "team_code"]
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

## Model Evaluation

In [63]:
acc = accuracy_score(test["target"], preds)
#prec = precision_score(test["target"], preds)
prec = precision_score(test["target"], preds, average='micro')
print("Accuracy:", acc)
print("Precision:", prec)

Accuracy: 0.6288659793814433
Precision: 0.6288659793814433


In [64]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,139,40
1,68,44


# Modelling Using Match Statistic Data

## Data Engineering

In [2]:
rf = RandomForestClassifier(n_estimators=150, min_samples_split=10, random_state=1)

def rolling_averages(group, cols, new_cols, n):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(n, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [73]:
matches.columns.values

array(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf',
       'ga', 'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain',
       'formation', 'referee', 'match report', 'notes', 'gls', 'sh',
       'sot', 'dist', 'sota', 'save%', 'cs', 'psxg', 'cmp', 'cmp.1',
       'cmp.2', 'cmp.3', 'cmp%', 'cmp%.1', 'cmp%.2', 'cmp%.3', 'prgdist',
       'ast', 'ppa', 'prog', 'sca', 'gca', 'tklw', 'int', 'tkl+int',
       'err', 'succ', 'succ%', 'crdy', 'fls', 'won%', 'season', 'team',
       'venue_code', 'opp_code', 'team_code', 'hour', 'day_code', 'form',
       'target', 'tts', 'ttc', 'btts', 'above_2.5', 'above_3.5'],
      dtype=object)

In [7]:
#Predictors for match results
predictors = ["venue_code", "opp_code", "team_code"]
cols_form = ["gf", "ga", "sh", "sot", "dist", "form", "xg", "poss", "sota", "save%", "cs", "psxg", "cmp", "cmp%", "prgdist",
            "ast", "ppa", "prog", "sca", "gca", "tklw", "int", "tkl+int", "err", "succ", "succ%", "crdy", "fls", "won%"]
cols_avg = ["gf", "ga", "form", "xg", "xga", "poss", "cs"]
new_cols_form = [f"{c}_rolling_3" for c in cols_form]
new_cols_avg = [f"{c}_rolling_365" for c in cols_avg]

In [26]:
#Predictors for team to score
predictors = ["venue_code", "opp_code", "team_code"]
cols_form = ["gf", "sh", "sot", "dist", "form", "xg", "poss", "sota", "cmp", "cmp%", "prgdist", "ast", "ppa", 
             "prog", "sca", "gca", "succ", "succ%", "crdy", "fls", "won%"]
cols_avg = ["gf", "ga", "form", "xg", "xga", "poss", "cs"]
new_cols_form = [f"{c}_rolling_3" for c in cols_form]
new_cols_avg = [f"{c}_rolling_365" for c in cols_avg]

In [220]:
#Finding rolling average for the team for every 40 matches
#This will be able to show how the team is expected to perform over a long period of time
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols_avg, new_cols_avg, 40))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])

#Find rolling averages for the team for every 3 matches
#This will be able to show the recent form of the team in the short term
matches_rolling = matches_rolling.groupby("team").apply(lambda x: rolling_averages(x, cols_form, new_cols_form, 3))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])

In [5]:
matches_rolling
#matches_rolling.to_csv('matches_rolling.csv')

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,gca_rolling_3,tklw_rolling_3,int_rolling_3,tkl+int_rolling_3,err_rolling_3,succ_rolling_3,succ%_rolling_3,crdy_rolling_3,fls_rolling_3,won%_rolling_3
0,0,2018-09-23,16:00,Premier League,Matchweek 6,Sun,Home,W,2.0,0.0,...,4.333333,8.666667,8.000000,21.666667,1.000000,7.000000,56.333333,1.666667,13.666667,49.266667
1,1,2018-09-29,15:00,Premier League,Matchweek 7,Sat,Home,W,2.0,0.0,...,4.333333,9.000000,9.000000,24.000000,0.666667,6.000000,53.000000,2.000000,14.000000,50.766667
2,2,2018-10-07,12:00,Premier League,Matchweek 8,Sun,Away,W,5.0,1.0,...,3.000000,7.666667,7.666667,22.333333,0.333333,6.333333,50.400000,1.333333,13.000000,53.266667
3,3,2018-10-22,20:00,Premier League,Matchweek 9,Mon,Home,W,3.0,1.0,...,5.333333,10.333333,10.000000,26.666667,0.666667,7.000000,51.266667,1.333333,13.333333,50.333333
4,4,2018-10-28,13:30,Premier League,Matchweek 10,Sun,Away,D,2.0,2.0,...,6.000000,10.000000,10.333333,27.333333,0.666667,9.000000,59.600000,1.333333,11.000000,48.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2636,2636,2022-10-23,14:00,Premier League,Matchweek 13,Sun,Home,L,0.0,4.0,...,1.000000,10.666667,6.333333,26.000000,0.000000,5.333333,32.233333,1.000000,14.333333,45.033333
2637,2637,2022-10-29,15:00,Premier League,Matchweek 14,Sat,Away,D,1.0,1.0,...,1.000000,10.000000,7.000000,26.000000,0.000000,4.333333,27.700000,1.666667,15.000000,47.133333
2638,2638,2022-11-05,15:00,Premier League,Matchweek 15,Sat,Home,L,2.0,3.0,...,1.333333,9.333333,8.000000,27.000000,0.000000,6.000000,31.500000,2.000000,15.000000,48.700000
2639,2639,2022-11-12,19:45,Premier League,Matchweek 16,Sat,Home,L,0.0,2.0,...,1.333333,6.666667,7.333333,22.666667,0.000000,6.000000,30.133333,2.333333,14.000000,46.500000


In [4]:
matches_rolling = pd.read_csv('matches_engineered.csv')

## Model Training Using All Variables

In [8]:
def make_result_predictions(data, predictors, y):
    train = data[data["season"] != 2022]
    test = data[data["season"] == 2022]
    print(f"Splitting data into {round(len(train)/(len(train)+len(test)),2)}:{round(len(test)/(len(train)+len(test)),2)} ratio")
    
    #Random Forest Classifier Model
    rf = RandomForestClassifier(n_estimators=97, criterion='gini', max_depth = 124, min_samples_split = 6,
                                min_samples_leaf = 3, max_features = None, max_leaf_nodes = 1906, 
                                min_impurity_decrease = 0.004238486800121857, bootstrap = True, random_state = 15,
                                oob_score = False, warm_start = True, ccp_alpha = 0.01769784612106034, max_samples = 35)
    
    rf.fit(train[predictors], train[y])
    preds = rf.predict(test[predictors])
    combined_rf = pd.DataFrame(dict(actual=test[y], predicted = preds), index=test.index)
    precision = precision_score(test[y],preds)
    accuracy = accuracy_score(test[y], preds)
    print("Training Completed for Random Forest")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    
    #Logistic Regression Model
    logreg = LogisticRegression(penalty = 'elasticnet', C = 0.10205138608051409, fit_intercept = False, solver = 'saga',
                               random_state = 22, max_iter = 317, warm_start = True, l1_ratio = 0.04938219562914714)
    logreg.fit(train[predictors], train[y])
    preds = logreg.predict(test[predictors])
    combined_logreg = pd.DataFrame(dict(actual=test[y], predicted = preds), index=test.index)
    precision = precision_score(test[y],preds)
    accuracy = accuracy_score(test[y], preds)
    print("Training Completed for Logistic Regression")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    
    #XGBClassifier
    xgb = XGBClassifier(booster = 'dart',
                        objective = 'binary:logitraw',
                        sample_type = 'weighted',
                        normalize_type = 'tree',
                        rate_drop = 0.625430139999562,
                        one_drop = 0,
                        skip_drop = 0.5267890691915503)
    xgb.fit(train[predictors], train[y])
    preds = xgb.predict(test[predictors])
    combined_xgb = pd.DataFrame(dict(actual=test[y], predicted = preds), index=test.index)
    precision = precision_score(test[y],preds)
    accuracy = accuracy_score(test[y], preds)
    print("Training Completed for XGBClassifier")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    
    return combined_rf, combined_logreg, combined_xgb

In [9]:
#Making predictions on the outcome of the match
combined_rf, combined_logreg, combined_xgb = make_result_predictions(matches_rolling, predictors + new_cols_form + new_cols_avg, "target")

Splitting data into 0.9:0.1 ratio
Training Completed for Random Forest
Accuracy: 0.6704119850187266
Precision: 0.6119402985074627




Training Completed for Logistic Regression
Accuracy: 0.6217228464419475
Precision: 0.5178571428571429
Training Completed for XGBClassifier
Accuracy: 0.6254681647940075
Precision: 0.5217391304347826


In [28]:
#Making predictions on whether both teams will score
combined_rf, combined_logreg, combined_xgb = make_result_predictions(matches_rolling, predictors + new_cols_form + new_cols_avg, "btts")

Splitting data into 0.9:0.1 ratio
Training Completed for Random Forest
Accuracy: 0.5805243445692884
Precision: 0.6388888888888888




Training Completed for Logistic Regression
Accuracy: 0.5243445692883895
Precision: 0.5581395348837209
Training Completed for XGBClassifier
Accuracy: 0.5430711610486891
Precision: 0.5735294117647058


In [35]:
#Making predictions on the total goals above 2.5 / 3.5
combined_rf, combined_logreg, combined_xgb = make_result_predictions(matches_rolling, predictors + new_cols_form + new_cols_avg, "above_2.5")

Splitting data into 0.9:0.1 ratio
Training Completed for Random Forest
Accuracy: 0.5617977528089888
Precision: 0.592964824120603




Training Completed for Logistic Regression
Accuracy: 0.5468164794007491
Precision: 0.5789473684210527
Training Completed for XGBClassifier
Accuracy: 0.5280898876404494
Precision: 0.5714285714285714


In [12]:
combined_rf = combined_rf.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined_logreg = combined_logreg.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined_xgb = combined_xgb.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [10]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,136,28
1,61,42


In [13]:
combined_rf

Unnamed: 0,actual,predicted,date,team,opponent,result
141,1,1,2022-08-05,Arsenal,Crystal Palace,W
142,1,1,2022-08-13,Arsenal,Leicester City,W
143,1,1,2022-08-20,Arsenal,Bournemouth,W
144,1,1,2022-08-27,Arsenal,Fulham,W
145,1,1,2022-08-31,Arsenal,Aston Villa,W
...,...,...,...,...,...,...
2636,0,0,2022-10-23,Wolverhampton Wanderers,Leicester City,L
2637,0,0,2022-10-29,Wolverhampton Wanderers,Brentford,D
2638,0,0,2022-11-05,Wolverhampton Wanderers,Brighton,L
2639,0,0,2022-11-12,Wolverhampton Wanderers,Arsenal,L


# Model Training Using Selected Variables

In [179]:
# Making match result predictions using selected variables
selected_predictors = ['team_code','opp_code','venue_code','form_rolling_365','xg_rolling_365','gf_rolling_365',
                       'poss_rolling_365','ga_rolling_365','cs_rolling_365','xga_rolling_365']
combined, precision, accuracy = make_match_result_predictions(matches_rolling, selected_predictors)

Splitting data into 0.88:0.12 ratio
Training Completed
Accuracy: 0.6491935483870968
Precision: 0.5645161290322581


In [191]:
# Making team to score predictions using selected variables
selected_predictors = ['team_code','opp_code','venue_code','form_rolling_365','xg_rolling_365','gf_rolling_365',
                       'poss_rolling_365','ga_rolling_365','cs_rolling_365','xga_rolling_365']
combined, precision, accuracy = make_btts_result_predictions(matches_rolling, selected_predictors)

Splitting data into 0.88:0.12 ratio
Training Completed
Accuracy: 0.6733870967741935
Precision: 0.7123893805309734


In [178]:
# Making team to concede predictions using selected variables
selected_predictors = ['team_code','opp_code','venue_code','form_rolling_365','xg_rolling_365','gf_rolling_365',
                       'poss_rolling_365','ga_rolling_365','cs_rolling_365','xga_rolling_365']
combined, precision, accuracy = make_above_result_predictions(matches_rolling, selected_predictors)

Splitting data into 0.88:0.12 ratio
Training Completed
Accuracy: 0.7016129032258065
Precision: 0.38095238095238093


In [186]:
combined_pred = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6,65
1,16,161


## Combining Home and Away Predictions

In [29]:
pd.unique(matches_rolling['team'])

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford',
       'Brighton and Hove Albion', 'Burnley', 'Chelsea', 'Crystal Palace',
       'Everton', 'Fulham', 'Huddersfield Town', 'Leeds United',
       'Leicester City', 'Liverpool', 'Manchester City',
       'Manchester United', 'Newcastle United', 'Norwich City',
       'Sheffield United', 'Southampton', 'Tottenham Hotspur', 'Watford',
       'West Bromwich Albion', 'West Ham United',
       'Wolverhampton Wanderers'], dtype=object)

In [30]:
pd.unique(matches_rolling['opponent'])

array(['Everton', 'Watford', 'Fulham', 'Leicester City', 'Crystal Palace',
       'Liverpool', 'Wolves', 'Bournemouth', 'Tottenham',
       'Manchester Utd', 'Huddersfield', 'West Ham', 'Chelsea',
       'Cardiff City', 'Manchester City', 'Southampton', 'Newcastle Utd',
       'Brighton', 'Burnley', 'Aston Villa', 'Sheffield Utd',
       'Norwich City', 'Leeds United', 'West Brom', 'Brentford',
       "Nott'ham Forest", 'Arsenal'], dtype=object)

In [14]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Leeds United": "Leeds",
    "Leiceister City": "Leiceister",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "West Bromwich Albion" : "West Brom"
}
mapping = MissingDict(**map_values)

In [15]:
combined_rf['new_team'] = combined_rf['team'].map(mapping)
combined_rf['new_opponent'] = combined_rf['opponent'].map(mapping)
merged_rf = combined_rf.merge(combined_rf, left_on = ["date", "new_team"], right_on=["date", "new_opponent"])

combined_logreg['new_team'] = combined_logreg['team'].map(mapping)
combined_logreg['new_opponent'] = combined_logreg['opponent'].map(mapping)
merged_logreg = combined_logreg.merge(combined_logreg, left_on = ["date", "new_team"], right_on=["date", "new_opponent"])

combined_xgb['new_team'] = combined_xgb['team'].map(mapping)
combined_xgb['new_opponent'] = combined_xgb['opponent'].map(mapping)
merged_xgb = combined_xgb.merge(combined_xgb, left_on = ["date", "new_team"], right_on=["date", "new_opponent"])

In [18]:
merged_rf

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,new_opponent_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y,new_opponent_y
0,1,1,2022-08-05,Arsenal,Crystal Palace,W,Arsenal,Crystal Palace,0,0,Crystal Palace,Arsenal,L,Crystal Palace,Arsenal
1,1,1,2022-08-13,Arsenal,Leicester City,W,Arsenal,Leicester City,0,0,Leicester City,Arsenal,L,Leicester City,Arsenal
2,1,1,2022-08-20,Arsenal,Bournemouth,W,Arsenal,Bournemouth,0,0,Bournemouth,Arsenal,L,Bournemouth,Arsenal
3,1,1,2022-08-27,Arsenal,Fulham,W,Arsenal,Fulham,0,0,Fulham,Arsenal,L,Fulham,Arsenal
4,1,1,2022-08-31,Arsenal,Aston Villa,W,Arsenal,Aston Villa,0,0,Aston Villa,Arsenal,L,Aston Villa,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,0,0,2022-10-18,Wolverhampton Wanderers,Crystal Palace,L,Wolves,Crystal Palace,1,0,Crystal Palace,Wolves,W,Crystal Palace,Wolves
216,0,0,2022-10-23,Wolverhampton Wanderers,Leicester City,L,Wolves,Leicester City,1,0,Leicester City,Wolves,W,Leicester City,Wolves
217,0,0,2022-10-29,Wolverhampton Wanderers,Brentford,D,Wolves,Brentford,0,0,Brentford,Wolves,D,Brentford,Wolves
218,0,0,2022-11-12,Wolverhampton Wanderers,Arsenal,L,Wolves,Arsenal,1,1,Arsenal,Wolves,W,Arsenal,Wolves


### For Win-Lose Result Prediction

In [46]:
print("Random Forest Classifier")
win_count = merged_rf[(merged_rf["predicted_x"] == 1) & (merged_rf["predicted_y"] == 0)]
precision_win = len(win_count[win_count['actual_x'] == 1]) / len(win_count)
print(f"Team wins when model predicts that team will win and opponent will lose: {round(precision_win, 5)}")

draw_count = merged_rf[(merged_rf["predicted_x"] == 0) & (merged_rf["predicted_y"] == 0)]
precision_draw = len(draw_count[draw_count['result_x'] == 'D']) / len(draw_count)
print(f"Team draws when model predicts that both teams will lose or win: {round(precision_draw, 5)}")
print("")

print("Logistic Regression")
win_count = merged_logreg[(merged_logreg["predicted_x"] == 1) & (merged_logreg["predicted_y"] == 0)]
precision_win = len(win_count[win_count['actual_x'] == 1]) / len(win_count)
print(f"Team wins when model predicts that team will win and opponent will lose: {round(precision_win, 5)}")

draw_count = merged_logreg[(merged_logreg["predicted_x"] == 0) & (merged_logreg["predicted_y"] == 0)]
precision_draw = len(draw_count[draw_count['result_x'] == 'D']) / len(draw_count)
print(f"Team draws when model predicts that both teams will lose or win: {round(precision_draw, 5)}")
print("")

print("XGBoost Classifier")
win_count = merged_xgb[(merged_xgb["predicted_x"] == 1) & (merged_xgb["predicted_y"] == 0)]
precision_win = len(win_count[win_count['actual_x'] == 1]) / len(win_count)
print(f"Team wins when model predicts that team will win and opponent will lose: {round(precision_win, 5)}")

draw_count = merged_xgb[(merged_xgb["predicted_x"] == 0) & (merged_xgb["predicted_y"] == 0)]
precision_draw = len(draw_count[draw_count['result_x'] == 'D']) / len(draw_count)
print(f"Team draws when model predicts that both teams will lose or win: {round(precision_draw, 5)}")
print("")

Random Forest Classifier
Team wins when model predicts that team will win and opponent will lose: 0.67347
Team draws when model predicts that both teams will lose or win: 0.23478

Logistic Regression
Team wins when model predicts that team will win and opponent will lose: 0.62162
Team draws when model predicts that both teams will lose or win: 0.21168

XGBoost Classifier
Team wins when model predicts that team will win and opponent will lose: 0.55319
Team draws when model predicts that both teams will lose or win: 0.19492



In [141]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,new_opponent_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y,new_opponent_y
0,1,1,2022-08-05,Arsenal,Crystal Palace,W,Arsenal,Crystal Palace,0,0,Crystal Palace,Arsenal,L,Crystal Palace,Arsenal
4,1,1,2022-08-31,Arsenal,Aston Villa,W,Arsenal,Aston Villa,0,0,Aston Villa,Arsenal,L,Aston Villa,Arsenal
6,1,1,2022-09-18,Arsenal,Brentford,W,Arsenal,Brentford,0,0,Brentford,Arsenal,L,Brentford,Arsenal
10,0,1,2022-10-23,Arsenal,Southampton,D,Arsenal,Southampton,0,0,Southampton,Arsenal,D,Southampton,Arsenal
11,1,1,2022-11-06,Arsenal,Chelsea,W,Arsenal,Chelsea,0,0,Chelsea,Arsenal,L,Chelsea,Arsenal
12,1,1,2022-11-12,Arsenal,Wolves,W,Arsenal,Wolves,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves,Arsenal
19,1,1,2022-09-16,Aston Villa,Southampton,W,Aston Villa,Southampton,0,0,Southampton,Aston Villa,L,Southampton,Aston Villa
53,0,1,2022-08-21,Chelsea,Leeds United,L,Chelsea,Leeds,1,0,Leeds United,Chelsea,W,Leeds,Chelsea
54,1,1,2022-08-27,Chelsea,Leicester City,W,Chelsea,Leicester City,0,0,Leicester City,Chelsea,L,Leicester City,Chelsea
55,0,1,2022-08-30,Chelsea,Southampton,L,Chelsea,Southampton,1,0,Southampton,Chelsea,W,Southampton,Chelsea


In [143]:
merged[(merged["predicted_x"] == 0) & (merged["predicted_y"] == 0)]

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,new_opponent_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y,new_opponent_y
1,1,0,2022-08-13,Arsenal,Leicester City,W,Arsenal,Leicester City,0,0,Leicester City,Arsenal,L,Leicester City,Arsenal
2,1,0,2022-08-20,Arsenal,Bournemouth,W,Arsenal,Bournemouth,0,0,Bournemouth,Arsenal,L,Bournemouth,Arsenal
3,1,0,2022-08-27,Arsenal,Fulham,W,Arsenal,Fulham,0,0,Fulham,Arsenal,L,Fulham,Arsenal
5,0,0,2022-09-04,Arsenal,Manchester Utd,L,Arsenal,Manchester Utd,1,0,Manchester United,Arsenal,W,Manchester Utd,Arsenal
9,1,0,2022-10-16,Arsenal,Leeds United,W,Arsenal,Leeds,0,0,Leeds United,Arsenal,L,Leeds,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,1,0,2022-09-03,Wolverhampton Wanderers,Southampton,W,Wolves,Southampton,0,0,Southampton,Wolves,L,Southampton,Wolves
198,0,0,2022-10-01,Wolverhampton Wanderers,West Ham,L,Wolves,West Ham,1,0,West Ham United,Wolves,W,West Ham,Wolves
200,0,0,2022-10-18,Wolverhampton Wanderers,Crystal Palace,L,Wolves,Crystal Palace,1,0,Crystal Palace,Wolves,W,Crystal Palace,Wolves
201,0,0,2022-10-23,Wolverhampton Wanderers,Leicester City,L,Wolves,Leicester City,1,0,Leicester City,Wolves,W,Leicester City,Wolves


In [97]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 1)]

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,new_opponent_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y,new_opponent_y
7,1,1,2022-10-01,Arsenal,Tottenham,W,Arsenal,Tottenham,0,1,Tottenham Hotspur,Arsenal,L,Tottenham,Arsenal
127,0,1,2022-08-22,Liverpool,Manchester Utd,L,Liverpool,Manchester Utd,1,1,Manchester United,Liverpool,W,Manchester Utd,Liverpool
135,1,1,2022-11-06,Liverpool,Tottenham,W,Liverpool,Tottenham,0,1,Tottenham Hotspur,Liverpool,L,Tottenham,Liverpool
147,1,1,2022-08-22,Manchester United,Liverpool,W,Manchester Utd,Liverpool,0,1,Liverpool,Manchester Utd,L,Liverpool,Manchester Utd
174,0,1,2022-10-01,Tottenham Hotspur,Arsenal,L,Tottenham,Arsenal,1,1,Arsenal,Tottenham,W,Arsenal,Tottenham
177,0,1,2022-11-06,Tottenham Hotspur,Liverpool,L,Tottenham,Liverpool,1,1,Liverpool,Tottenham,W,Liverpool,Tottenham


### For Both Teams To Score Prediction

In [32]:
print("Random Forest Classifier")
btts_count = merged_rf[(merged_rf["predicted_x"] == 1) & (merged_rf["predicted_y"] == 1)]
precision_btts = len(btts_count[(btts_count['actual_x'] == 1) & (btts_count['actual_y'] == 1)]) / len(btts_count)
print(f"Both teams actually score when model predicts that both teams score: {round(precision_btts, 5)}")
print("")

print("Logistic Regression")
btts_count = merged_logreg[(merged_logreg["predicted_x"] == 1) & (merged_logreg["predicted_y"] == 1)]
precision_btts = len(btts_count[(btts_count['actual_x'] == 1) & (btts_count['actual_y'] == 1)]) / len(btts_count)
print(f"Both teams actually score when model predicts that both teams score: {round(precision_btts, 5)}")
print("")

print("XGB Classifier")
btts_count = merged_xgb[(merged_xgb["predicted_x"] == 1) & (merged_xgb["predicted_y"] == 1)]
precision_btts = len(btts_count[(btts_count['actual_x'] == 1) & (btts_count['actual_y'] == 1)]) / len(btts_count)
print(f"Both teams actually score when model predicts that both teams score: {round(precision_btts, 5)}")
print("")

Random Forest Classifier
Both teams actually score when model predicts that both teams score: 0.86207

Logistic Regression
Both teams actually score when model predicts that both teams score: 0.63636

XGB Classifier
Both teams actually score when model predicts that both teams score: 0.63934



In [14]:
btts_count = merged_rf[((merged_rf["predicted_x"] == 1) & (merged_rf["predicted_y"] == 0)) | 
                      ((merged_rf["predicted_x"] == 0) & (merged_rf["predicted_y"] == 1))]
precision_btts = len(btts_count[(btts_count['actual_x'] == 1) & (btts_count['actual_y'] == 1)]) / len(btts_count)
print(f"Both teams actually score when model predicts that both teams score: {round(precision_btts, 5)}")
print("")


Both teams actually score when model predicts that both teams score: 0.55372



### For Above 2.5/3.5 Prediction

In [39]:
print("Random Forest Classifier")
above_count = merged_rf[(merged_rf["predicted_x"] == 1) & (merged_rf["predicted_y"] == 1)]
precision_above = len(above_count[above_count["actual_x"] == 1]) / len(above_count)
print(f"Total goals above 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

below_count = merged_rf[(merged_rf["predicted_x"] == 0) & (merged_rf["predicted_y"] == 0)]
precision_above = len(below_count[below_count["actual_x"] == 0]) / len(below_count)
print(f"Total goals below 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

print("Logistic Regression")
above_count = merged_logreg[(merged_logreg["predicted_x"] == 1) & (merged_logreg["predicted_y"] == 1)]
precision_above = len(above_count[above_count["actual_x"] == 1]) / len(above_count)
print(f"Total goals above 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

below_count = merged_logreg[(merged_logreg["predicted_x"] == 0) & (merged_logreg["predicted_y"] == 0)]
precision_above = len(below_count[below_count["actual_x"] == 0]) / len(below_count)
print(f"Total goals below 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

print("XGBoost Classifier")
above_count = merged_xgb[(merged_xgb["predicted_x"] == 1) & (merged_xgb["predicted_y"] == 1)]
precision_above = len(above_count[above_count["actual_x"] == 1]) / len(above_count)
print(f"Total goals above 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

below_count = merged_xgb[(merged_xgb["predicted_x"] == 0) & (merged_xgb["predicted_y"] == 0)]
precision_above = len(below_count[below_count["actual_x"] == 0]) / len(below_count)
print(f"Total goals below 2.5/3.5 when model predicts as such: {round(precision_above, 5)}")

Random Forest Classifier
Total goals above 2.5/3.5 when model predicts as such: 0.5969
Total goals below 2.5/3.5 when model predicts as such: 0.75
Logistic Regression
Total goals above 2.5/3.5 when model predicts as such: 0.61194
Total goals below 2.5/3.5 when model predicts as such: 0.25
XGBoost Classifier
Total goals above 2.5/3.5 when model predicts as such: 0.59524
Total goals below 2.5/3.5 when model predicts as such: 0.3


# Predicting Today's Game

In [43]:
side1 = 'Manchester City'
side2 = 'Leeds'

## Predicting Side 1

In [48]:
#Printing out the result
side1_pred = combined[(combined['new_team'] == side1) & (combined['new_opponent'] == side2)]
for index, row in side1_pred.iterrows():
    if row['predicted'] == 1:
        print(f"{side1} Wins!")
        break
    else:
        print(f"{side1} Loses!")
        break

Manchester City Wins!


## Predicting Side 2

In [49]:
#Printing out the result
side2_pred = combined[(combined['new_team'] == side2) & (combined['new_opponent'] == side1)]
for index, row in side2_pred.iterrows():
    if row['predicted'] == 1:
        print(f"{side2} Wins!")
        break
    else:
        print(f"{side2} Loses!")
        break

Leeds Loses!


In [91]:
#Predictors for match results
predictors = ["venue_code", "opp_code", "team_code"]
cols_form = ["gf", "ga", "sh", "sot", "dist", "form", "xg", "poss", "sota", "save%", "cs", "psxg", "cmp", "cmp%", "prgdist",
            "ast", "ppa", "prog", "sca", "gca", "tklw", "int", "tkl+int", "err", "succ", "succ%", "crdy", "fls", "won%"]
cols_avg = ["gf", "ga", "form", "xg", "xga", "poss", "cs"]
new_cols_form = [f"{c}_rolling_3" for c in cols_form]
new_cols_avg = [f"{c}_rolling_365" for c in cols_avg]

In [86]:
#Predictors for team to score
predictors = ["venue_code", "opp_code", "team_code"]
cols_form = ["gf", "sh", "sot", "dist", "form", "xg", "poss", "sota", "cmp", "cmp%", "prgdist", "ast", "ppa", 
             "prog", "sca", "gca", "succ", "succ%", "crdy", "fls", "won%"]
cols_avg = ["gf", "ga", "form", "xg", "xga", "poss", "cs"]
new_cols_form = [f"{c}_rolling_3" for c in cols_form]
new_cols_avg = [f"{c}_rolling_365" for c in cols_avg]

In [92]:
train = matches_rolling[matches_rolling["season"] != 2022]
test = matches_rolling[matches_rolling["season"] == 2022]
predictors = predictors + new_cols_form + new_cols_avg

xgb = XGBClassifier(booster = 'gbtree',
                   objective = 'binary:logitraw',
                   eta = 0.7728938824808032,
                   gamma = 2.4465720216663955,
                   max_depth = 13,
                   min_child_weight = 2.6132315739082834,
                   max_delta_step = 3,
                   subsample = 0.41452825994375864,
                   reg_lambda = 2.5427342332070166,
                   reg_alpha = 1.97083175662957,
                   tree_method = 'hist',
                   grow_policy = 'depthwise',
                   max_leaves = 6,
                   max_bin = 423)
xgb.fit(train[predictors], train['above_2.5'])

test_preds = xgb.predict(test[predictors])
test_combined = pd.DataFrame(dict(actual=test['above_2.5'], predicted = test_preds), index=test.index)
test_combined = test_combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
test_combined['new_team'] = test_combined['team'].map(mapping)
test_combined['new_opponent'] = test_combined['opponent'].map(mapping)
test_merged = test_combined.merge(test_combined, left_on = ["date", "new_team"], right_on=["date", "new_opponent"])

train_preds = xgb.predict(train[predictors])
train_combined = pd.DataFrame(dict(actual=train['above_2.5'], predicted = train_preds), index=train.index)
train_combined = train_combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
train_combined['new_team'] = train_combined['team'].map(mapping)
train_combined['new_opponent'] = train_combined['opponent'].map(mapping)
train_merged = train_combined.merge(train_combined, left_on = ["date", "new_team"], right_on=["date", "new_opponent"])

above_count = train_merged[(train_merged["predicted_x"] == 1) & (train_merged["predicted_y"] == 1)]
precision_train_above = len(above_count[above_count["actual_x"] == 1]) / len(above_count)

above_count = test_merged[(test_merged["predicted_x"] == 1) & (test_merged["predicted_y"] == 1)]
precision_test_above = len(above_count[above_count["actual_x"] == 1]) / len(above_count)

print(f"Train: {precision_train_above}")
print(f"Test: {precision_test_above}")
print(f"Avg1: {(precision_train_above + precision_test_above) / 2}")
print(f"Avg2: {0.8739837398373984}")

Train: 0.9600886917960089
Test: 0.7878787878787878
Avg1: 0.8739837398373984
Avg2: 0.8739837398373984
