In [1]:
import numpy as np

import pandas as pd

pd.set_option('display.max_columns', 50)

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score, classification_report

import xgboost
from xgboost import XGBClassifier

import hyperopt
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

import warnings

warnings.filterwarnings("ignore")

import pickle

In [2]:
match_df = pd.read_csv("matches.csv")

match_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,1.3,1.4,35.0,40096.0,Virgil van Dijk,4-3-3,Anthony Taylor,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,3.0,1.3,64.0,53145.0,Virgil van Dijk,4-3-3,Thomas Bramall,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,0.9,2.0,41.0,52214.0,Virgil van Dijk,4-3-3,John Brooks,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,2.5,0.7,63.0,50109.0,Trent Alexander-Arnold,4-3-3,Simon Hooper,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,2.5,0.6,65.0,31257.0,Andrew Robertson,4-3-3,Michael Oliver,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool


In [3]:
def clean_data(df):
    df["Date"] = pd.to_datetime(df["Date"]) #converting the date column to a datetime 
    new_df = df.sort_values(by="Date").reset_index(drop=True) 
    new_df["Opponent"] = new_df["Opponent"].replace({"Newcastle Utd": "Newcastle United", "Brighton":"Brighton and Hove Albion", 
                                         "Manchester Utd":"Manchester United", "West Ham": "West Ham United", 
                                         "Tottenham": "Tottenham Hotspur", "Wolves": "Wolverhampton Wanderers", 
                                         "Nott'ham Forest":"Nottingham Forest", "Sheffield Utd":"Sheffield United"})
    new_df["hour"] = new_df["Time"].str.replace(":.+", "", regex=True).astype("int")
    new_df["day_code"] = new_df["Date"].dt.dayofweek
    return new_df

In [4]:
cleaned_df = clean_data(match_df)

In [5]:
cleaned_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,W,3.0,2.0,Aston Villa,1.2,1.2,38.0,20051.0,Tom Cleverley,4-1-4-1,Mike Dean,Match Report,,13.0,7.0,17.9,0.0,0,0,2022,Watford,15,5
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Away,L,1.0,5.0,Manchester United,0.5,1.5,51.0,72732.0,Liam Cooper,4-1-4-1,Paul Tierney,Match Report,,10.0,3.0,21.3,0.0,0,0,2022,Leeds United,12,5
4,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5


In [6]:
cleaned_df["Team"].value_counts()

Arsenal                     96
Chelsea                     96
Newcastle United            96
Everton                     96
Manchester United           96
Wolverhampton Wanderers     96
Tottenham Hotspur           96
West Ham United             96
Brighton and Hove Albion    96
Crystal Palace              96
Aston Villa                 96
Liverpool                   96
Brentford                   95
Manchester City             95
Leeds United                76
Leicester City              76
Southampton                 76
Fulham                      58
Nottingham Forest           58
Burnley                     58
Bournemouth                 57
Norwich City                38
Watford                     38
Sheffield United            20
Luton Town                  19
Name: Team, dtype: int64

In [7]:
cleaned_df["Opponent"].value_counts()

Brighton and Hove Albion    96
Wolverhampton Wanderers     96
Aston Villa                 96
Manchester United           96
Newcastle United            96
Chelsea                     96
Tottenham Hotspur           96
Crystal Palace              96
Everton                     96
West Ham United             96
Liverpool                   96
Arsenal                     96
Manchester City             95
Brentford                   95
Southampton                 76
Leeds United                76
Leicester City              76
Burnley                     58
Fulham                      58
Nottingham Forest           58
Bournemouth                 57
Watford                     38
Norwich City                38
Sheffield United            20
Luton Town                  19
Name: Opponent, dtype: int64

In [8]:
cleaned_df['Venue'].value_counts()

Away    958
Home    958
Name: Venue, dtype: int64

In [9]:
encoder = LabelEncoder()
def add_predictors(df, cols):
    for col in cols:
        df[f"{col}_code"] = encoder.fit_transform(df[col])
    return df
    
    

In [10]:
cat_cols = ["Venue", "Team", "Opponent"]

In [11]:
encoded_df = add_predictors(cleaned_df, cat_cols)

In [12]:
encoded_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,W,3.0,2.0,Aston Villa,1.2,1.2,38.0,20051.0,Tom Cleverley,4-1-4-1,Mike Dean,Match Report,,13.0,7.0,17.9,0.0,0,0,2022,Watford,15,5,1,22,1
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Away,L,1.0,5.0,Manchester United,0.5,1.5,51.0,72732.0,Liam Cooper,4-1-4-1,Paul Tierney,Match Report,,10.0,3.0,21.3,0.0,0,0,2022,Leeds United,12,5,0,10,15
4,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17


In [13]:
groups_df = encoded_df.groupby("Team")

In [14]:
mancity_df = groups_df.get_group("Manchester City")

In [15]:
mancity_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
17,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham Hotspur,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,,18.0,4.0,17.3,1.0,0,0,2022,Manchester City,16,6,0,14,21
28,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,,16.0,4.0,18.5,1.0,0,0,2022,Manchester City,15,5,1,14,17
46,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,,25.0,10.0,14.8,0.0,0,0,2022,Manchester City,12,5,1,14,0
68,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,,25.0,8.0,14.3,0.0,0,0,2022,Manchester City,15,5,0,14,11
85,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,,16.0,1.0,16.4,1.0,0,0,2022,Manchester City,15,5,1,14,20


In [16]:
def rolling_averages(group, cols, new_cols):
    rolling_stats = group[cols].rolling(5, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.fillna(0)
    return group

In [17]:
cols = ["GF", "GA", "Poss", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{col}_rolling" for col in cols]

In [18]:
rolled_group = rolling_averages(mancity_df, cols, new_cols)

In [19]:
rolled_group

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
17,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham Hotspur,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,0.0,18.0,4.0,17.3,1.0,0,0,2022,Manchester City,16,6,0,14,21,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
28,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,0.0,16.0,4.0,18.5,1.0,0,0,2022,Manchester City,15,5,1,14,17,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
46,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,0.0,25.0,10.0,14.8,0.0,0,0,2022,Manchester City,12,5,1,14,0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
68,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,0.0,25.0,8.0,14.3,0.0,0,0,2022,Manchester City,15,5,0,14,11,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
85,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,0.0,16.0,1.0,16.4,1.0,0,0,2022,Manchester City,15,5,1,14,20,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1805,2023-12-06,20:15,Premier League,Matchweek 15,Wed,Away,L,0.0,1.0,Aston Villa,0.6,2.3,54.0,41421.0,Kyle Walker,3-2-4-1,John Brooks,Match Report,0.0,2.0,2.0,7.4,0.0,0,0,2024,Manchester City,20,2,0,14,1,3.4,1.8,58.8,17.6,7.0,15.98,0.8,0.4,0.4
1834,2023-12-10,14:00,Premier League,Matchweek 16,Sun,Away,W,2.0,1.0,Luton Town,1.6,0.3,65.0,11047.0,Kyle Walker,4-2-3-1,Tim Robinson,Match Report,0.0,18.0,6.0,18.7,1.0,0,0,2024,Manchester City,14,6,0,14,13,2.8,2.0,57.6,14.0,5.6,14.02,0.4,0.2,0.2
1843,2023-12-16,15:00,Premier League,Matchweek 17,Sat,Home,D,2.0,2.0,Crystal Palace,1.6,1.8,74.0,53248.0,Kyle Walker,4-2-3-1,Paul Tierney,Match Report,0.0,19.0,9.0,18.2,0.0,0,0,2024,Manchester City,15,5,1,14,7,2.0,2.0,57.6,13.4,5.2,14.48,0.6,0.2,0.2
1888,2023-12-27,20:15,Premier League,Matchweek 19,Wed,Away,W,3.0,1.0,Everton,2.4,1.0,72.0,39327.0,Kyle Walker,4-2-3-1,John Brooks,Match Report,0.0,22.0,8.0,19.5,1.0,1,1,2024,Manchester City,20,2,0,14,8,1.6,1.6,61.4,14.4,5.2,14.80,0.4,0.0,0.0


In [20]:
matches_rolling = encoded_df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [21]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,W,3.0,2.0,Aston Villa,1.2,1.2,38.0,20051.0,Tom Cleverley,4-1-4-1,Mike Dean,Match Report,0.0,13.0,7.0,17.9,0.0,0,0,2022,Watford,15,5,1,22,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Away,L,1.0,5.0,Manchester United,0.5,1.5,51.0,72732.0,Liam Cooper,4-1-4-1,Paul Tierney,Match Report,0.0,10.0,3.0,21.3,0.0,0,0,2022,Leeds United,12,5,0,10,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,0.0,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
matches_rolling["Target"] = matches_rolling["Result"].astype("category").cat.codes

In [23]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,Target
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,W,3.0,2.0,Aston Villa,1.2,1.2,38.0,20051.0,Tom Cleverley,4-1-4-1,Mike Dean,Match Report,0.0,13.0,7.0,17.9,0.0,0,0,2022,Watford,15,5,1,22,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Away,L,1.0,5.0,Manchester United,0.5,1.5,51.0,72732.0,Liam Cooper,4-1-4-1,Paul Tierney,Match Report,0.0,10.0,3.0,21.3,0.0,0,0,2022,Leeds United,12,5,0,10,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,0.0,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [24]:
matches_rolling["Target"].value_counts()

1    746
2    746
0    424
Name: Target, dtype: int64

In [25]:
predictors = ["Venue_code", "Team_code", "Opponent_code"] + new_cols
predictors

['day_code',
 'Venue_code',
 'Team_code',
 'Opponent_code',
 'GF_rolling',
 'GA_rolling',
 'Poss_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [26]:
matches_rolling[matches_rolling["Season"]==2024]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,Target
1520,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,1.9,0.3,65.0,21572.0,Kevin De Bruyne,4-2-3-1,Craig Pawson,Match Report,0.0,17.0,8.0,13.9,0.0,0,0,2024,Manchester City,20,4,0,14,5,1.4,0.6,66.4,14.2,3.6,17.50,0.8,0.0,0.2,2
1521,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Home,L,0.0,3.0,Manchester City,0.3,1.9,35.0,21572.0,Josh Cullen,5-4-1,Craig Pawson,Match Report,0.0,6.0,1.0,14.9,0.0,0,0,2024,Burnley,20,4,1,5,14,1.0,1.6,43.0,12.0,4.0,16.12,0.4,0.2,0.2,1
1522,2023-08-12,15:00,Premier League,Matchweek 1,Sat,Away,W,1.0,0.0,Sheffield United,1.9,0.5,68.0,31194.0,Joel Ward,4-2-3-1,John Brooks,Match Report,0.0,24.0,8.0,18.5,1.0,0,0,2024,Crystal Palace,15,5,0,7,19,1.8,1.4,55.8,13.0,4.0,17.02,0.8,0.2,0.2,2
1523,2023-08-12,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,4.0,Brighton and Hove Albion,1.5,4.0,30.0,31872.0,Tom Lockyer,3-5-2,David Coote,Match Report,0.0,8.0,2.0,16.6,0.0,1,1,2024,Luton Town,15,5,0,13,4,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,1
1524,2023-08-12,15:00,Premier League,Matchweek 1,Sat,Home,L,0.0,1.0,Fulham,2.7,1.5,42.0,39940.0,James Tarkowski,4-4-1-1,Stuart Attwell,Match Report,0.0,19.0,9.0,15.8,0.0,0,0,2024,Everton,15,5,1,8,9,1.8,1.4,38.2,14.2,4.8,13.92,0.4,0.2,0.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1911,2023-12-31,14:00,Premier League,Matchweek 20,Sun,Home,W,3.0,1.0,Bournemouth,2.3,1.7,50.0,61780.0,Son Heung-min,4-3-3,Simon Hooper,Match Report,0.0,12.0,6.0,14.7,0.0,0,0,2024,Tottenham Hotspur,14,6,1,21,2,2.2,1.6,62.4,17.8,6.2,15.08,0.4,0.2,0.2,2
1912,2024-01-01,20:00,Premier League,Matchweek 20,Mon,Home,W,4.0,2.0,Newcastle United,7.0,0.6,61.0,0.0,Virgil van Dijk,4-3-3,Anthony Taylor,Match Report,0.0,32.0,12.0,16.2,0.0,1,2,2024,Liverpool,20,0,1,12,16,1.4,0.4,67.8,19.0,6.2,17.80,0.2,0.0,0.0,2
1913,2024-01-01,20:00,Premier League,Matchweek 20,Mon,Away,L,2.0,4.0,Liverpool,0.6,7.0,39.0,0.0,Dan Burn,4-3-3,Anthony Taylor,Match Report,0.0,5.0,3.0,16.1,0.0,0,0,2024,Newcastle United,20,0,0,16,12,1.0,2.2,60.6,16.4,4.6,15.08,0.4,0.2,0.2,1
1914,2024-01-02,19:30,Premier League,Matchweek 20,Tue,Away,D,0.0,0.0,West Ham United,2.3,0.7,68.0,62462.0,Pascal Groß,4-3-1-2,Samuel Barrott,Match Report,0.0,22.0,8.0,14.4,0.0,0,0,2024,Brighton and Hove Albion,19,1,0,4,23,1.6,1.4,59.0,16.8,6.4,16.00,0.4,0.4,0.4,0


In [27]:
X_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"][predictors]
X_test =  matches_rolling[matches_rolling["Date"] >= "2023-10-30"][predictors]
y_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"]["Target"]
y_test = matches_rolling[matches_rolling["Date"] >= "2023-10-30"]["Target"]

In [28]:
X_train.shape

(1720, 13)

In [29]:
scaler = StandardScaler()
scaledX_train = scaler.fit_transform(X_train)
scaledX_test = scaler.transform(X_test)

In [30]:
def make_predictions(model):
    model.fit(scaledX_train, y_train)
    train_pred = model.predict(scaledX_train)
    test_pred = model.predict(scaledX_test)
    train_score = accuracy_score(y_train, train_pred)
    test_score = accuracy_score(y_test, test_pred)
    report = classification_report(y_test, test_pred)
    return train_score, test_score, report

In [31]:
forest = RandomForestClassifier(random_state=1)

In [32]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [33]:
forest_train_score, forest_test_score, forest_report = make_predictions(forest)

In [34]:
forest_train_score

1.0

In [35]:
forest_test_score

0.4642857142857143

In [36]:
print(forest_report)

              precision    recall  f1-score   support

           0       0.10      0.06      0.08        32
           1       0.53      0.57      0.55        82
           2       0.49      0.51      0.50        82

    accuracy                           0.46       196
   macro avg       0.37      0.38      0.38       196
weighted avg       0.44      0.46      0.45       196



In [37]:
param_dist ={'n_estimators': range(80, 500, 20), 
             'max_depth': range(0, 12, 1), 
             'min_samples_split': range(0, 30, 5), 
             'max_features': ['auto', 'sqrt', 'log2', None], 
             "bootstrap":[True, False]}

kfold_forest = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [38]:
random_search = RandomizedSearchCV(
    forest,
    param_distributions=param_dist,
    n_iter=50, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=6
)

random_search.fit(scaledX_train, y_train)

In [39]:
random_search.best_params_

{'n_estimators': 480,
 'min_samples_split': 25,
 'max_features': 'auto',
 'max_depth': 7,
 'bootstrap': False}

In [40]:
tuned_forest = random_search.best_estimator_

In [41]:
tforest_train_score, tforest_test_score, tforest_report = make_predictions(tuned_forest)

In [42]:
print(tforest_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.54      0.68      0.61        82
           2       0.53      0.59      0.56        82

    accuracy                           0.53       196
   macro avg       0.36      0.42      0.39       196
weighted avg       0.45      0.53      0.49       196



In [43]:
tforest_train_score

0.641860465116279

In [49]:
forest_space = {
    "n_estimators": hp.choice("n_eatimators", range(80, 500, 20)),
    "max_depth": hp.choice("max_depth", range(0, 12, 1)),
    "min_samples_split": hp.choice("min_samples_split", range(1, 30, 5)),
    "max_features": hp.choice("max_features", ['auto', 'sqrt', 'log2', None]),
    "bootstrap": hp.choice("bootstrap", [True, False])
}

In [50]:
def objective_forest(params_f):
    score_forest = cross_val_score(forest, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss_forest = -score_forest
    
    return {"loss":loss_forest, "params":params_f, "status":STATUS_OK}

In [51]:
best_forest = fmin(fn=objective_forest, space=forest_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|██████████████████████████████████████████████| 50/50 [00:32<00:00,  1.53trial/s, best loss: -0.47500167223063405]


In [52]:
bo_forest = RandomForestClassifier(random_state = 30,
                        n_estimators=space_eval(forest_space, best_forest)['n_estimators'],
                        min_samples_split=space_eval(forest_space, best_forest)['min_samples_split'],
                        max_features=space_eval(forest_space, best_forest)['max_features'],
                        max_depth=space_eval(forest_space, best_forest)['max_depth'],
                        bootstrap=space_eval(forest_space, best_forest)['bootstrap'])

In [53]:
boforest_train_score, boforest_test_score, boforest_report = make_predictions(bo_forest)

In [54]:
print(boforest_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.51      0.55      0.53        82
           2       0.51      0.59      0.54        82

    accuracy                           0.47       196
   macro avg       0.34      0.38      0.36       196
weighted avg       0.43      0.47      0.45       196



In [55]:
boost = XGBClassifier(random_state = 10)

In [56]:
boost_train_score, boost_test_score, boost_report = make_predictions(boost)
boost_test_score

0.42857142857142855

In [57]:
print(boost_report)

              precision    recall  f1-score   support

           0       0.20      0.25      0.22        32
           1       0.49      0.51      0.50        82
           2       0.49      0.41      0.45        82

    accuracy                           0.43       196
   macro avg       0.39      0.39      0.39       196
weighted avg       0.44      0.43      0.43       196



In [58]:
params = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    "max_depth": range(2, 21, 3),
    "gamma": [i/10.0 for i in range(0,5)],
    "colsample_bytree": [i/10.0 for i in range(3,10)],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100],
    "reg_lamda":[1e-5, 1e-2, 0.1, 1, 10, 100]
}

In [59]:
random_search_boost = RandomizedSearchCV(
    boost,
    param_distributions=params,
    n_iter=48, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=7
)

random_search_boost.fit(scaledX_train, y_train)

Parameters: { "reg_lamda" } are not used.



In [60]:
random_search_boost.best_params_

{'reg_lamda': 1e-05,
 'reg_alpha': 10,
 'max_depth': 20,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bytree': 0.3}

In [61]:
tuned_boost = random_search_boost.best_estimator_

In [62]:
tboost_train_score, tboost_test_score, tboost_report = make_predictions(tuned_boost)

Parameters: { "reg_lamda" } are not used.



In [63]:
print(tboost_report)

              precision    recall  f1-score   support

           0       0.33      0.03      0.06        32
           1       0.55      0.68      0.61        82
           2       0.53      0.59      0.55        82

    accuracy                           0.54       196
   macro avg       0.47      0.43      0.41       196
weighted avg       0.50      0.54      0.50       196



In [64]:
boost_space = {
    "learning_rate": hp.choice("learning_rate", [0.0001, 0.001, 0.01, 0.1, 1]),
    "max_depth": hp.choice("max_depth", range(2, 21, 3)),
    "gamma": hp.choice("gamma", [i/10.0 for i in range(0,5)]),
    "colsample_bytree": hp.choice("colsample_bytree", [i/10.0 for i in range(3,10)]),
    "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-2, 0.1, 1, 10, 100]),
    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1, 1, 10, 100])
}

In [65]:
def objective_boost(params):
    score_boost = cross_val_score(boost, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss = -score_boost
    
    return {"loss":loss, "params":params, "status":STATUS_OK}

In [66]:
best_boost = fmin(fn=objective_boost, space=boost_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|██████████████████████████████████████████████| 50/50 [00:38<00:00,  1.30trial/s, best loss: -0.46627972263267886]


In [67]:
bo_boost = XGBClassifier(random_state = 20,
                        colsample_bytree=space_eval(boost_space, best_boost)['colsample_bytree'],
                        gamma=space_eval(boost_space, best_boost)['gamma'],
                        learning_rate=space_eval(boost_space, best_boost)['learning_rate'],
                        max_depth=space_eval(boost_space, best_boost)['max_depth'],
                        reg_alpha=space_eval(boost_space, best_boost)['reg_alpha'],
                        reg_lambda=space_eval(boost_space, best_boost)['reg_lambda'])

In [68]:
boboost_train_score, boboost_test_score, boboost_report = make_predictions(bo_boost)

In [69]:
print(boboost_report)

              precision    recall  f1-score   support

           0       0.14      0.12      0.13        32
           1       0.48      0.52      0.50        82
           2       0.47      0.45      0.46        82

    accuracy                           0.43       196
   macro avg       0.37      0.37      0.37       196
weighted avg       0.42      0.43      0.43       196



In [70]:
scores_df = pd.DataFrame([["Random Forest (random search tuned)", tforest_train_score, tforest_test_score],
                         ["xgboost (random search tuned)", tboost_train_score, tboost_test_score],
                         ["xgboost (bayesian opt)", boboost_train_score, boboost_test_score]], 
                         columns = ["Model", "Training score", "Validation score"])

In [71]:
scores_df

Unnamed: 0,Model,Training score,Validation score
0,Random Forest (random search tuned),0.64186,0.530612
1,xgboost (random search tuned),0.573837,0.535714
2,xgboost (bayesian opt),0.99593,0.428571


In [85]:
# Specify the file path where you want to save the model
save_path = "model.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(tuned_forest, file)      #the random forest was saved even though the xgboost performed a little better than it because xgboost requires installing more dependencies in the production environment
    

print("Model saved as pickle file.")

Model saved as pickle file.


In [86]:
# Specify the file path where you want to save the model
save_path = "scaler.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(scaler, file)

print("Scaler saved as pickle file.")

Scaler saved as pickle file.


In [87]:
# Specify the file path where you want to save the model
save_path = "encoder.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(encoder, file)

print("Encoder saved as pickle file.")

Encoder saved as pickle file.
