In [1]:
import numpy as np

import pandas as pd

pd.set_option('display.max_columns', 50)

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score, classification_report

import xgboost
from xgboost import XGBClassifier

import hyperopt
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

import warnings

warnings.filterwarnings("ignore")

import pickle

In [3]:
match_df = pd.read_csv("datasets/matches.csv")

match_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,1.3,1.4,35.0,40096.0,Virgil van Dijk,4-3-3,Anthony Taylor,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,3.0,1.3,64.0,53145.0,Virgil van Dijk,4-3-3,Thomas Bramall,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,0.9,2.0,41.0,52214.0,Virgil van Dijk,4-3-3,John Brooks,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,2.5,0.7,63.0,50109.0,Trent Alexander-Arnold,4-3-3,Simon Hooper,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,2.5,0.6,65.0,31257.0,Andrew Robertson,4-3-3,Michael Oliver,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool


In [4]:
def clean_data(df):
    df["Date"] = pd.to_datetime(df["Date"]) #converting the date column to a datetime 
    new_df = df.sort_values(by="Date").reset_index(drop=True) 
    new_df["Opponent"] = new_df["Opponent"].replace({"Newcastle Utd": "Newcastle United", "Brighton":"Brighton and Hove Albion", 
                                         "Manchester Utd":"Manchester United", "West Ham": "West Ham United", 
                                         "Tottenham": "Tottenham Hotspur", "Wolves": "Wolverhampton Wanderers", 
                                         "Nott'ham Forest":"Nottingham Forest", "Sheffield Utd":"Sheffield United"})
    new_df["hour"] = new_df["Time"].str.replace(":.+", "", regex=True).astype("int")
    new_df["day_code"] = new_df["Date"].dt.dayofweek
    return new_df

In [5]:
cleaned_df = clean_data(match_df)

In [6]:
cleaned_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4
2,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5
3,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,W,2.0,1.0,Burnley,1.0,1.5,63.0,16910.0,Lewis Dunk,4-1-4-1,David Coote,Match Report,,14.0,6.0,14.9,1.0,0,0,2022,Brighton and Hove Albion,15,5
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,2.0,3.0,Watford,1.2,1.2,62.0,20051.0,Tyrone Mings,4-2-3-1,Mike Dean,Match Report,,10.0,1.0,22.8,1.0,1,1,2022,Aston Villa,15,5


In [7]:
cleaned_df["Team"].value_counts()

Everton                     97
West Ham United             97
Liverpool                   97
Brighton and Hove Albion    97
Aston Villa                 97
Manchester United           97
Tottenham Hotspur           97
Chelsea                     97
Wolverhampton Wanderers     97
Crystal Palace              97
Newcastle United            97
Arsenal                     97
Manchester City             96
Brentford                   96
Leicester City              76
Leeds United                76
Southampton                 76
Burnley                     59
Nottingham Forest           59
Fulham                      59
Bournemouth                 58
Watford                     38
Norwich City                38
Sheffield United            21
Luton Town                  20
Name: Team, dtype: int64

In [8]:
cleaned_df["Opponent"].value_counts()

Arsenal                     97
Aston Villa                 97
West Ham United             97
Tottenham Hotspur           97
Newcastle United            97
Liverpool                   97
Crystal Palace              97
Wolverhampton Wanderers     97
Chelsea                     97
Everton                     97
Brighton and Hove Albion    97
Manchester United           97
Manchester City             96
Brentford                   96
Southampton                 76
Leicester City              76
Leeds United                76
Burnley                     59
Nottingham Forest           59
Fulham                      59
Bournemouth                 58
Watford                     38
Norwich City                38
Sheffield United            21
Luton Town                  20
Name: Opponent, dtype: int64

In [9]:
cleaned_df['Venue'].value_counts()

Home    968
Away    968
Name: Venue, dtype: int64

In [10]:
encoder = LabelEncoder()
def add_predictors(df, cols):
    for col in cols:
        df[f"{col}_code"] = encoder.fit_transform(df[col])
    return df
    
    

In [11]:
cat_cols = ["Venue", "Team", "Opponent"]

In [12]:
encoded_df = add_predictors(cleaned_df, cat_cols)

In [13]:
encoded_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3
2,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17
3,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,W,2.0,1.0,Burnley,1.0,1.5,63.0,16910.0,Lewis Dunk,4-1-4-1,David Coote,Match Report,,14.0,6.0,14.9,1.0,0,0,2022,Brighton and Hove Albion,15,5,0,4,5
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,2.0,3.0,Watford,1.2,1.2,62.0,20051.0,Tyrone Mings,4-2-3-1,Mike Dean,Match Report,,10.0,1.0,22.8,1.0,1,1,2022,Aston Villa,15,5,0,1,22


In [14]:
groups_df = encoded_df.groupby("Team")

In [15]:
mancity_df = groups_df.get_group("Manchester City")

In [16]:
mancity_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
17,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham Hotspur,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,,18.0,4.0,17.3,1.0,0,0,2022,Manchester City,16,6,0,14,21
30,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,,16.0,4.0,18.5,1.0,0,0,2022,Manchester City,15,5,1,14,17
48,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,,25.0,10.0,14.8,0.0,0,0,2022,Manchester City,12,5,1,14,0
66,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,,25.0,8.0,14.3,0.0,0,0,2022,Manchester City,15,5,0,14,11
93,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,,16.0,1.0,16.4,1.0,0,0,2022,Manchester City,15,5,1,14,20


In [17]:
def rolling_averages(group, cols, new_cols):
    rolling_stats = group[cols].rolling(5, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.fillna(0)
    return group

In [18]:
cols = ["GF", "GA", "Poss", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{col}_rolling" for col in cols]

In [19]:
rolled_group = rolling_averages(mancity_df, cols, new_cols)

In [20]:
rolled_group

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
17,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham Hotspur,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,0.0,18.0,4.0,17.3,1.0,0,0,2022,Manchester City,16,6,0,14,21,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
30,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,0.0,16.0,4.0,18.5,1.0,0,0,2022,Manchester City,15,5,1,14,17,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
48,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,0.0,25.0,10.0,14.8,0.0,0,0,2022,Manchester City,12,5,1,14,0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
66,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,0.0,25.0,8.0,14.3,0.0,0,0,2022,Manchester City,15,5,0,14,11,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
93,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,0.0,16.0,1.0,16.4,1.0,0,0,2022,Manchester City,15,5,1,14,20,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1832,2023-12-10,14:00,Premier League,Matchweek 16,Sun,Away,W,2.0,1.0,Luton Town,1.6,0.3,65.0,11047.0,Kyle Walker,4-2-3-1,Tim Robinson,Match Report,0.0,18.0,6.0,18.7,1.0,0,0,2024,Manchester City,14,6,0,14,13,2.8,2.0,57.6,14.0,5.6,14.02,0.4,0.2,0.2
1849,2023-12-16,15:00,Premier League,Matchweek 17,Sat,Home,D,2.0,2.0,Crystal Palace,1.6,1.8,74.0,53248.0,Kyle Walker,4-2-3-1,Paul Tierney,Match Report,0.0,19.0,9.0,18.2,0.0,0,0,2024,Manchester City,15,5,1,14,7,2.0,2.0,57.6,13.4,5.2,14.48,0.6,0.2,0.2
1890,2023-12-27,20:15,Premier League,Matchweek 19,Wed,Away,W,3.0,1.0,Everton,2.4,1.0,72.0,39327.0,Kyle Walker,4-2-3-1,John Brooks,Match Report,0.0,22.0,8.0,19.5,1.0,1,1,2024,Manchester City,20,2,0,14,8,1.6,1.6,61.4,14.4,5.2,14.80,0.4,0.0,0.0
1899,2023-12-30,15:00,Premier League,Matchweek 20,Sat,Home,W,2.0,0.0,Sheffield United,2.2,0.3,80.0,53108.0,Kyle Walker,4-2-3-1,David Coote,Match Report,0.0,18.0,4.0,17.4,0.0,0,0,2024,Manchester City,15,5,1,14,19,2.0,1.6,64.0,15.6,5.8,15.44,0.4,0.2,0.2


In [21]:
matches_rolling = encoded_df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [22]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,0.0,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,W,2.0,1.0,Burnley,1.0,1.5,63.0,16910.0,Lewis Dunk,4-1-4-1,David Coote,Match Report,0.0,14.0,6.0,14.9,1.0,0,0,2022,Brighton and Hove Albion,15,5,0,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,2.0,3.0,Watford,1.2,1.2,62.0,20051.0,Tyrone Mings,4-2-3-1,Mike Dean,Match Report,0.0,10.0,1.0,22.8,1.0,1,1,2022,Aston Villa,15,5,0,1,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
matches_rolling["Target"] = matches_rolling["Result"].astype("category").cat.codes

In [24]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,Target
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2021-08-14,17:30,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Norwich City,1.6,1.4,50.0,27023.0,James Milner,4-3-3,Andre Marriner,Match Report,0.0,19.0,6.0,17.3,1.0,0,0,2022,Liverpool,17,5,0,12,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,W,2.0,1.0,Burnley,1.0,1.5,63.0,16910.0,Lewis Dunk,4-1-4-1,David Coote,Match Report,0.0,14.0,6.0,14.9,1.0,0,0,2022,Brighton and Hove Albion,15,5,0,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,2.0,3.0,Watford,1.2,1.2,62.0,20051.0,Tyrone Mings,4-2-3-1,Mike Dean,Match Report,0.0,10.0,1.0,22.8,1.0,1,1,2022,Aston Villa,15,5,0,1,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [25]:
matches_rolling["Target"].value_counts()

2    751
1    751
0    434
Name: Target, dtype: int64

In [26]:
predictors = ["Venue_code", "Team_code", "Opponent_code"] + new_cols
predictors

['Venue_code',
 'Team_code',
 'Opponent_code',
 'GF_rolling',
 'GA_rolling',
 'Poss_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [27]:
matches_rolling[matches_rolling["Season"]==2024]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,Target
1520,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,1.9,0.3,65.0,21572.0,Kevin De Bruyne,4-2-3-1,Craig Pawson,Match Report,0.0,17.0,8.0,13.9,0.0,0,0,2024,Manchester City,20,4,0,14,5,1.4,0.6,66.4,14.2,3.6,17.50,0.8,0.0,0.2,2
1521,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Home,L,0.0,3.0,Manchester City,0.3,1.9,35.0,21572.0,Josh Cullen,5-4-1,Craig Pawson,Match Report,0.0,6.0,1.0,14.9,0.0,0,0,2024,Burnley,20,4,1,5,14,1.0,1.6,43.0,12.0,4.0,16.12,0.4,0.2,0.2,1
1522,2023-08-12,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,4.0,Brighton and Hove Albion,1.5,4.0,30.0,31872.0,Tom Lockyer,3-5-2,David Coote,Match Report,0.0,8.0,2.0,16.6,0.0,1,1,2024,Luton Town,15,5,0,13,4,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,1
1523,2023-08-12,15:00,Premier League,Matchweek 1,Sat,Home,L,0.0,1.0,Fulham,2.7,1.5,42.0,39940.0,James Tarkowski,4-4-1-1,Stuart Attwell,Match Report,0.0,19.0,9.0,15.8,0.0,0,0,2024,Everton,15,5,1,8,9,1.8,1.4,38.2,14.2,4.8,13.92,0.4,0.2,0.2,1
1524,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Away,L,1.0,2.0,Arsenal,1.2,0.8,22.0,59984.0,Joe Worrall,3-4-3,Michael Oliver,Match Report,0.0,6.0,2.0,17.3,0.0,0,0,2024,Nottingham Forest,12,5,0,18,0,1.8,1.6,29.2,7.4,2.8,14.86,0.0,0.2,0.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,2024-01-21,16:30,Premier League,Matchweek 21,Sun,Home,L,0.0,4.0,Liverpool,1.4,1.5,40.0,11228.0,Neto,4-2-3-1,Andy Madley,Match Report,0.0,11.0,1.0,19.8,0.0,0,0,2024,Bournemouth,16,6,1,2,12,2.4,1.0,46.4,15.6,4.2,15.16,0.4,0.2,0.2,1
1932,2024-01-21,14:00,Premier League,Matchweek 21,Sun,Home,D,2.0,2.0,West Ham United,2.5,2.3,57.0,29164.0,Anel Ahmedhodžić,4-1-4-1,Michael Salisbury,Match Report,0.0,20.0,5.0,18.5,1.0,1,1,2024,Sheffield United,14,6,1,19,23,0.8,1.6,31.4,9.0,2.6,18.46,0.4,0.0,0.0,0
1933,2024-01-21,16:30,Premier League,Matchweek 21,Sun,Away,W,4.0,0.0,Bournemouth,1.5,1.4,60.0,11228.0,Virgil van Dijk,4-3-3,Andy Madley,Match Report,0.0,14.0,7.0,19.2,0.0,0,0,2024,Liverpool,16,6,0,12,2,1.8,0.8,65.0,22.4,7.0,17.76,0.2,0.2,0.4,2
1934,2024-01-22,19:45,Premier League,Matchweek 21,Mon,Home,D,0.0,0.0,Wolverhampton Wanderers,1.0,1.1,72.0,41505.0,Lewis Dunk,4-3-3,Craig Pawson,Match Report,0.0,11.0,3.0,14.2,0.0,0,0,2024,Brighton and Hove Albion,19,0,1,4,24,1.2,1.2,60.2,17.6,6.6,15.66,0.4,0.4,0.4,0


In [28]:
X_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"][predictors]
X_test =  matches_rolling[matches_rolling["Date"] >= "2023-10-30"][predictors]
y_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"]["Target"]
y_test = matches_rolling[matches_rolling["Date"] >= "2023-10-30"]["Target"]

In [29]:
X_train.shape

(1720, 12)

In [30]:
scaler = StandardScaler()
scaledX_train = scaler.fit_transform(X_train)
scaledX_test = scaler.transform(X_test)

In [31]:
def make_predictions(model):
    model.fit(scaledX_train, y_train)
    train_pred = model.predict(scaledX_train)
    test_pred = model.predict(scaledX_test)
    train_score = accuracy_score(y_train, train_pred)
    test_score = accuracy_score(y_test, test_pred)
    report = classification_report(y_test, test_pred)
    return train_score, test_score, report

In [32]:
forest = RandomForestClassifier(random_state=1)

In [33]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [34]:
forest_train_score, forest_test_score, forest_report = make_predictions(forest)

In [35]:
forest_train_score

1.0

In [36]:
forest_test_score

0.41203703703703703

In [37]:
print(forest_report)

              precision    recall  f1-score   support

           0       0.07      0.05      0.06        42
           1       0.46      0.53      0.49        87
           2       0.46      0.47      0.47        87

    accuracy                           0.41       216
   macro avg       0.33      0.35      0.34       216
weighted avg       0.39      0.41      0.40       216



In [38]:
param_dist ={'n_estimators': range(80, 500, 20), 
             'max_depth': range(0, 12, 1), 
             'min_samples_split': range(0, 30, 5), 
             'max_features': ['auto', 'sqrt', 'log2', None], 
             "bootstrap":[True, False]}

kfold_forest = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [39]:
random_search = RandomizedSearchCV(
    forest,
    param_distributions=param_dist,
    n_iter=50, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=6
)

random_search.fit(scaledX_train, y_train)

In [40]:
random_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 20,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': True}

In [41]:
tuned_forest = random_search.best_estimator_

In [42]:
tforest_train_score, tforest_test_score, tforest_report = make_predictions(tuned_forest)

In [43]:
print(tforest_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.52      0.67      0.58        87
           2       0.51      0.61      0.56        87

    accuracy                           0.51       216
   macro avg       0.34      0.43      0.38       216
weighted avg       0.42      0.51      0.46       216



In [44]:
tforest_train_score

0.65

In [45]:
forest_space = {
    "n_estimators": hp.choice("n_eatimators", range(80, 500, 20)),
    "max_depth": hp.choice("max_depth", range(0, 12, 1)),
    "min_samples_split": hp.choice("min_samples_split", range(1, 30, 5)),
    "max_features": hp.choice("max_features", ['auto', 'sqrt', 'log2', None]),
    "bootstrap": hp.choice("bootstrap", [True, False])
}

In [46]:
def objective_forest(params_f):
    score_forest = cross_val_score(forest, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss_forest = -score_forest
    
    return {"loss":loss_forest, "params":params_f, "status":STATUS_OK}

In [47]:
best_forest = fmin(fn=objective_forest, space=forest_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|███████████████████████████████████████████████| 50/50 [00:35<00:00,  1.41trial/s, best loss: -0.4802433551635441]


In [48]:
bo_forest = RandomForestClassifier(random_state = 30,
                        n_estimators=space_eval(forest_space, best_forest)['n_estimators'],
                        min_samples_split=space_eval(forest_space, best_forest)['min_samples_split'],
                        max_features=space_eval(forest_space, best_forest)['max_features'],
                        max_depth=space_eval(forest_space, best_forest)['max_depth'],
                        bootstrap=space_eval(forest_space, best_forest)['bootstrap'])

In [49]:
boforest_train_score, boforest_test_score, boforest_report = make_predictions(bo_forest)

In [50]:
print(boforest_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.49      0.56      0.52        87
           2       0.50      0.59      0.54        87

    accuracy                           0.46       216
   macro avg       0.33      0.38      0.35       216
weighted avg       0.40      0.46      0.43       216



In [51]:
boost = XGBClassifier(random_state = 10)

In [52]:
boost_train_score, boost_test_score, boost_report = make_predictions(boost)
boost_test_score

0.4074074074074074

In [53]:
print(boost_report)

              precision    recall  f1-score   support

           0       0.15      0.14      0.15        42
           1       0.47      0.51      0.49        87
           2       0.46      0.44      0.45        87

    accuracy                           0.41       216
   macro avg       0.36      0.36      0.36       216
weighted avg       0.40      0.41      0.41       216



In [54]:
params = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    "max_depth": range(2, 21, 3),
    "gamma": [i/10.0 for i in range(0,5)],
    "colsample_bytree": [i/10.0 for i in range(3,10)],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100],
    "reg_lamda":[1e-5, 1e-2, 0.1, 1, 10, 100]
}

In [55]:
random_search_boost = RandomizedSearchCV(
    boost,
    param_distributions=params,
    n_iter=48, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=7
)

random_search_boost.fit(scaledX_train, y_train)

Parameters: { "reg_lamda" } are not used.



In [56]:
random_search_boost.best_params_

{'reg_lamda': 0.1,
 'reg_alpha': 0.1,
 'max_depth': 14,
 'learning_rate': 0.01,
 'gamma': 0.3,
 'colsample_bytree': 0.4}

In [57]:
tuned_boost = random_search_boost.best_estimator_

In [58]:
tboost_train_score, tboost_test_score, tboost_report = make_predictions(tuned_boost)

Parameters: { "reg_lamda" } are not used.



In [59]:
print(tboost_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.50      0.63      0.56        87
           2       0.48      0.54      0.51        87

    accuracy                           0.47       216
   macro avg       0.33      0.39      0.36       216
weighted avg       0.40      0.47      0.43       216



In [60]:
boost_space = {
    "learning_rate": hp.choice("learning_rate", [0.0001, 0.001, 0.01, 0.1, 1]),
    "max_depth": hp.choice("max_depth", range(2, 21, 3)),
    "gamma": hp.choice("gamma", [i/10.0 for i in range(0,5)]),
    "colsample_bytree": hp.choice("colsample_bytree", [i/10.0 for i in range(3,10)]),
    "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-2, 0.1, 1, 10, 100]),
    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1, 1, 10, 100])
}

In [61]:
def objective_boost(params):
    score_boost = cross_val_score(boost, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss = -score_boost
    
    return {"loss":loss, "params":params, "status":STATUS_OK}

In [62]:
best_boost = fmin(fn=objective_boost, space=boost_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|███████████████████████████████████████████████| 50/50 [00:40<00:00,  1.23trial/s, best loss: -0.4657000160128752]


In [63]:
bo_boost = XGBClassifier(random_state = 20,
                        colsample_bytree=space_eval(boost_space, best_boost)['colsample_bytree'],
                        gamma=space_eval(boost_space, best_boost)['gamma'],
                        learning_rate=space_eval(boost_space, best_boost)['learning_rate'],
                        max_depth=space_eval(boost_space, best_boost)['max_depth'],
                        reg_alpha=space_eval(boost_space, best_boost)['reg_alpha'],
                        reg_lambda=space_eval(boost_space, best_boost)['reg_lambda'])

In [64]:
boboost_train_score, boboost_test_score, boboost_report = make_predictions(bo_boost)

In [65]:
print(boboost_report)

              precision    recall  f1-score   support

           0       0.18      0.21      0.20        42
           1       0.45      0.45      0.45        87
           2       0.45      0.41      0.43        87

    accuracy                           0.39       216
   macro avg       0.36      0.36      0.36       216
weighted avg       0.40      0.39      0.39       216



In [66]:
scores_df = pd.DataFrame([["Random Forest (random search tuned)", tforest_train_score, tforest_test_score],
                         ["xgboost (random search tuned)", tboost_train_score, tboost_test_score],
                         ["xgboost (bayesian opt)", boboost_train_score, boboost_test_score]], 
                         columns = ["Model", "Training score", "Validation score"])

In [67]:
scores_df

Unnamed: 0,Model,Training score,Validation score
0,Random Forest (random search tuned),0.65,0.513889
1,xgboost (random search tuned),0.975,0.472222
2,xgboost (bayesian opt),0.99186,0.388889


In [72]:
matches_rolling.to_csv('rolling.csv', index=False)

In [68]:
# Specify the file path where you want to save the model
save_path = "model.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(tuned_forest, file)      #the random forest was saved even though the xgboost performed a little better than it because xgboost requires installing more dependencies in the production environment
    

print("Model saved as pickle file.")

Model saved as pickle file.


In [69]:
# Specify the file path where you want to save the model
save_path = "scaler.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(scaler, file)

print("Scaler saved as pickle file.")

Scaler saved as pickle file.


In [70]:
# Specify the file path where you want to save the model
save_path = "encoder.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(encoder, file)

print("Encoder saved as pickle file.")

Encoder saved as pickle file.
