In [1]:
import numpy as np

import pandas as pd

pd.set_option('display.max_columns', 50)

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score, classification_report

import xgboost
from xgboost import XGBClassifier

import hyperopt
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

import warnings

warnings.filterwarnings("ignore")

import pickle

In [2]:
match_df = pd.read_csv("../datasets/matches.csv")

match_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,1.3,1.4,35.0,40096.0,Virgil van Dijk,4-3-3,Anthony Taylor,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,3.0,1.3,64.0,53145.0,Virgil van Dijk,4-3-3,Thomas Bramall,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,0.9,2.0,41.0,52214.0,Virgil van Dijk,4-3-3,John Brooks,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,2.5,0.7,63.0,50109.0,Trent Alexander-Arnold,4-3-3,Simon Hooper,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,2.5,0.6,65.0,31257.0,Andrew Robertson,4-3-3,Michael Oliver,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool


In [3]:
def clean_data(df):
    df["Date"] = pd.to_datetime(df["Date"]) #converting the date column to a datetime 
    new_df = df.sort_values(by="Date").reset_index(drop=True) 
    new_df["Opponent"] = new_df["Opponent"].replace({"Newcastle Utd": "Newcastle United", "Brighton":"Brighton and Hove Albion", 
                                         "Manchester Utd":"Manchester United", "West Ham": "West Ham United", 
                                         "Tottenham": "Tottenham Hotspur", "Wolves": "Wolverhampton Wanderers", 
                                         "Nott'ham Forest":"Nottingham Forest", "Sheffield Utd":"Sheffield United"})
    new_df["hour"] = new_df["Time"].str.replace(":.+", "", regex=True).astype("int")
    new_df["day_code"] = new_df["Date"].dt.dayofweek
    return new_df

In [4]:
cleaned_df = clean_data(match_df)

In [5]:
cleaned_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,3.0,Everton,0.8,2.4,52.0,38487.0,James Ward-Prowse,4-4-2,Andy Madley,Match Report,,6.0,3.0,15.5,0.0,0,0,2022,Southampton,15,5
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5.0,1.0,Leeds United,1.5,0.5,49.0,72732.0,Harry Maguire,4-2-3-1,Paul Tierney,Match Report,,16.0,8.0,18.2,0.0,0,0,2022,Manchester United,12,5
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,L,1.0,2.0,Brighton and Hove Albion,1.5,1.0,37.0,16910.0,Ben Mee,4-4-2,David Coote,Match Report,,14.0,3.0,16.2,1.0,0,0,2022,Burnley,15,5


In [6]:
cleaned_df["Team"].value_counts()

Arsenal                     100
Crystal Palace              100
Newcastle United            100
West Ham United             100
Tottenham Hotspur           100
Chelsea                     100
Liverpool                   100
Brighton and Hove Albion    100
Everton                     100
Aston Villa                 100
Wolverhampton Wanderers     100
Manchester United           100
Brentford                    99
Manchester City              99
Leicester City               76
Leeds United                 76
Southampton                  76
Burnley                      62
Nottingham Forest            62
Fulham                       62
Bournemouth                  61
Norwich City                 38
Watford                      38
Sheffield United             24
Luton Town                   23
Name: Team, dtype: int64

In [7]:
cleaned_df["Opponent"].value_counts()

Crystal Palace              100
Everton                     100
Tottenham Hotspur           100
Brighton and Hove Albion    100
Manchester United           100
West Ham United             100
Newcastle United            100
Liverpool                   100
Chelsea                     100
Wolverhampton Wanderers     100
Arsenal                     100
Aston Villa                 100
Brentford                    99
Manchester City              99
Southampton                  76
Leicester City               76
Leeds United                 76
Burnley                      62
Fulham                       62
Nottingham Forest            62
Bournemouth                  61
Norwich City                 38
Watford                      38
Sheffield United             24
Luton Town                   23
Name: Opponent, dtype: int64

In [8]:
cleaned_df['Venue'].value_counts()

Away    998
Home    998
Name: Venue, dtype: int64

In [9]:
encoder = LabelEncoder()
def add_predictors(df, cols):
    for col in cols:
        df[f"{col}_code"] = encoder.fit_transform(df[col])
    return df
    
    

In [10]:
cat_cols = ["Venue", "Team", "Opponent"]

In [11]:
encoded_df = add_predictors(cleaned_df, cat_cols)

In [12]:
encoded_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,3.0,Everton,0.8,2.4,52.0,38487.0,James Ward-Prowse,4-4-2,Andy Madley,Match Report,,6.0,3.0,15.5,0.0,0,0,2022,Southampton,15,5,0,20,8
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5.0,1.0,Leeds United,1.5,0.5,49.0,72732.0,Harry Maguire,4-2-3-1,Paul Tierney,Match Report,,16.0,8.0,18.2,0.0,0,0,2022,Manchester United,12,5,1,15,10
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,L,1.0,2.0,Brighton and Hove Albion,1.5,1.0,37.0,16910.0,Ben Mee,4-4-2,David Coote,Match Report,,14.0,3.0,16.2,1.0,0,0,2022,Burnley,15,5,1,5,4


In [13]:
groups_df = encoded_df.groupby("Team")

In [14]:
mancity_df = groups_df.get_group("Manchester City")

In [15]:
mancity_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code
19,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham Hotspur,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,,18.0,4.0,17.3,1.0,0,0,2022,Manchester City,16,6,0,14,21
24,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,,16.0,4.0,18.5,1.0,0,0,2022,Manchester City,15,5,1,14,17
43,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,,25.0,10.0,14.8,0.0,0,0,2022,Manchester City,12,5,1,14,0
60,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,,25.0,8.0,14.3,0.0,0,0,2022,Manchester City,15,5,0,14,11
88,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,Match Report,,16.0,1.0,16.4,1.0,0,0,2022,Manchester City,15,5,1,14,20


In [16]:
def rolling_averages(group, cols, new_cols):
    rolling_stats = group[cols].rolling(5, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.fillna(0)
    return group

In [17]:
cols = ["GF", "GA", "Poss", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{col}_rolling" for col in cols]

In [18]:
matches_rolling = encoded_df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [19]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,3.0,Everton,0.8,2.4,52.0,38487.0,James Ward-Prowse,4-4-2,Andy Madley,Match Report,0.0,6.0,3.0,15.5,0.0,0,0,2022,Southampton,15,5,0,20,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5.0,1.0,Leeds United,1.5,0.5,49.0,72732.0,Harry Maguire,4-2-3-1,Paul Tierney,Match Report,0.0,16.0,8.0,18.2,0.0,0,0,2022,Manchester United,12,5,1,15,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,L,1.0,2.0,Brighton and Hove Albion,1.5,1.0,37.0,16910.0,Ben Mee,4-4-2,David Coote,Match Report,0.0,14.0,3.0,16.2,1.0,0,0,2022,Burnley,15,5,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
matches_rolling["Target"] = matches_rolling["Result"].astype("category").cat.codes

In [21]:
matches_rolling.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,hour,day_code,Venue_code,Team_code,Opponent_code,GF_rolling,GA_rolling,Poss_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,Target
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,1.3,1.2,64.0,16479.0,Granit Xhaka,4-2-3-1,Michael Oliver,Match Report,0.0,22.0,4.0,19.0,1.0,0,0,2022,Arsenal,20,4,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,1.2,1.3,36.0,16479.0,Pontus Jansson,3-5-2,Michael Oliver,Match Report,0.0,8.0,3.0,12.1,0.0,0,0,2022,Brentford,20,4,1,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,3.0,Everton,0.8,2.4,52.0,38487.0,James Ward-Prowse,4-4-2,Andy Madley,Match Report,0.0,6.0,3.0,15.5,0.0,0,0,2022,Southampton,15,5,0,20,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5.0,1.0,Leeds United,1.5,0.5,49.0,72732.0,Harry Maguire,4-2-3-1,Paul Tierney,Match Report,0.0,16.0,8.0,18.2,0.0,0,0,2022,Manchester United,12,5,1,15,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,L,1.0,2.0,Brighton and Hove Albion,1.5,1.0,37.0,16910.0,Ben Mee,4-4-2,David Coote,Match Report,0.0,14.0,3.0,16.2,1.0,0,0,2022,Burnley,15,5,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [22]:
matches_rolling["Target"].value_counts()

1    775
2    775
0    446
Name: Target, dtype: int64

In [23]:
predictors = ["Venue_code", "Team_code", "Opponent_code"] + new_cols
predictors

['Venue_code',
 'Team_code',
 'Opponent_code',
 'GF_rolling',
 'GA_rolling',
 'Poss_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [24]:
X_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"][predictors]
X_test =  matches_rolling[matches_rolling["Date"] >= "2023-10-30"][predictors]
y_train = matches_rolling[matches_rolling["Date"] < "2023-10-30"]["Target"]
y_test = matches_rolling[matches_rolling["Date"] >= "2023-10-30"]["Target"]

In [25]:
X_train.shape

(1720, 12)

In [26]:
scaler = StandardScaler()
scaledX_train = scaler.fit_transform(X_train)
scaledX_test = scaler.transform(X_test)

In [27]:
def make_predictions(model):
    model.fit(scaledX_train, y_train)
    train_pred = model.predict(scaledX_train)
    test_pred = model.predict(scaledX_test)
    train_score = accuracy_score(y_train, train_pred)
    test_score = accuracy_score(y_test, test_pred)
    report = classification_report(y_test, test_pred)
    return train_score, test_score, report

In [28]:
forest = RandomForestClassifier(random_state=1)

In [29]:
forest_train_score, forest_test_score, forest_report = make_predictions(forest)

In [30]:
print(forest_report)

              precision    recall  f1-score   support

           0       0.14      0.07      0.10        54
           1       0.50      0.60      0.55       111
           2       0.52      0.53      0.53       111

    accuracy                           0.47       276
   macro avg       0.39      0.40      0.39       276
weighted avg       0.44      0.47      0.45       276



In [31]:
param_dist ={'n_estimators': range(80, 500, 20), 
             'max_depth': range(0, 12, 1), 
             'min_samples_split': range(0, 30, 5), 
             'max_features': ['auto', 'sqrt', 'log2', None], 
             "bootstrap":[True, False]}

kfold_forest = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [32]:
random_search = RandomizedSearchCV(
    forest,
    param_distributions=param_dist,
    n_iter=50, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=6
)

random_search.fit(scaledX_train, y_train)

In [33]:
random_search.best_params_

{'n_estimators': 300,
 'min_samples_split': 25,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': False}

In [34]:
tuned_forest = random_search.best_estimator_

In [35]:
tforest_train_score, tforest_test_score, tforest_report = make_predictions(tuned_forest)

In [36]:
print(tforest_report)

              precision    recall  f1-score   support

           0       0.11      0.02      0.03        54
           1       0.53      0.64      0.58       111
           2       0.53      0.63      0.58       111

    accuracy                           0.51       276
   macro avg       0.39      0.43      0.40       276
weighted avg       0.45      0.51      0.47       276



In [37]:
forest_space = {
    "n_estimators": hp.choice("n_eatimators", range(80, 500, 20)),
    "max_depth": hp.choice("max_depth", range(0, 12, 1)),
    "min_samples_split": hp.choice("min_samples_split", range(1, 30, 5)),
    "max_features": hp.choice("max_features", ['auto', 'sqrt', 'log2', None]),
    "bootstrap": hp.choice("bootstrap", [True, False])
}

In [38]:
def objective_forest(params_f):
    score_forest = cross_val_score(forest, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss_forest = -score_forest
    
    return {"loss":loss_forest, "params":params_f, "status":STATUS_OK}

In [39]:
best_forest = fmin(fn=objective_forest, space=forest_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|██████████████████████████████████████████████| 50/50 [00:57<00:00,  1.15s/trial, best loss: -0.48198652891540134]


In [40]:
bo_forest = RandomForestClassifier(random_state = 30,
                        n_estimators=space_eval(forest_space, best_forest)['n_estimators'],
                        min_samples_split=space_eval(forest_space, best_forest)['min_samples_split'],
                        max_features=space_eval(forest_space, best_forest)['max_features'],
                        max_depth=space_eval(forest_space, best_forest)['max_depth'],
                        bootstrap=space_eval(forest_space, best_forest)['bootstrap'])

In [41]:
boforest_train_score, boforest_test_score, boforest_report = make_predictions(bo_forest)

In [42]:
print(boforest_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.52      0.71      0.60       111
           2       0.52      0.59      0.55       111

    accuracy                           0.52       276
   macro avg       0.35      0.43      0.39       276
weighted avg       0.42      0.52      0.47       276



In [43]:
boost = XGBClassifier(random_state = 10)

In [44]:
boost_train_score, boost_test_score, boost_report = make_predictions(boost)

In [45]:
print(boost_report)

              precision    recall  f1-score   support

           0       0.19      0.17      0.18        54
           1       0.51      0.58      0.54       111
           2       0.51      0.48      0.49       111

    accuracy                           0.46       276
   macro avg       0.40      0.41      0.40       276
weighted avg       0.45      0.46      0.45       276



In [46]:
params = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],
    "max_depth": range(2, 21, 3),
    "gamma": [i/10.0 for i in range(0,5)],
    "colsample_bytree": [i/10.0 for i in range(3,10)],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100],
    "reg_lamda":[1e-5, 1e-2, 0.1, 1, 10, 100]
}

In [47]:
random_search_boost = RandomizedSearchCV(
    boost,
    param_distributions=params,
    n_iter=48, 
    cv=kfold_forest,
    scoring="accuracy",
    n_jobs=-1, 
    random_state=7
)

random_search_boost.fit(scaledX_train, y_train)

Parameters: { "reg_lamda" } are not used.



In [48]:
random_search_boost.best_params_

{'reg_lamda': 1e-05,
 'reg_alpha': 10,
 'max_depth': 20,
 'learning_rate': 1,
 'gamma': 0.0,
 'colsample_bytree': 0.3}

In [49]:
tuned_boost = random_search_boost.best_estimator_

In [50]:
tboost_train_score, tboost_test_score, tboost_report = make_predictions(tuned_boost)

Parameters: { "reg_lamda" } are not used.



In [51]:
print(tboost_report)

              precision    recall  f1-score   support

           0       0.23      0.06      0.09        54
           1       0.54      0.65      0.59       111
           2       0.57      0.67      0.61       111

    accuracy                           0.54       276
   macro avg       0.45      0.46      0.43       276
weighted avg       0.49      0.54      0.50       276



In [52]:
boost_space = {
    "learning_rate": hp.choice("learning_rate", [0.0001, 0.001, 0.01, 0.1, 1]),
    "max_depth": hp.choice("max_depth", range(2, 21, 3)),
    "gamma": hp.choice("gamma", [i/10.0 for i in range(0,5)]),
    "colsample_bytree": hp.choice("colsample_bytree", [i/10.0 for i in range(3,10)]),
    "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-2, 0.1, 1, 10, 100]),
    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-2, 0.1, 1, 10, 100])
}

In [53]:
def objective_boost(params):
    score_boost = cross_val_score(boost, X=scaledX_train, y=y_train, cv=kfold_forest, scoring="accuracy", n_jobs=-1).mean()
    loss = -score_boost
    
    return {"loss":loss, "params":params, "status":STATUS_OK}

In [54]:
best_boost = fmin(fn=objective_boost, space=boost_space, algo=tpe.suggest, max_evals=50, trials=Trials())

100%|███████████████████████████████████████████████| 50/50 [00:53<00:00,  1.07s/trial, best loss: -0.4738473263565845]


In [55]:
bo_boost = XGBClassifier(random_state = 20,
                        colsample_bytree=space_eval(boost_space, best_boost)['colsample_bytree'],
                        gamma=space_eval(boost_space, best_boost)['gamma'],
                        learning_rate=space_eval(boost_space, best_boost)['learning_rate'],
                        max_depth=space_eval(boost_space, best_boost)['max_depth'],
                        reg_alpha=space_eval(boost_space, best_boost)['reg_alpha'],
                        reg_lambda=space_eval(boost_space, best_boost)['reg_lambda'])

In [56]:
boboost_train_score, boboost_test_score, boboost_report = make_predictions(bo_boost)

In [57]:
print(boboost_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.47      0.59      0.53       111
           2       0.46      0.57      0.51       111

    accuracy                           0.47       276
   macro avg       0.31      0.39      0.35       276
weighted avg       0.38      0.47      0.42       276



In [58]:
scores_df = pd.DataFrame([["Random Forest (random search tuned)", tforest_train_score, tforest_test_score],
                         ["xgboost (random search tuned)", tboost_train_score, tboost_test_score],
                         ["xgboost (bayesian opt)", boboost_train_score, boboost_test_score],
                         ["Random Forest (bayesian opt)", boforest_train_score, boforest_test_score]], 
                         columns = ["Model", "Training score", "Validation score"])

In [59]:
scores_df

Unnamed: 0,Model,Training score,Validation score
0,Random Forest (random search tuned),0.688953,0.514493
1,xgboost (random search tuned),0.60814,0.539855
2,xgboost (bayesian opt),0.490116,0.467391
3,Random Forest (bayesian opt),0.57093,0.521739


In [60]:
matches_rolling.to_csv("../datasets/rolling.csv", index=False)

In [61]:
# Specify the file path where you want to save the model
save_path = "model.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(tuned_forest, file)      #the random forest was saved even though the xgboost performed a little better than it because xgboost requires installing more dependencies in the production environment
    

print("Model saved as pickle file.")

Model saved as pickle file.


In [62]:
# Specify the file path where you want to save the model
save_path = "scaler.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(scaler, file)

print("Scaler saved as pickle file.")

Scaler saved as pickle file.


In [63]:
# Specify the file path where you want to save the model
save_path = "encoder.pkl"

# Save the model as a pickle file
with open(save_path, 'wb') as file:
    pickle.dump(encoder, file)

print("Encoder saved as pickle file.")

Encoder saved as pickle file.
