In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import pandas as pd

In [2]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [3]:
def make_prediction(data, predictors):
    train = data[data['date'] < "2023-05-12"]
    test = data[data['date'] > "2023-05-12"]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = accuracy_score(test["target"], preds)
    precision = precision_score(test["target"], preds, average='weighted')
    return combined, precision, error

In [4]:
# New Prediction with rolling matches
rolling_matches = pd.read_csv("../data/processed/rolling_matches.csv")
predictors = ["venue_code","opp_code","hour","day_code","gf_rolling_mean","ga_rolling_mean","sh_rolling_mean","sot_rolling_mean"]
combined, precision, error = make_prediction(rolling_matches, predictors)

In [15]:
print(combined)
print(f"precision: {precision}")
print(f"error: {error}")

      actual  predicted        date                     team         opponent  \
34         1          2  2023-05-14                  Arsenal         Brighton   
35         1          2  2023-05-20                  Arsenal  Nott'ham Forest   
36         2          2  2023-05-28                  Arsenal           Wolves   
37         2          2  2023-08-12                  Arsenal  Nott'ham Forest   
38         2          2  2023-08-21                  Arsenal   Crystal Palace   
...      ...        ...         ...                      ...              ...   
1261       2          1  2024-02-04  Wolverhampton Wanderers          Chelsea   
1262       1          2  2024-02-10  Wolverhampton Wanderers        Brentford   
1263       2          2  2024-02-17  Wolverhampton Wanderers        Tottenham   
1264       2          2  2024-02-25  Wolverhampton Wanderers    Sheffield Utd   
1265       1          1  2024-03-02  Wolverhampton Wanderers    Newcastle Utd   

     result new_team  
34  

In [6]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,49,72
1,12,129,91
2,5,66,164


In [7]:
combined = combined.merge(rolling_matches[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
34,1,2,2023-05-14,Arsenal,Brighton,L
35,1,2,2023-05-20,Arsenal,Nott'ham Forest,L
36,2,2,2023-05-28,Arsenal,Wolves,W
37,2,2,2023-08-12,Arsenal,Nott'ham Forest,W
38,2,2,2023-08-21,Arsenal,Crystal Palace,W
...,...,...,...,...,...,...
1261,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W
1262,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L
1263,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W
1264,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W


In [8]:
# Unifying team names
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [9]:
combined["new_team"] = combined["team"].map(mapping)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
34,1,2,2023-05-14,Arsenal,Brighton,L,Arsenal
35,1,2,2023-05-20,Arsenal,Nott'ham Forest,L,Arsenal
36,2,2,2023-05-28,Arsenal,Wolves,W,Arsenal
37,2,2,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal
38,2,2,2023-08-21,Arsenal,Crystal Palace,W,Arsenal
...,...,...,...,...,...,...,...
1261,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W,Wolves
1262,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L,Wolves
1263,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W,Wolves
1264,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W,Wolves


In [10]:
merged = combined.merge(combined, left_on=["date","new_team"], right_on=["date","opponent"])
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,2,2023-05-14,Arsenal,Brighton,L,Arsenal,2,1,Brighton and Hove Albion,Arsenal,W,Brighton and Hove Albion
1,1,2,2023-05-20,Arsenal,Nott'ham Forest,L,Arsenal,2,1,Nottingham Forest,Arsenal,W,Nottingham Forest
2,2,2,2023-05-28,Arsenal,Wolves,W,Arsenal,1,1,Wolverhampton Wanderers,Arsenal,L,Wolves
3,2,2,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal,1,1,Nottingham Forest,Arsenal,L,Nottingham Forest
4,2,2,2023-08-21,Arsenal,Crystal Palace,W,Arsenal,1,1,Crystal Palace,Arsenal,L,Crystal Palace
...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W,Wolves,1,0,Chelsea,Wolves,L,Chelsea
469,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L,Wolves,2,1,Brentford,Wolves,W,Brentford
470,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W,Wolves,1,2,Tottenham Hotspur,Wolves,L,Tottenham
471,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W,Wolves,1,1,Sheffield United,Wolves,L,Sheffield United


In [11]:
# Count Team A wins and Team B losses to get the accuracy
merged[(merged["predicted_x"] == 2) & (merged["predicted_y"] == 1)]["actual_x"].value_counts()

actual_x
2    95
1    37
0    28
Name: count, dtype: int64

In [16]:
# Export model
from joblib import dump
from datetime import datetime

# Get the current date as a string
current_date = datetime.now().strftime("%Y%m%d")

# Create the filename
filename = f"../models/model_{current_date}.joblib"

# Save the model to a file
dump(rf, filename)

['../models/model_20240317.joblib']

: 

In [13]:
# Load the model from the file
from joblib import load

rf_loaded = load("../models/model_2024-03-11.joblib")