In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv(
    filepath_or_buffer="./results.csv",
    header=0,
    delimiter=",",
    parse_dates=["date"],
)

df = df.dropna(
    axis=0,
    how="any"
)

df = df.rename({"date": "datetime"}, axis=1)

df["home_score"] = df["home_score"].astype(int)
df["away_score"] = df["away_score"].astype(int)
df["neutral"] = df["neutral"].astype(int)

In [6]:
df

Unnamed: 0,datetime,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,0
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,0
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,0
...,...,...,...,...,...,...,...,...,...
44054,2022-09-27,Albania,Iceland,1,1,UEFA Nations League,Tirana,Albania,0
44055,2022-09-27,Norway,Serbia,0,2,UEFA Nations League,Oslo,Norway,0
44056,2022-09-27,Sweden,Slovenia,1,1,UEFA Nations League,Stockholm,Sweden,0
44057,2022-09-27,Kosovo,Cyprus,5,1,UEFA Nations League,Pristina,Kosovo,0


In [7]:
td_h = pd.to_timedelta(np.random.randint(0, 23, size=len(df)), unit="h")
td_m = pd.to_timedelta(np.random.randint(0, 59, size=len(df)), unit="m")
td_s = pd.to_timedelta(np.random.randint(0, 59, size=len(df)), unit="s")
td = td_h+td_m+td_s 

In [8]:
df["datetime"] = df["datetime"] + td

conditions = [
    (df["home_score"] == df["away_score"]),
    (df["home_score"] > df["away_score"]),
    (df["home_score"] < df["away_score"]),
]

values = [0, 1, 2]

df["result"] = np.select(conditions, values)

In [9]:
grouped_home_wins = df.sort_values("datetime") \
        .set_index("datetime").groupby("home_team") \
        .apply(lambda df: df.home_score > df.away_score) \
        .rolling(5, closed="left", min_periods=1) \
        .agg({"num_win_5_home_game": np.sum})

grouped_away_wins = df.sort_values("datetime") \
        .set_index("datetime").groupby("away_team") \
        .apply(lambda df: df.away_score > df.home_score) \
        .rolling(5, closed="left", min_periods=1) \
        .agg({"num_win_5_away_game": np.sum})

grouped_home_scores = df.sort_values("datetime") \
    .set_index("datetime").groupby("home_team") \
    .rolling(5, closed="left", min_periods=1)["home_score"].mean() \
    .rename("avg_5_home_score")

grouped_away_scores = df.sort_values("datetime") \
    .set_index("datetime").groupby("away_team") \
    .rolling(5, closed="left", min_periods=1)["away_score"].mean() \
    .rename("avg_5_away_score")

grouped_home_tournament = df.sort_values("datetime") \
    .set_index("datetime").groupby("home_team") \
    .apply(lambda df: df.tournament != "Friendly") \
    .rolling(5, closed="left", min_periods=1) \
    .agg({"num_home_non_friendly_tournament": np.sum})

grouped_away_tournament = df.sort_values("datetime") \
    .set_index("datetime").groupby("away_team") \
    .apply(lambda df: df.tournament != "Friendly") \
    .rolling(3, closed="left", min_periods=1) \
    .agg({"num_away_non_friendly_tournament": np.sum})

merged = df \
    .join(grouped_home_wins, on=["home_team", "datetime"], validate="1:1") \
    .join(grouped_away_wins, on=["away_team", "datetime"], validate="1:1") \
    .join(grouped_home_scores, on=["home_team", "datetime"], validate="1:1") \
    .join(grouped_away_scores, on=["away_team", "datetime"], validate="1:1") \
    .join(grouped_home_tournament, on=["home_team", "datetime"], validate="1:1") \
    .join(grouped_away_tournament, on=["away_team", "datetime"], validate="1:1")

merged = merged.fillna(0).astype(
        {
            "num_win_5_home_game": int,
            "num_win_5_away_game": int,
            "num_home_non_friendly_tournament": int,
            "num_away_non_friendly_tournament": int    
        }
    )

In [10]:
DROP_COLS = [
    "datetime",
    "home_team",
    "away_team",
    "home_score",
    "away_score",
    "tournament",
    "city",
    "country"	
]

merged = merged.drop(DROP_COLS, axis=1)

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score

def custom_scorer(y_true, y_pred, actual_scorer):
    score = np.nan
    try:
      score = actual_scorer(y_true, y_pred)
    except Exception: 
      pass
    return score

mcc = make_scorer(custom_scorer, actual_scorer=matthews_corrcoef)
acc = make_scorer(custom_scorer, actual_scorer=accuracy_score)

models = {
    "rf": {
        "instance": RandomForestClassifier(n_jobs=-1),
        "distributions": 
            dict(
                max_features=[2, 3],
                min_samples_leaf=[3, 4, 5],
                min_samples_split=[8, 10, 12],
                max_depth=np.arange(1, 31),
                n_estimators=np.arange(100, 1500, 100),
                criterion=["gini", "entropy"],
                bootstrap=[True, False]
            )
        }
}

model = "rf"
params_dist = models[model]["distributions"]
est = models[model]["instance"]

X = merged.drop(["result"], axis=1)
y = merged["result"].values
score_select_best = "acc"

clf = RandomizedSearchCV(
    estimator=est,
    param_distributions=params_dist,
    n_iter=10,
    scoring={"acc": acc, "mcc": mcc},
    cv=5,
    refit=score_select_best
)

clf_tuned = clf.fit(X, y)
print(f"Best {score_select_best!r}: {clf_tuned.best_score_}")
print("Best set of parameters:")
best_parameters = clf_tuned.best_estimator_.get_params()
for param_name in sorted(params_dist.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

Best 'mcc': 0.15070035377075822
Best set of parameters:
	bootstrap: True
	criterion: gini
	max_depth: 12
	max_features: 2
	min_samples_leaf: 3
	min_samples_split: 10
	n_estimators: 900
