In [1]:
import pandas as pd

# Load the match table (first CSV column becomes the index) and parse the
# "Date" column into datetimes in one chained expression.
df = (
    pd.read_csv("matches.csv", index_col=0)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)
print(df)

         Date   Time            Comp         Round  Day Venue Result  GF  GA  \
1  2023-08-11  20:00  Premier League   Matchweek 1  Fri  Away      W   3   0   
3  2023-08-19  20:00  Premier League   Matchweek 2  Sat  Home      W   1   0   
4  2023-08-27  14:00  Premier League   Matchweek 3  Sun  Away      W   2   1   
5  2023-09-02  15:00  Premier League   Matchweek 4  Sat  Home      W   5   1   
6  2023-09-16  15:00  Premier League   Matchweek 5  Sat  Away      W   3   1   
..        ...    ...             ...           ...  ...   ...    ...  ..  ..   
38 2020-07-07  18:00  Premier League  Matchweek 34  Tue  Away      L   1   2   
39 2020-07-11  12:30  Premier League  Matchweek 35  Sat  Home      L   0   4   
40 2020-07-14  20:15  Premier League  Matchweek 36  Tue  Away      L   0   1   
41 2020-07-18  17:30  Premier League  Matchweek 37  Sat  Home      L   0   2   
42 2020-07-26  16:00  Premier League  Matchweek 38  Sun  Away      L   0   5   

           Opponent  ...  Match Report 

In [2]:
# Derive the numeric model inputs and the binary label in a single step.
df = df.assign(
    # Integer category codes for the venue and opponent strings.
    venue_code=df["Venue"].astype("category").cat.codes,
    opp_code=df["Opponent"].astype("category").cat.codes,
    # Keep only the part of "Time" before the first colon, as an integer.
    hour=df["Time"].str.replace(":.+", "", regex=True).astype("int"),
    # Day of week as an integer (pandas convention: Monday == 0).
    day_code=df["Date"].dt.dayofweek,
    # 1 when the result is "W", otherwise 0.
    target=df["Result"].eq("W").astype("int"),
)

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier; random_state is fixed so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state=1)
# Chronological split: rows dated before 2024-01-01 train the model, rows on
# or after that date are held out. The test side uses `>=` so that matches
# dated exactly 2024-01-01 are not silently dropped from both sets.
train = df[df["Date"] < "2024-01-01"]
test = df[df["Date"] >= "2024-01-01"]
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)
preds = rf.predict(test[predictors])



In [4]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add trailing rolling means of ``cols`` to ``group`` under ``new_cols``.

    The rows are sorted by ``"Date"`` and, for every row, the mean of the
    preceding ``window`` rows is computed. ``closed="left"`` excludes the
    current row, so a row's own values never contribute to its average.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a ``"Date"`` column and every column in ``cols``.
        It is not modified; ``sort_values`` returns a new frame.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the averaged columns, matched to ``cols`` by position.
    window : int, default 3
        Number of preceding rows in each average. The default reproduces
        the previously hard-coded window.

    Returns
    -------
    pandas.DataFrame
        The date-sorted frame with ``new_cols`` added. Rows where any new
        column is NaN (for example the first ``window`` rows, which lack a
        full window of earlier rows) are dropped.
    """
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(window, closed="left").mean()
    # Columns are paired with new_cols by position, not by name.
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

In [5]:
# Source columns whose rolling averages are computed, and the matching
# output column names (same order, with a "_rolling" suffix).
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [stat + "_rolling" for stat in cols]

In [6]:
# Compute the rolling averages separately for each team; `apply` forwards
# the extra positional arguments (cols, new_cols) to rolling_averages.
df_rolling = df.groupby("Team").apply(rolling_averages, cols, new_cols)

  df_rolling = df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [7]:
# Replace the index with consecutive integers 0..n-1 (one label per row).
df_rolling.index = range(len(df_rolling))

In [8]:
from sklearn.metrics import precision_score
def make_predictions(df, predictors, cutoff="2024-01-01"):
    """Fit the module-level ``rf`` on rows before ``cutoff`` and score the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"Date"`` column, a ``"target"`` column and every
        column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str, default "2024-01-01"
        Rows with ``Date < cutoff`` are used for training; rows with
        ``Date >= cutoff`` are used for testing. The comparison is inclusive
        on the test side so that rows dated exactly on the cutoff are not
        discarded from both sets.

    Returns
    -------
    combined : pandas.DataFrame
        Columns ``actual`` and ``predicted``, indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions against the test targets.

    Notes
    -----
    Re-fits the global ``rf`` estimator in place on every call.
    """
    train = df[df["Date"] < cutoff]
    test = df[df["Date"] >= cutoff]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [9]:
# Re-fit using the base predictors plus the rolling-average features.
combined, precision = make_predictions(df_rolling, [*predictors, *new_cols])

# Attach the match context columns to each prediction, matching on row index.
combined = combined.merge(
    df_rolling[["Date", "Team", "Opponent", "Result"]],
    left_index=True,
    right_index=True,
)

In [10]:
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when it is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return key

# Long team names mapped to the shorter spellings they are matched against
# in the "Opponent" column below; names not listed are passed through as-is.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}
mapping = MissingDict(map_values)
combined["new_team"] = combined["Team"].map(mapping)

# Self-join: pair each row (suffix _x) with the row for the same date whose
# "Opponent" equals this row's normalised team name (suffix _y).
merged = combined.merge(
    combined,
    left_on=["Date", "new_team"],
    right_on=["Date", "Opponent"],
)
print(merged)

# Among pairs where the _x side is predicted 1 and the _y side is predicted 0,
# count the actual outcomes of the _x side.
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0),
    "actual_x",
].value_counts()

     actual_x  predicted_x       Date                   Team_x  \
0           1            1 2024-01-20                  Arsenal   
1           1            1 2024-01-30                  Arsenal   
2           1            0 2024-02-04                  Arsenal   
3           1            1 2024-02-11                  Arsenal   
4           1            1 2024-02-17                  Arsenal   
..        ...          ...        ...                      ...   
325         0            0 2024-04-24  Wolverhampton Wanderers   
326         1            0 2024-04-27  Wolverhampton Wanderers   
327         0            0 2024-05-04  Wolverhampton Wanderers   
328         0            0 2024-05-11  Wolverhampton Wanderers   
329         0            0 2024-05-19  Wolverhampton Wanderers   

          Opponent_x Result_x new_team_x  actual_y  predicted_y  \
0     Crystal Palace        W    Arsenal         0            0   
1    Nott'ham Forest        W    Arsenal         0            0   
2     

actual_x
1    58
0    45
Name: count, dtype: int64