In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("matches.csv",index_col=0)

In [7]:
df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2020-09-21,20:15 (21:15),Premier League,Matchweek 2,Mon,Away,W,3,1,Wolves,...,Match Report,,13,8,21.1,2,1,1,2024,Manchester City
2,2020-09-27,16:30 (17:30),Premier League,Matchweek 3,Sun,Home,L,2,5,Leicester City,...,Match Report,,16,5,19.8,1,0,0,2024,Manchester City
4,2020-10-03,17:30 (18:30),Premier League,Matchweek 4,Sat,Away,D,1,1,Leeds United,...,Match Report,,23,1,18.2,1,0,0,2024,Manchester City
5,2020-10-17,17:30 (18:30),Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,Match Report,,13,5,17.7,0,0,0,2024,Manchester City
7,2020-10-24,12:30 (13:30),Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,Match Report,,14,7,20.9,1,0,0,2024,Manchester City


In [8]:
df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [9]:
df.isnull().sum()

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance      1633
captain            0
formation          0
referee            0
match report       0
notes           4788
sh                 0
sot                0
dist               2
fk                 0
pk                 0
pkatt              0
season             0
team               0
dtype: int64

In [53]:
df.drop(columns = ["attendance","referee","notes","match report","season"])

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,team,H/A,opp_code,hours,target
0,2020-09-21,20:15 (21:15),Premier League,Matchweek 2,0,Away,W,3,1,Wolves,...,8,21.1,2,1,1,Manchester City,0,25,20,1
2,2020-09-27,16:30 (17:30),Premier League,Matchweek 3,6,Home,L,2,5,Leicester City,...,5,19.8,1,0,0,Manchester City,1,11,16,0
4,2020-10-03,17:30 (18:30),Premier League,Matchweek 4,5,Away,D,1,1,Leeds United,...,1,18.2,1,0,0,Manchester City,0,10,17,0
5,2020-10-17,17:30 (18:30),Premier League,Matchweek 5,5,Home,W,1,0,Arsenal,...,5,17.7,0,0,0,Manchester City,1,0,17,1
7,2020-10-24,12:30 (13:30),Premier League,Matchweek 6,5,Away,D,1,1,West Ham,...,7,20.9,1,0,0,Manchester City,0,24,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2020-07-07,18:00 (19:00),Premier League,Matchweek 34,1,Away,L,1,2,Watford,...,3,16.9,1,0,0,Norwich City,0,22,18,0
88,2020-07-11,12:30 (13:30),Premier League,Matchweek 35,5,Home,L,0,4,West Ham,...,2,19.8,1,0,0,Norwich City,1,24,12,0
89,2020-07-14,20:15 (21:15),Premier League,Matchweek 36,1,Away,L,0,1,Chelsea,...,0,20.6,0,0,0,Norwich City,0,6,20,0
90,2020-07-18,17:30 (18:30),Premier League,Matchweek 37,5,Home,L,0,2,Burnley,...,2,13.4,0,0,0,Norwich City,1,5,17,0


In [54]:
df["date"] = pd.to_datetime(df["date"])

In [55]:
df["H/A"] = df["venue"].astype("category").cat.codes

In [56]:
df["opp_code"] = df["opponent"].astype("category").cat.codes

In [57]:
df["hours"] = df["time"].str.replace(":.+","",regex=True).astype("int")

In [58]:
df["day"] = df["date"].dt.dayofweek

In [59]:
df["target"] = (df["result"]=="W").astype("int")

In [60]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(min_samples_split = 10,n_estimators = 300,random_state = 1)

In [61]:
train = df[df["date"] < '2024-01-01']

In [62]:
test = df[df["date"] > '2024-01-01']

In [63]:
predictors = ["H/A","opp_code","hours","day"]

In [64]:

rf.fit(train[predictors],train["target"])

In [65]:
preds = rf.predict(test[predictors])

In [66]:
from sklearn.metrics import accuracy_score

In [67]:
acc = accuracy_score(test["target"],preds)

In [68]:
acc

0.6420765027322405

In [69]:
combined = pd.DataFrame(dict(actual = test["target"],prediction = preds)) 

In [70]:
pd.crosstab(index=combined["actual"],columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,182,47
1,84,53


In [71]:
from sklearn.metrics import precision_score

In [72]:
precision_score(test["target"],preds)

0.53

In [73]:
grouped_matches = df.groupby("team")
group = grouped_matches.get_group("Liverpool")

In [74]:
group.reset_index()

Unnamed: 0,index,date,time,comp,round,day,venue,result,gf,ga,...,dist,fk,pk,pkatt,season,team,H/A,opp_code,hours,target
0,1,2020-09-12,17:30 (18:30),Premier League,Matchweek 1,5,Home,W,4,3,...,18.4,0,2,2,2024,Liverpool,1,10,17,1
1,2,2020-09-20,16:30 (17:30),Premier League,Matchweek 2,6,Away,W,2,0,...,18.2,1,0,0,2024,Liverpool,0,6,16,1
2,4,2020-09-28,20:00 (21:00),Premier League,Matchweek 3,0,Home,W,3,1,...,17.9,0,0,0,2024,Liverpool,1,0,20,1
3,6,2020-10-04,19:15 (20:15),Premier League,Matchweek 4,6,Away,L,2,7,...,16.3,1,0,0,2024,Liverpool,0,1,19,0
4,7,2020-10-17,12:30 (13:30),Premier League,Matchweek 5,5,Away,D,2,2,...,16.0,2,0,0,2024,Liverpool,0,8,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,131,2020-07-08,20:15 (21:15),Premier League,Matchweek 34,2,Away,W,3,1,...,14.2,0,0,0,2020,Liverpool,0,4,20,1
262,132,2020-07-11,15:00 (16:00),Premier League,Matchweek 35,5,Home,D,1,1,...,18.3,1,0,0,2020,Liverpool,1,5,15,0
263,133,2020-07-15,20:15 (21:15),Premier League,Matchweek 36,2,Away,L,1,2,...,16.7,0,0,0,2020,Liverpool,0,0,20,0
264,134,2020-07-22,20:15 (21:15),Premier League,Matchweek 37,2,Home,W,5,3,...,14.4,1,0,0,2020,Liverpool,1,6,20,1


In [75]:
def rolling_averages(group,cols,new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(5,closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [76]:
cols = ["gf","ga","sh","sot","dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
rolling_averages(group,cols,new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,hours,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2019-08-24,17:30 (18:30),Premier League,Matchweek 3,5,Home,W,3,1,Arsenal,...,17,1,3.0,1.0,16.8,6.0,18.04,0.8,0.2,0.2
5,2019-08-31,17:30 (18:30),Premier League,Matchweek 4,5,Away,W,3,0,Burnley,...,17,1,2.8,1.0,18.6,5.4,18.38,0.6,0.4,0.4
86,2019-08-31,17:30 (18:30),Premier League,Matchweek 4,5,Away,W,3,0,Burnley,...,17,1,2.6,0.8,18.6,5.4,19.16,0.4,0.4,0.4
6,2019-09-14,12:30 (13:30),Premier League,Matchweek 5,5,Home,W,3,1,Newcastle Utd,...,12,1,2.8,0.6,18.6,5.6,19.64,0.2,0.4,0.4
87,2019-09-14,12:30 (13:30),Premier League,Matchweek 5,5,Home,W,3,1,Newcastle Utd,...,12,1,3.0,0.6,19.8,6.0,18.64,0.0,0.4,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2024-04-24,20:00 (21:00),Premier League,Matchweek 29,2,Away,L,0,2,Everton,...,20,0,2.0,1.2,24.2,7.0,16.70,0.8,0.2,0.2
58,2024-04-27,12:30 (14:30),Premier League,Matchweek 35,5,Away,D,2,2,West Ham,...,12,0,1.6,1.4,22.8,6.8,16.74,0.8,0.2,0.2
59,2024-05-05,16:30 (18:30),Premier League,Matchweek 36,6,Home,W,4,2,Tottenham,...,16,1,1.4,1.6,22.6,6.6,16.44,0.8,0.2,0.2
60,2024-05-13,20:00 (22:00),Premier League,Matchweek 37,0,Away,D,3,3,Aston Villa,...,20,0,1.8,1.6,22.2,7.8,16.00,0.8,0.0,0.0


In [77]:
df_rolling = df.groupby(df["team"]).apply(lambda x: rolling_averages(x, cols, new_cols))
df_rolling

  df_rolling = df.groupby(df["team"]).apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,hours,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,87,2019-09-22,16:30 (17:30),Premier League,Matchweek 6,6,Home,W,3,2,Aston Villa,...,16,1,1.6,1.6,13.0,5.2,18.10,0.6,0.0,0.0
Arsenal,89,2019-09-30,20:00 (21:00),Premier League,Matchweek 7,0,Away,D,1,1,Manchester Utd,...,20,0,2.0,2.0,15.4,5.8,18.70,1.0,0.2,0.2
Arsenal,91,2019-10-06,14:00 (15:00),Premier League,Matchweek 8,6,Home,W,1,0,Bournemouth,...,14,1,1.8,2.0,14.4,5.0,18.28,1.2,0.2,0.2
Arsenal,92,2019-10-21,20:00 (21:00),Premier League,Matchweek 9,0,Away,L,0,1,Sheffield Utd,...,20,0,1.8,1.4,15.0,4.8,17.54,1.0,0.2,0.2
Arsenal,94,2019-10-27,16:30 (18:30),Premier League,Matchweek 10,6,Home,D,2,2,Crystal Palace,...,16,0,1.4,1.2,11.6,3.8,17.68,0.8,0.2,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,41,2024-04-24,19:45 (20:45),Premier League,Matchweek 29,2,Home,L,0,1,Bournemouth,...,19,0,0.8,1.8,10.0,3.4,17.72,0.4,0.2,0.2
Wolverhampton Wanderers,42,2024-04-27,15:00 (17:00),Premier League,Matchweek 35,5,Home,W,2,1,Luton Town,...,15,1,0.8,1.6,10.4,3.6,18.82,0.2,0.2,0.2
Wolverhampton Wanderers,43,2024-05-04,17:30 (19:30),Premier League,Matchweek 36,5,Away,L,1,5,Manchester City,...,17,0,1.0,1.6,11.4,4.2,20.10,0.2,0.2,0.2
Wolverhampton Wanderers,44,2024-05-11,15:00 (17:00),Premier League,Matchweek 37,5,Home,L,1,3,Crystal Palace,...,15,0,1.0,2.2,9.2,3.2,16.82,0.0,0.0,0.0


In [78]:
df_rolling.index = range(df_rolling.shape[0])

In [79]:
df_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,hours,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2019-09-22,16:30 (17:30),Premier League,Matchweek 6,6,Home,W,3,2,Aston Villa,...,16,1,1.6,1.6,13.0,5.2,18.10,0.6,0.0,0.0
1,2019-09-30,20:00 (21:00),Premier League,Matchweek 7,0,Away,D,1,1,Manchester Utd,...,20,0,2.0,2.0,15.4,5.8,18.70,1.0,0.2,0.2
2,2019-10-06,14:00 (15:00),Premier League,Matchweek 8,6,Home,W,1,0,Bournemouth,...,14,1,1.8,2.0,14.4,5.0,18.28,1.2,0.2,0.2
3,2019-10-21,20:00 (21:00),Premier League,Matchweek 9,0,Away,L,0,1,Sheffield Utd,...,20,0,1.8,1.4,15.0,4.8,17.54,1.0,0.2,0.2
4,2019-10-27,16:30 (18:30),Premier League,Matchweek 10,6,Home,D,2,2,Crystal Palace,...,16,0,1.4,1.2,11.6,3.8,17.68,0.8,0.2,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4647,2024-04-24,19:45 (20:45),Premier League,Matchweek 29,2,Home,L,0,1,Bournemouth,...,19,0,0.8,1.8,10.0,3.4,17.72,0.4,0.2,0.2
4648,2024-04-27,15:00 (17:00),Premier League,Matchweek 35,5,Home,W,2,1,Luton Town,...,15,1,0.8,1.6,10.4,3.6,18.82,0.2,0.2,0.2
4649,2024-05-04,17:30 (19:30),Premier League,Matchweek 36,5,Away,L,1,5,Manchester City,...,17,0,1.0,1.6,11.4,4.2,20.10,0.2,0.2,0.2
4650,2024-05-11,15:00 (17:00),Premier League,Matchweek 37,5,Home,L,1,3,Crystal Palace,...,15,0,1.0,2.2,9.2,3.2,16.82,0.0,0.0,0.0


In [80]:
def make_predictions(data,predictors):
    train = data[data["date"] < '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors],train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test["target"],prediction = preds, index=test.index))
    precision = precision_score(test["target"],preds)
    return combined,precision

In [81]:
combined,precision = make_predictions(df_rolling, predictors + new_cols)

In [82]:
precision

0.6574074074074074

In [83]:
combined

Unnamed: 0,actual,prediction,index
205,1,1,205
206,1,0,206
207,1,0,207
208,1,1,208
209,1,1,209
...,...,...,...
4647,0,0,4647
4648,1,0,4648
4649,0,0,4649
4650,0,0,4650


In [84]:
combined = combined.merge(df_rolling[["date","team","opponent","result"]], left_index = True, right_index = True)

In [85]:
combined

Unnamed: 0,actual,prediction,index,date,team,opponent,result
205,1,1,205,2024-01-20,Arsenal,Crystal Palace,W
206,1,0,206,2024-01-30,Arsenal,Nott'ham Forest,W
207,1,0,207,2024-02-04,Arsenal,Liverpool,W
208,1,1,208,2024-02-11,Arsenal,West Ham,W
209,1,1,209,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...,...
4647,0,0,4647,2024-04-24,Wolverhampton Wanderers,Bournemouth,L
4648,1,0,4648,2024-04-27,Wolverhampton Wanderers,Luton Town,W
4649,0,0,4649,2024-05-04,Wolverhampton Wanderers,Manchester City,L
4650,0,0,4650,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L


In [86]:
class MissingDict(dict):
    __missing__ = lambda self, key:key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [87]:
mapping['Brighton and Hove Albion']

'Brighton'

In [88]:
combined["new team"] = combined["team"].map(mapping)

In [89]:
combined.drop(columns = "index")

Unnamed: 0,actual,prediction,date,team,opponent,result,new team
205,1,1,2024-01-20,Arsenal,Crystal Palace,W,Arsenal
206,1,0,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal
207,1,0,2024-02-04,Arsenal,Liverpool,W,Arsenal
208,1,1,2024-02-11,Arsenal,West Ham,W,Arsenal
209,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal
...,...,...,...,...,...,...,...
4647,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves
4648,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves
4649,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves
4650,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves


In [90]:
merged = combined.merge(combined,left_on = ["date","new team"], right_on = ["date","opponent"])

In [91]:
merged

Unnamed: 0,actual_x,prediction_x,index_x,date,team_x,opponent_x,result_x,new team_x,actual_y,prediction_y,index_y,team_y,opponent_y,result_y,new team_y
0,1,1,205,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,1532,Crystal Palace,Arsenal,L,Crystal Palace
1,1,0,206,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,3454,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,0,207,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,1,2535,Liverpool,Arsenal,L,Liverpool
3,1,1,208,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,4414,West Ham United,Arsenal,L,West Ham
4,1,1,209,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,1058,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,0,0,4647,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,550,Bournemouth,Wolves,W,Bournemouth
308,1,0,4648,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves,0,0,2580,Luton Town,Wolves,L,Luton Town
309,0,0,4649,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,2841,Manchester City,Wolves,W,Manchester City
310,0,0,4650,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,1548,Crystal Palace,Wolves,W,Crystal Palace
