In [50]:
import pandas as pd
matches = pd.read_csv("matches2.csv", index_col = 0)
matches.shape

(2792, 21)

In [51]:
matches['team'].value_counts()

Aston Villa                 141
Everton                     141
Newcastle United            140
Tottenham                   140
Manchester United           140
Wolves                      140
Southampton                 140
Chelsea                     140
Leicester City              140
Arsenal                     139
West Ham                    139
Crystal Palace              139
Brighton                    139
Manchester City             138
Liverpool                   138
Burnley                     114
Leeds United                101
Watford                      76
Sheffield United             76
Norwich City                 76
Brentford                    65
Bournemouth                  63
Fulham                       63
West Brom                    38
Nottingham Forest Forest     15
Nottingham Forest            11
Name: team, dtype: int64

In [52]:
matches["round"].value_counts()

Matchweek 1     80
Matchweek 10    80
Matchweek 2     80
Matchweek 15    80
Matchweek 14    80
Matchweek 13    80
Matchweek 11    80
Matchweek 16    80
Matchweek 9     80
Matchweek 6     80
Matchweek 5     80
Matchweek 4     80
Matchweek 3     80
Matchweek 12    78
Matchweek 8     74
Matchweek 34    70
Matchweek 29    70
Matchweek 31    70
Matchweek 32    70
Matchweek 30    70
Matchweek 36    70
Matchweek 35    70
Matchweek 33    70
Matchweek 37    70
Matchweek 27    70
Matchweek 28    70
Matchweek 20    70
Matchweek 26    70
Matchweek 25    70
Matchweek 24    70
Matchweek 23    70
Matchweek 22    70
Matchweek 21    70
Matchweek 19    70
Matchweek 18    70
Matchweek 17    70
Matchweek 38    70
Matchweek 7     60
Name: round, dtype: int64

In [53]:
matches["date"] = pd.to_datetime(matches["date"])

In [54]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["target"] = (matches["result"] == "W").astype("int")

In [55]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=2)
train = matches[matches["date"] < "2022-01-01"]
test = matches[matches["date"] > "2022-01-01"]
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [56]:
rf.fit(train[predictors],train["target"])
preds = rf.predict(test[predictors])

In [57]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["target"], preds)
acc

0.6288888888888889

In [58]:
combined = pd.DataFrame(dict(actual=test["target"], prediction = preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,481,152
1,182,85


In [59]:
from sklearn.metrics import precision_score
precision_score(test["target"],preds)


0.35864978902953587

In [60]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Liverpool")

In [61]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group
cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
rolling_averages(group,cols,new_cols)



Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2019-08-31,17,Premier League,Matchweek 4,Sat,Away,W,3.0,0.0,Burnley,...,5,1,3.000000,1.000000,18.000000,5.666667,18.166667,0.666667,0.333333,0.333333
6,2019-09-14,12,Premier League,Matchweek 5,Sat,Home,W,3.0,1.0,Newcastle United,...,5,1,2.666667,0.666667,18.000000,5.666667,19.466667,0.333333,0.333333,0.333333
8,2019-09-22,16,Premier League,Matchweek 6,Sun,Away,W,2.0,1.0,Chelsea,...,6,1,3.000000,0.666667,20.000000,6.333333,17.800000,0.000000,0.333333,0.333333
10,2019-09-28,12,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield United,...,5,1,2.666667,0.666667,14.000000,6.000000,16.166667,0.000000,0.000000,0.000000
12,2019-10-05,15,Premier League,Matchweek 8,Sat,Home,W,2.0,1.0,Leicester City,...,5,1,2.000000,0.666667,14.333333,5.000000,15.566667,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2022-10-22,12,Premier League,Matchweek 13,Sat,Away,L,0.0,1.0,Nottingham Forest,...,5,0,1.333333,1.000000,14.333333,4.000000,17.666667,0.000000,0.000000,0.000000
17,2022-10-29,19,Premier League,Matchweek 14,Sat,Home,L,1.0,2.0,Leeds United,...,5,0,0.666667,0.333333,16.666667,5.333333,15.133333,0.000000,0.000000,0.000000
19,2022-11-06,16,Premier League,Matchweek 15,Sun,Away,W,2.0,1.0,Tottenham,...,6,1,0.666667,1.000000,19.666667,8.000000,14.766667,0.000000,0.000000,0.000000
21,2022-11-12,15,Premier League,Matchweek 16,Sat,Home,W,3.0,1.0,Southampton,...,5,1,1.000000,1.333333,16.666667,7.666667,16.000000,0.000000,0.000000,0.000000


In [62]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x,cols,new_cols))
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])

In [68]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [69]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
93,0,1,2022-01-23,Arsenal,Burnley,D
94,1,0,2022-02-10,Arsenal,Wolves,W
95,1,0,2022-02-19,Arsenal,Brentford,W
96,1,1,2022-02-24,Arsenal,Wolves,W
97,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
2508,0,0,2022-10-23,Wolves,Leicester City,L
2509,0,0,2022-10-29,Wolves,Brentford,D
2510,0,0,2022-11-05,Wolves,Brighton,L
2511,0,0,2022-11-12,Wolves,Arsenal,L


In [65]:

#from pathlib import Path  
#filepath = Path('C:\Coding\IA\Week1\predictions.csv')  
#filepath.parent.mkdir(parents=True, exist_ok=True)  
#combined.to_csv(filepath)  
data = pd.read_csv("predictions2.csv")
data

Unnamed: 0.1,Unnamed: 0,actual,predicted,date,team,opponent,result
0,93,0,1,2022-01-23,Arsenal,Burnley,D
1,94,1,0,2022-02-10,Arsenal,Wolves,W
2,95,1,0,2022-02-19,Arsenal,Brentford,W
3,96,1,1,2022-02-24,Arsenal,Wolves,W
4,97,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...,...
691,2508,0,0,2022-10-23,Wolves,Leicester City,L
692,2509,0,0,2022-10-29,Wolves,Brentford,D
693,2510,0,0,2022-11-05,Wolves,Brighton,L
694,2511,0,0,2022-11-12,Wolves,Arsenal,L
