In [59]:
#Importing the data
import pandas as pd
matches = pd.read_csv("~/matches.csv",index_col = 0)
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [60]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [61]:
#Creating predictors fot the algorithm
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+","",regex = True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["Target"] = (matches["result"] == "W").astype("int")
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,Target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,16,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,14,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,0,9,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,1,15,15,5,0


In [76]:
#Creating the model
from sklearn.ensemble import RandomForestClassifier 
rf = RandomForestClassifier(n_estimators = 100,min_samples_split = 20, random_state = 1)
train = matches[matches["date"] < "2022-03-01"]
test = matches[matches["date"] > "2022-03-01"]

predictors = ["venue_code","opp_code","hour","day_code"]
rf.fit(train[predictors],train["Target"])

RandomForestClassifier(min_samples_split=20, random_state=1)

In [77]:
#Using accuracy metric
from sklearn.metrics import accuracy_score
preds = rf.predict(test[predictors])
acc = accuracy_score(test["Target"],preds)
print("accuracy is ",acc)
combined = pd.DataFrame(dict(actual=test["Target"],prediction = preds))
pd.crosstab(index = combined["actual"],columns = combined["prediction"])

accuracy is  0.6341463414634146


prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,240,48
1,132,72


In [78]:
#Using precision_metric
from sklearn.metrics import precision_score
precision_score(test["Target"],preds)


0.6

In [79]:
#Improving presion with rolling averages
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Manchester City")
group.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,Target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,16,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,14,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,0,9,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,1,15,15,5,0


In [86]:
def rolling_averages(group,cols,new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3,closed ='left').mean()
    group[new_cols]=rolling_stats
    group = group.dropna(subset = new_cols)
    return group

cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt"]
new_cols = [f"{c}_rolling" for c in cols]


rolling_averages(group,cols,new_cols)
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [99]:
matches_rolling = matches.groupby("team").apply(lambda x:rolling_averages(x,cols,new_cols))
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.dtypes


date             datetime64[ns]
time                     object
comp                     object
round                    object
day                      object
venue                    object
result                   object
gf                        int64
ga                        int64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
referee                  object
match report             object
notes                   float64
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                      float64
pkatt                   float64
season                    int64
team                     object
venue_code                 int8
opp_code                   int8
hour                      int64
day_code                  int64
Target  

In [101]:
#Making the model more robust and scalable
def make_predictors(data,predictors):
    train = data[data["date"] < "2022-03-01"]
    test = data[data["date"] > "2022-03-01"]
    rf.fit(train[predictors],train["Target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["Target"],prediction = preds))
    precision = precision_score(test["Target"],preds)
    
    return combined, precision


    

In [109]:
predictors+new_cols

combined,precision = make_predictors(matches_rolling,predictors+new_cols)
precision

0.6422764227642277

In [110]:
combined

Unnamed: 0,actual,prediction
45,1,1
46,1,1
47,1,0
48,1,0
49,0,1
...,...,...
1455,0,0
1456,0,0
1457,0,0
1458,0,0


In [111]:
combined = combined.merge(matches_rolling[["date","team","opponent","result"]],left_index =True,right_index = True)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
45,1,1,2022-03-06,Arsenal,Watford,W
46,1,1,2022-03-06,Arsenal,Watford,W
47,1,0,2022-03-13,Arsenal,Leicester City,W
48,1,0,2022-03-13,Arsenal,Leicester City,W
49,0,1,2022-03-16,Arsenal,Liverpool,L
...,...,...,...,...,...,...
1455,0,0,2022-05-11,Wolverhampton Wanderers,Manchester City,L
1456,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D
1457,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D
1458,0,0,2022-05-22,Wolverhampton Wanderers,Liverpool,L


In [118]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
1,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
2,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
3,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
4,1,0,2022-03-13,Arsenal,Leicester City,W,Arsenal,0,1,Leicester City,Arsenal,L,Leicester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D,Wolves,0,0,Norwich City,Wolves,D,Norwich City
980,0,0,2022-05-22,Wolverhampton Wanderers,Liverpool,L,Wolves,1,0,Liverpool,Wolves,W,Liverpool
981,0,0,2022-05-22,Wolverhampton Wanderers,Liverpool,L,Wolves,1,0,Liverpool,Wolves,W,Liverpool
982,0,0,2022-05-22,Wolverhampton Wanderers,Liverpool,L,Wolves,1,0,Liverpool,Wolves,W,Liverpool


In [120]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] ==0)]["actual_x"].value_counts()

1    140
0     58
Name: actual_x, dtype: int64

In [122]:
140/198

0.7070707070707071