In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import pandas as pd

In [6]:
cleaned_matches = pd.read_csv("../data/processed/cleaned_matches.csv")

In [8]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [8]:
print(cleaned_matches.columns)


Index(['date', 'time', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt',
       'season', 'team', 'gf_avg_last_10', 'sh_avg_last_10', 'sh_sot_ratio',
       'sh_sot_ratio_avg_last_10', 'target', 'venue_code', 'opp_code', 'hour',
       'day_code', 'gf_avg_category_codes', 'sh_avg_category_codes',
       'sh_sot_ratio_avg_category_codes'],
      dtype='object')


In [9]:
train = cleaned_matches[cleaned_matches['date'] < "2023-05-12"]
test = cleaned_matches[cleaned_matches['date'] > "2023-05-12"]

In [10]:
predictors = ["venue_code", "opp_code", "hour", "day_code","gf_avg_category_codes","sh_avg_category_codes", "sh_sot_ratio_avg_category_codes" ]
rf.fit(train[predictors], train["target"])

In [11]:
preds = rf.predict(test[predictors])

In [14]:
error = accuracy_score(test["target"], preds)
error

0.535234899328859

In [16]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,51,73
1,5,146,84
2,8,56,171


In [21]:
precision_score(test["target"], preds, average='weighted')

0.4612884096036626

In [5]:
def make_prediction(data, predictors):
    train = data[data['date'] < "2023-05-12"]
    test = data[data['date'] > "2023-05-12"]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds, average='weighted')
    return combined, precision

In [22]:
# New Prediction with rolling matches
rolling_matches = pd.read_csv("../data/processed/rolling_matches.csv")
predictors = ["venue_code","opp_code","hour","day_code","gf_rolling_mean","ga_rolling_mean","sh_rolling_mean","sot_rolling_mean"]
combined, precision = make_prediction(rolling_matches, predictors)

In [23]:
print(combined)
print(precision)

      actual  predicted
34         1          2
35         1          2
36         2          2
37         2          2
38         2          2
...      ...        ...
1261       2          1
1262       1          2
1263       2          2
1264       2          2
1265       1          1

[593 rows x 2 columns]
0.4538809925099612


In [28]:
combined = combined.merge(rolling_matches[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
34,1,2,2023-05-14,Arsenal,Brighton,L
35,1,2,2023-05-20,Arsenal,Nott'ham Forest,L
36,2,2,2023-05-28,Arsenal,Wolves,W
37,2,2,2023-08-12,Arsenal,Nott'ham Forest,W
38,2,2,2023-08-21,Arsenal,Crystal Palace,W
...,...,...,...,...,...,...
1261,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W
1262,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L
1263,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W
1264,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W


In [29]:
# Unifying team names
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mappping = MissingDict(**map_values)

In [31]:
combined["new_team"] = combined["team"].map(mappping)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
34,1,2,2023-05-14,Arsenal,Brighton,L,Arsenal
35,1,2,2023-05-20,Arsenal,Nott'ham Forest,L,Arsenal
36,2,2,2023-05-28,Arsenal,Wolves,W,Arsenal
37,2,2,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal
38,2,2,2023-08-21,Arsenal,Crystal Palace,W,Arsenal
...,...,...,...,...,...,...,...
1261,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W,Wolves
1262,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L,Wolves
1263,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W,Wolves
1264,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W,Wolves


In [32]:
merged = combined.merge(combined, left_on=["date","new_team"], right_on=["date","opponent"])
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,2,2023-05-14,Arsenal,Brighton,L,Arsenal,2,1,Brighton and Hove Albion,Arsenal,W,Brighton and Hove Albion
1,1,2,2023-05-20,Arsenal,Nott'ham Forest,L,Arsenal,2,1,Nottingham Forest,Arsenal,W,Nottingham Forest
2,2,2,2023-05-28,Arsenal,Wolves,W,Arsenal,1,1,Wolverhampton Wanderers,Arsenal,L,Wolves
3,2,2,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal,1,1,Nottingham Forest,Arsenal,L,Nottingham Forest
4,2,2,2023-08-21,Arsenal,Crystal Palace,W,Arsenal,1,1,Crystal Palace,Arsenal,L,Crystal Palace
...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,2,1,2024-02-04,Wolverhampton Wanderers,Chelsea,W,Wolves,1,0,Chelsea,Wolves,L,Chelsea
469,1,2,2024-02-10,Wolverhampton Wanderers,Brentford,L,Wolves,2,1,Brentford,Wolves,W,Brentford
470,2,2,2024-02-17,Wolverhampton Wanderers,Tottenham,W,Wolves,1,2,Tottenham Hotspur,Wolves,L,Tottenham
471,2,2,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W,Wolves,1,1,Sheffield United,Wolves,L,Sheffield United


In [33]:
# Count Team A wins and Team B losses to get the accuracy
merged[(merged["predicted_x"] == 2) & (merged["predicted_y"] == 1)]["actual_x"].value_counts()

actual_x
2    95
1    37
0    28
Name: count, dtype: int64