In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

from boruta import BorutaPy

from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
matches = pd.read_csv("PL_Matches.csv", index_col = 0)

In [3]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,pk,pkatt,oppsh,oppsot,oppdist,oppfk,opppk,opppkatt,season,team
0,13-08-2023,16:30,Premier League,Matchweek 1,Sun,Away,D,1,1,Chelsea,...,0,0,10,4,13.2,1,0,0,2024,Liverpool
1,19-08-2023,15:00,Premier League,Matchweek 2,Sat,Home,W,3,1,Bournemouth,...,0,1,13,5,19.7,1,0,0,2024,Liverpool
2,27-08-2023,16:30,Premier League,Matchweek 3,Sun,Away,W,2,1,Newcastle Utd,...,0,0,23,8,18.3,1,0,0,2024,Liverpool
3,03-09-2023,14:00,Premier League,Matchweek 4,Sun,Home,W,3,0,Aston Villa,...,0,0,9,3,12.5,0,0,0,2024,Liverpool
4,16-09-2023,12:30,Premier League,Matchweek 5,Sat,Away,W,3,1,Wolves,...,0,0,11,2,18.7,0,0,0,2024,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,13-04-2019,12:30,Premier League,Matchweek 34,Sat,Away,L,0,4,Tottenham,...,0,0,22,5,16.1,1,0,0,2019,Huddersfield
36,20-04-2019,15:00,Premier League,Matchweek 35,Sat,Home,L,1,2,Watford,...,0,0,11,6,18.1,1,0,0,2019,Huddersfield
37,26-04-2019,20:00,Premier League,Matchweek 36,Fri,Away,L,0,5,Liverpool,...,0,0,21,7,16.6,0,0,0,2019,Huddersfield
38,05-05-2019,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Manchester Utd,...,0,0,23,7,19.0,2,0,0,2019,Huddersfield


In [4]:
matches["team"].value_counts()

team
Liverpool            214
West Ham             214
Everton              214
Wolves               214
Chelsea              214
Brighton             214
Crystal Palace       214
Newcastle Utd        214
Tottenham            214
Manchester Utd       214
Arsenal              214
Manchester City      213
Leicester City       190
Southampton          190
Aston Villa          176
Burnley              176
Fulham               138
Bournemouth          137
Watford              114
Leeds United         114
Sheffield Utd        100
Brentford             99
Norwich City          76
Nottingham Forest     62
West Brom             38
Cardiff City          38
Huddersfield          38
Luton Town            23
Name: count, dtype: int64

In [5]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss              int64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh                int64
sot               int64
dist            float64
fk                int64
pk                int64
pkatt             int64
oppsh             int64
oppsot            int64
oppdist         float64
oppfk             int64
opppk             int64
opppkatt          int64
season            int64
team             object
dtype: object

In [6]:
matches["date"] = pd.to_datetime(matches["date"], dayfirst = True)

In [7]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [8]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [9]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex = True).astype("int")

In [10]:
matches["day_code"] = matches["date"].dt.dayofweek

In [11]:
matches["round"] = matches["round"].str.replace("Matchweek ", "").astype("int")
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,oppdist,oppfk,opppk,opppkatt,season,team,venue_code,opp_code,hour,day_code
0,2023-08-13,16:30,Premier League,1,Sun,Away,D,1,1,Chelsea,...,13.2,1,0,0,2024,Liverpool,0,7,16,6
1,2023-08-19,15:00,Premier League,2,Sat,Home,W,3,1,Bournemouth,...,19.7,1,0,0,2024,Liverpool,1,2,15,5
2,2023-08-27,16:30,Premier League,3,Sun,Away,W,2,1,Newcastle Utd,...,18.3,1,0,0,2024,Liverpool,0,18,16,6
3,2023-09-03,14:00,Premier League,4,Sun,Home,W,3,0,Aston Villa,...,12.5,0,0,0,2024,Liverpool,1,1,14,6
4,2023-09-16,12:30,Premier League,5,Sat,Away,W,3,1,Wolves,...,18.7,0,0,0,2024,Liverpool,0,27,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,34,Sat,Away,L,0,4,Tottenham,...,16.1,1,0,0,2019,Huddersfield,0,23,12,5
36,2019-04-20,15:00,Premier League,35,Sat,Home,L,1,2,Watford,...,18.1,1,0,0,2019,Huddersfield,1,24,15,5
37,2019-04-26,20:00,Premier League,36,Fri,Away,L,0,5,Liverpool,...,16.6,0,0,0,2019,Huddersfield,0,14,20,4
38,2019-05-05,14:00,Premier League,37,Sun,Home,D,1,1,Manchester Utd,...,19.0,2,0,0,2019,Huddersfield,1,17,14,6


In [12]:
matches["gd"] = matches["gf"] - matches["ga"]

In [13]:
matches["target"] = 0 #same idea as points
matches.loc[matches["result"] == "D", "target"] = 1
matches.loc[matches["result"] == "W", "target"] = 2

In [14]:
matches["points"] = 0
matches.loc[matches["result"] == "D", "points"] = 1
matches.loc[matches["result"] == "W", "points"] = 3

In [15]:
matches["win_rate"] = 0
matches.loc[matches["result"] == "W", "win_rate"] = 1

matches["loss_rate"] = 0
matches.loc[matches["result"] == "L", "loss_rate"] = 1

matches["draw_rate"] = 0
matches.loc[matches["result"] == "D", "draw_rate"] = 1

In [16]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,venue_code,opp_code,hour,day_code,gd,target,points,win_rate,loss_rate,draw_rate
0,2023-08-13,16:30,Premier League,1,Sun,Away,D,1,1,Chelsea,...,0,7,16,6,0,1,1,0,0,1
1,2023-08-19,15:00,Premier League,2,Sat,Home,W,3,1,Bournemouth,...,1,2,15,5,2,2,3,1,0,0
2,2023-08-27,16:30,Premier League,3,Sun,Away,W,2,1,Newcastle Utd,...,0,18,16,6,1,2,3,1,0,0
3,2023-09-03,14:00,Premier League,4,Sun,Home,W,3,0,Aston Villa,...,1,1,14,6,3,2,3,1,0,0
4,2023-09-16,12:30,Premier League,5,Sat,Away,W,3,1,Wolves,...,0,27,12,5,2,2,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,34,Sat,Away,L,0,4,Tottenham,...,0,23,12,5,-4,0,0,0,1,0
36,2019-04-20,15:00,Premier League,35,Sat,Home,L,1,2,Watford,...,1,24,15,5,-1,0,0,0,1,0
37,2019-04-26,20:00,Premier League,36,Fri,Away,L,0,5,Liverpool,...,0,14,20,4,-5,0,0,0,1,0
38,2019-05-05,14:00,Premier League,37,Sun,Home,D,1,1,Manchester Utd,...,1,17,14,6,0,1,1,0,0,1


In [17]:
fifa = pd.read_csv("PL_FIFA_Index.csv")

In [18]:
fifa

Unnamed: 0,season,team,team_att,team_mid,team_def,team_ovr
0,2024,Manchester City,87,86,83,85
1,2024,Liverpool,84,82,84,83
2,2024,Arsenal,82,84,81,82
3,2024,Manchester Utd,82,83,80,82
4,2024,Tottenham,81,80,79,81
...,...,...,...,...,...,...
115,2019,Newcastle Utd,75,77,76,76
116,2019,Southampton,75,77,74,76
117,2019,Fulham,78,76,73,75
118,2019,Huddersfield,72,73,75,74


In [19]:
matches = matches.merge(fifa, left_on = ["season", "team"], right_on = ["season", "team"])

In [20]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,gd,target,points,win_rate,loss_rate,draw_rate,team_att,team_mid,team_def,team_ovr
0,2023-08-13,16:30,Premier League,1,Sun,Away,D,1,1,Chelsea,...,0,1,1,0,0,1,84,82,84,83
1,2023-08-19,15:00,Premier League,2,Sat,Home,W,3,1,Bournemouth,...,2,2,3,1,0,0,84,82,84,83
2,2023-08-27,16:30,Premier League,3,Sun,Away,W,2,1,Newcastle Utd,...,1,2,3,1,0,0,84,82,84,83
3,2023-09-03,14:00,Premier League,4,Sun,Home,W,3,0,Aston Villa,...,3,2,3,1,0,0,84,82,84,83
4,2023-09-16,12:30,Premier League,5,Sat,Away,W,3,1,Wolves,...,2,2,3,1,0,0,84,82,84,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209,2019-04-13,12:30,Premier League,34,Sat,Away,L,0,4,Tottenham,...,-4,0,0,0,1,0,72,73,75,74
4210,2019-04-20,15:00,Premier League,35,Sat,Home,L,1,2,Watford,...,-1,0,0,0,1,0,72,73,75,74
4211,2019-04-26,20:00,Premier League,36,Fri,Away,L,0,5,Liverpool,...,-5,0,0,0,1,0,72,73,75,74
4212,2019-05-05,14:00,Premier League,37,Sun,Home,D,1,1,Manchester Utd,...,0,1,1,0,0,1,72,73,75,74


In [21]:
rf_model = RandomForestClassifier(n_estimators = 30, min_samples_split = 30, random_state = 1)  # 75, 30, 1

In [22]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [23]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(5, closed = 'left').mean() # closed = 'left' ensures that future data is not used to predict the present
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [24]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "poss", "oppsh", "oppsot", "oppdist", "oppfk", "opppk", "opppkatt", "xga", "gd", "win_rate", "loss_rate", "draw_rate", "points"]

# cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "poss", "oppsh", "oppsot", "oppdist", "oppfk", "opppk", "opppkatt", "xga"]
# cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
# cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "poss"]
# control_group = ["gd"]

new_cols = [f"{c}_rolling" for c in cols]

In [25]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling',
 'xg_rolling',
 'poss_rolling',
 'oppsh_rolling',
 'oppsot_rolling',
 'oppdist_rolling',
 'oppfk_rolling',
 'opppk_rolling',
 'opppkatt_rolling',
 'xga_rolling',
 'gd_rolling',
 'win_rate_rolling',
 'loss_rate_rolling',
 'draw_rate_rolling',
 'points_rolling']

In [26]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,oppdist_rolling,oppfk_rolling,opppk_rolling,opppkatt_rolling,xga_rolling,gd_rolling,win_rate_rolling,loss_rate_rolling,draw_rate_rolling,points_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3611,2018-09-23,16:00,Premier League,6,Sun,Home,W,2,0,Everton,...,16.80,0.4,0.0,0.0,1.16,0.2,0.6,0.4,0.0,1.8
Arsenal,3612,2018-09-29,15:00,Premier League,7,Sat,Home,W,2,0,Watford,...,16.58,0.4,0.0,0.0,0.98,1.0,0.8,0.2,0.0,2.4
Arsenal,3613,2018-10-07,12:00,Premier League,8,Sun,Away,W,5,1,Fulham,...,16.02,0.4,0.0,0.0,1.22,1.6,1.0,0.0,0.0,3.0
Arsenal,3614,2018-10-22,20:00,Premier League,9,Mon,Home,W,3,1,Leicester City,...,16.90,0.4,0.0,0.0,1.28,2.0,1.0,0.0,0.0,3.0
Arsenal,3615,2018-10-28,13:30,Premier League,10,Sun,Away,D,2,2,Crystal Palace,...,17.06,0.6,0.0,0.0,1.18,2.2,1.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,258,2023-12-30,15:00,Premier League,20,Sat,Home,W,3,0,Everton,...,17.12,1.0,0.0,0.0,1.50,0.4,0.6,0.2,0.2,2.0
Wolves,259,2024-01-22,19:45,Premier League,21,Mon,Away,D,0,0,Brighton,...,16.88,0.6,0.0,0.0,1.40,0.8,0.6,0.2,0.2,2.0
Wolves,260,2024-02-01,20:15,Premier League,22,Thu,Home,L,3,4,Manchester Utd,...,17.18,0.6,0.0,0.0,1.34,0.8,0.6,0.2,0.2,2.0
Wolves,261,2024-02-04,14:00,Premier League,23,Sun,Away,W,4,2,Chelsea,...,16.86,0.6,0.0,0.0,1.78,1.2,0.6,0.2,0.2,2.0


In [27]:
matches_rolling.index = range(matches_rolling.shape[0])

In [28]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,oppdist_rolling,oppfk_rolling,opppk_rolling,opppkatt_rolling,xga_rolling,gd_rolling,win_rate_rolling,loss_rate_rolling,draw_rate_rolling,points_rolling
0,2018-09-23,16:00,Premier League,6,Sun,Home,W,2,0,Everton,...,16.80,0.4,0.0,0.0,1.16,0.2,0.6,0.4,0.0,1.8
1,2018-09-29,15:00,Premier League,7,Sat,Home,W,2,0,Watford,...,16.58,0.4,0.0,0.0,0.98,1.0,0.8,0.2,0.0,2.4
2,2018-10-07,12:00,Premier League,8,Sun,Away,W,5,1,Fulham,...,16.02,0.4,0.0,0.0,1.22,1.6,1.0,0.0,0.0,3.0
3,2018-10-22,20:00,Premier League,9,Mon,Home,W,3,1,Leicester City,...,16.90,0.4,0.0,0.0,1.28,2.0,1.0,0.0,0.0,3.0
4,2018-10-28,13:30,Premier League,10,Sun,Away,D,2,2,Crystal Palace,...,17.06,0.6,0.0,0.0,1.18,2.2,1.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4054,2023-12-30,15:00,Premier League,20,Sat,Home,W,3,0,Everton,...,17.12,1.0,0.0,0.0,1.50,0.4,0.6,0.2,0.2,2.0
4055,2024-01-22,19:45,Premier League,21,Mon,Away,D,0,0,Brighton,...,16.88,0.6,0.0,0.0,1.40,0.8,0.6,0.2,0.2,2.0
4056,2024-02-01,20:15,Premier League,22,Thu,Home,L,3,4,Manchester Utd,...,17.18,0.6,0.0,0.0,1.34,0.8,0.6,0.2,0.2,2.0
4057,2024-02-04,14:00,Premier League,23,Sun,Away,W,4,2,Chelsea,...,16.86,0.6,0.0,0.0,1.78,1.2,0.6,0.2,0.2,2.0


In [29]:
def make_predictions(data, predictors, model):
    train = data[data["date"] <= "2023-06-10"]
    test = data[data["date"] > "2023-06-10"]

    # Split the DataFrame into training and testing sets
    # train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

    #OG
    train_x = train[predictors]
    train_y = train["target"]
    test_x = test[predictors]
    test_y = test["target"]
    
    #normalizer = preprocessing.Normalizer()
    #normalized_train_x = normalizer.fit_transform(train_x)
    #normalized_test_x = normalizer.transform(test_x)
    
    model.fit(train_x, train_y) #normalized_train_x
    
    preds = model.predict(test_x) #normalized_test_x
    
    combined = pd.DataFrame(dict(actual = test_y, predicted = preds), index = test.index)
    #precision = precision_score(test_y, preds, average = None, zero_division = 0.0)
    #accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds, digits = 4, zero_division = 0.0)
    
    preds_train = model.predict(train_x) #normalized_train_x
    
    combined_train = pd.DataFrame(dict(actual = train_y, predicted = preds_train), index = train.index)
    #precision_train = precision_score(train_y, preds_train, average = None, zero_division = 0.0)
    #accuracy_train = accuracy_score(train_y, preds_train)
    report_train = classification_report(train_y, preds_train, digits = 4, zero_division = 0.0)
    
    return combined, combined_train, report, report_train # precision, accuracy

In [30]:
combined, combined_train, report, report_train = make_predictions(matches_rolling, predictors + new_cols, rf_model)

In [31]:
print(report)

              precision    recall  f1-score   support

           0     0.4890    0.6416    0.5550       173
           1     1.0000    0.0337    0.0652        89
           2     0.5161    0.6054    0.5572       185

    accuracy                         0.5056       447
   macro avg     0.6684    0.4269    0.3925       447
weighted avg     0.6020    0.5056    0.4584       447



In [32]:
print(report_train)

              precision    recall  f1-score   support

           0     0.7035    0.8779    0.7811      1400
           1     0.9953    0.2672    0.4213       801
           2     0.7315    0.8554    0.7886      1411

    accuracy                         0.7337      3612
   macro avg     0.8101    0.6668    0.6637      3612
weighted avg     0.7792    0.7337    0.7042      3612



In [33]:
combined

Unnamed: 0,actual,predicted
185,2,2
186,2,2
187,1,2
188,2,2
189,2,2
...,...,...
4054,2,0
4055,1,0
4056,0,0
4057,2,0


In [34]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [35]:
# NOT NEEDED
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "Sheffield United": "Sheffield Utd",
    "Nottingham Forest": "Nott'ham Forest",
    "West Bromwich Albion": "West Brom",
    "Huddersfield Town": "Huddersfield"
}

mapping = MissingDict(**map_values)

In [36]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
185,2,2,2023-08-12,Arsenal,Nott'ham Forest,W
186,2,2,2023-08-21,Arsenal,Crystal Palace,W
187,1,2,2023-08-26,Arsenal,Fulham,D
188,2,2,2023-09-03,Arsenal,Manchester Utd,W
189,2,2,2023-09-17,Arsenal,Everton,W
...,...,...,...,...,...,...
4054,2,0,2023-12-30,Wolves,Everton,W
4055,1,0,2024-01-22,Wolves,Brighton,D
4056,0,0,2024-02-01,Wolves,Manchester Utd,L
4057,2,0,2024-02-04,Wolves,Chelsea,W


In [37]:
merged = combined.merge(combined, left_on = ["date", "team"], right_on = ["date", "opponent"]) # merging records of the same game from H and A perspectives

In [38]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,actual_y,predicted_y,team_y,opponent_y,result_y
0,2,2,2023-08-21,Arsenal,Crystal Palace,W,0,2,Crystal Palace,Arsenal,L
1,1,2,2023-08-26,Arsenal,Fulham,D,1,0,Fulham,Arsenal,D
2,2,2,2023-09-03,Arsenal,Manchester Utd,W,0,0,Manchester Utd,Arsenal,L
3,2,2,2023-09-17,Arsenal,Everton,W,0,0,Everton,Arsenal,L
4,1,2,2023-09-24,Arsenal,Tottenham,D,1,2,Tottenham,Arsenal,D
...,...,...,...,...,...,...,...,...,...,...,...
413,2,0,2023-12-30,Wolves,Everton,W,0,2,Everton,Wolves,L
414,1,0,2024-01-22,Wolves,Brighton,D,1,2,Brighton,Wolves,D
415,0,0,2024-02-01,Wolves,Manchester Utd,L,2,0,Manchester Utd,Wolves,W
416,2,0,2024-02-04,Wolves,Chelsea,W,0,2,Chelsea,Wolves,L


In [39]:
merged[(merged["predicted_x"] == 2) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
2    76
0    29
1    18
Name: count, dtype: int64

In [40]:
merged[(merged["predicted_x"] == 0) & (merged["predicted_y"] == 2)]["actual_x"].value_counts()

actual_x
0    76
2    29
1    18
Name: count, dtype: int64

In [41]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 1)]["actual_x"].value_counts()

Series([], Name: count, dtype: int64)

In [42]:
combined_train = combined_train.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [43]:
merged_train = combined_train.merge(combined_train, left_on = ["date", "team"], right_on = ["date", "opponent"]) # merging records of the same game from H and A perspectives

In [44]:
merged_train

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,actual_y,predicted_y,team_y,opponent_y,result_y
0,2,2,2018-09-23,Arsenal,Everton,W,0,0,Everton,Arsenal,L
1,2,2,2018-09-29,Arsenal,Watford,W,0,0,Watford,Arsenal,L
2,2,2,2018-10-07,Arsenal,Fulham,W,0,0,Fulham,Arsenal,L
3,2,2,2018-10-22,Arsenal,Leicester City,W,0,0,Leicester City,Arsenal,L
4,1,2,2018-10-28,Arsenal,Crystal Palace,D,1,1,Crystal Palace,Arsenal,D
...,...,...,...,...,...,...,...,...,...,...,...
3519,0,0,2023-04-29,Wolves,Brighton,L,2,2,Brighton,Wolves,W
3520,2,0,2023-05-06,Wolves,Aston Villa,W,0,2,Aston Villa,Wolves,L
3521,0,0,2023-05-13,Wolves,Manchester Utd,L,2,2,Manchester Utd,Wolves,W
3522,1,1,2023-05-20,Wolves,Everton,D,1,0,Everton,Wolves,D


In [45]:
merged_train[(merged_train["predicted_x"] == 2) & (merged_train["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
2    1050
1     121
0      44
Name: count, dtype: int64

In [46]:
merged_train[(merged_train["predicted_x"] == 0) & (merged_train["predicted_y"] == 2)]["actual_x"].value_counts()

actual_x
0    1050
1     121
2      44
Name: count, dtype: int64

In [47]:
merged_train[(merged_train["predicted_x"] == 1) & (merged_train["predicted_y"] == 1)]["actual_x"].value_counts()

actual_x
1    56
Name: count, dtype: int64

In [48]:
fixtures = matches_rolling.merge(matches_rolling, left_on = ["date", "team"], right_on = ["date", "opponent"])

In [49]:
fixtures = fixtures[fixtures["venue_x"] == "Home"]

In [50]:
fixtures.index = range(fixtures.shape[0])

In [51]:
fixtures["target"] = 0 #Home Team = x = 1
fixtures.loc[fixtures["result_x"] == "W", "target"] = 1
fixtures.loc[fixtures["result_x"] == "L", "target"] = 2

In [52]:
fixtures["team_code_x"] = fixtures["team_x"].astype("category").cat.codes
fixtures["team_code_y"] = fixtures["team_y"].astype("category").cat.codes

In [53]:
fixtures

Unnamed: 0,date,time_x,comp_x,round_x,day_x,venue_x,result_x,gf_x,ga_x,opponent_x,...,opppkatt_rolling_y,xga_rolling_y,gd_rolling_y,win_rate_rolling_y,loss_rate_rolling_y,draw_rate_rolling_y,points_rolling_y,target,team_code_x,team_code_y
0,2018-09-23,16:00,Premier League,6,Sun,Home,W,2,0,Everton,...,0.2,1.30,-0.2,0.2,0.2,0.6,1.2,1,0,9
1,2018-09-29,15:00,Premier League,7,Sat,Home,W,2,0,Watford,...,0.0,1.16,0.6,0.6,0.2,0.2,2.0,1,0,23
2,2018-10-22,20:00,Premier League,9,Mon,Home,W,3,1,Leicester City,...,0.2,1.14,0.0,0.4,0.6,0.0,1.2,1,0,13
3,2018-11-03,17:30,Premier League,11,Sat,Home,D,1,1,Liverpool,...,0.2,0.78,1.4,0.6,0.0,0.4,2.2,0,0,14
4,2018-11-11,16:30,Premier League,12,Sun,Home,D,1,1,Wolves,...,0.0,1.12,-0.2,0.4,0.6,0.0,1.2,0,0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,19:30,Premier League,15,Tue,Home,W,1,0,Burnley,...,0.0,1.10,-0.2,0.2,0.8,0.0,0.6,1,26,5
1967,2023-12-24,13:00,Premier League,18,Sun,Home,W,2,1,Chelsea,...,0.2,1.84,-0.6,0.4,0.6,0.0,1.2,1,26,7
1968,2023-12-30,15:00,Premier League,20,Sat,Home,W,3,0,Everton,...,0.2,1.40,0.8,0.6,0.4,0.0,1.8,1,26,9
1969,2024-02-01,20:15,Premier League,22,Thu,Home,L,3,4,Manchester Utd,...,0.0,1.42,-0.4,0.2,0.4,0.4,1.0,2,26,17


In [54]:
fixtures["target"].value_counts()

target
1    878
2    663
0    430
Name: count, dtype: int64

In [55]:
#fixtures.to_csv("Fixtures.csv")

In [56]:
fixtures["team_x"].value_counts()

team_x
Arsenal            103
Manchester Utd     103
Everton            102
West Ham           102
Tottenham          102
Newcastle Utd      102
Liverpool          101
Wolves             100
Chelsea            100
Brighton            99
Crystal Palace      98
Manchester City     98
Leicester City      91
Southampton         90
Burnley             85
Aston Villa         82
Fulham              62
Bournemouth         60
Leeds United        53
Watford             53
Sheffield Utd       46
Brentford           46
Norwich City        35
Cardiff City        17
West Brom           16
Huddersfield        15
Luton Town          10
Name: count, dtype: int64

In [57]:
fixtures["team_y"].value_counts()

team_y
Everton            102
Wolves             102
Chelsea            102
Manchester Utd     102
Liverpool          102
Arsenal            101
West Ham           101
Tottenham          101
Newcastle Utd      100
Manchester City    100
Brighton            99
Crystal Palace      99
Southampton         91
Leicester City      89
Aston Villa         84
Burnley             81
Fulham              64
Bournemouth         61
Watford             55
Leeds United        53
Sheffield Utd       47
Brentford           44
Norwich City        36
Huddersfield        17
West Brom           16
Cardiff City        15
Luton Town           7
Name: count, dtype: int64

In [58]:
betting_odds = pd.read_csv("PL_Betting_Odds.csv")

In [59]:
betting_odds["date"] = pd.to_datetime(betting_odds["date"], dayfirst = True)
betting_odds

Unnamed: 0,season,Div,date,Time,team_x,team_y,FTHG,FTAG,FTR,HTHG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,2024,E0,2023-08-11,20:00,Burnley,Manchester City,0,3,A,0,...,2.28,1.50,1.95,1.98,1.95,1.97,,,1.92,1.95
1,2024,E0,2023-08-12,12:30,Arsenal,Nott'm Forest,2,1,H,2,...,2.63,-2.00,1.95,1.98,1.93,1.97,2.01,2.09,1.95,1.92
2,2024,E0,2023-08-12,15:00,Bournemouth,West Ham,1,1,D,0,...,2.12,0.00,2.02,1.91,2.01,1.92,2.06,1.96,1.96,1.91
3,2024,E0,2023-08-12,15:00,Brighton,Luton Town,4,1,H,1,...,2.48,-1.75,2.01,1.92,2.00,1.91,2.14,1.93,2.00,1.86
4,2024,E0,2023-08-12,15:00,Everton,Fulham,0,1,A,0,...,1.71,-0.25,2.06,1.87,2.04,1.88,2.08,1.99,1.98,1.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133,2019,E0,2019-05-12,,Liverpool,Wolves,2,0,H,1,...,,,,,,,,,,
2134,2019,E0,2019-05-12,,Manchester Utd,Cardiff City,0,2,A,0,...,,,,,,,,,,
2135,2019,E0,2019-05-12,,Southampton,Huddersfield,1,1,D,1,...,,,,,,,,,,
2136,2019,E0,2019-05-12,,Tottenham,Everton,2,2,D,1,...,,,,,,,,,,


In [60]:
betting_odds = betting_odds[["date", "team_x", "team_y", "B365H", "B365D", "B365A"]]
betting_odds

Unnamed: 0,date,team_x,team_y,B365H,B365D,B365A
0,2023-08-11,Burnley,Manchester City,8.00,5.50,1.33
1,2023-08-12,Arsenal,Nott'm Forest,1.18,7.00,15.00
2,2023-08-12,Bournemouth,West Ham,2.70,3.40,2.55
3,2023-08-12,Brighton,Luton Town,1.33,5.50,9.00
4,2023-08-12,Everton,Fulham,2.20,3.40,3.30
...,...,...,...,...,...,...
2133,2019-05-12,Liverpool,Wolves,1.30,6.00,11.00
2134,2019-05-12,Manchester Utd,Cardiff City,1.28,6.50,11.00
2135,2019-05-12,Southampton,Huddersfield,1.44,4.75,8.50
2136,2019-05-12,Tottenham,Everton,2.20,3.50,3.50


In [61]:
fixtures = fixtures.merge(betting_odds, left_on = ["date", "team_x", "team_y"] , right_on = ["date", "team_x", "team_y"])
fixtures

Unnamed: 0,date,time_x,comp_x,round_x,day_x,venue_x,result_x,gf_x,ga_x,opponent_x,...,win_rate_rolling_y,loss_rate_rolling_y,draw_rate_rolling_y,points_rolling_y,target,team_code_x,team_code_y,B365H,B365D,B365A
0,2018-09-23,16:00,Premier League,6,Sun,Home,W,2,0,Everton,...,0.2,0.2,0.6,1.2,1,0,9,1.44,5.00,7.50
1,2018-09-29,15:00,Premier League,7,Sat,Home,W,2,0,Watford,...,0.6,0.2,0.2,2.0,1,0,23,1.44,5.10,7.00
2,2018-10-22,20:00,Premier League,9,Mon,Home,W,3,1,Leicester City,...,0.4,0.6,0.0,1.2,1,0,13,1.53,4.50,6.50
3,2018-11-03,17:30,Premier League,11,Sat,Home,D,1,1,Liverpool,...,0.6,0.0,0.4,2.2,0,0,14,3.90,3.90,1.95
4,2018-11-11,16:30,Premier League,12,Sun,Home,D,1,1,Wolves,...,0.4,0.6,0.0,1.2,0,0,26,1.66,4.20,5.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,19:30,Premier League,15,Tue,Home,W,1,0,Burnley,...,0.2,0.8,0.0,0.6,1,26,5,1.85,3.60,4.20
1967,2023-12-24,13:00,Premier League,18,Sun,Home,W,2,1,Chelsea,...,0.4,0.6,0.0,1.2,1,26,7,3.75,3.75,1.91
1968,2023-12-30,15:00,Premier League,20,Sat,Home,W,3,0,Everton,...,0.6,0.4,0.0,1.8,1,26,9,2.63,3.30,2.70
1969,2024-02-01,20:15,Premier League,22,Thu,Home,L,3,4,Manchester Utd,...,0.2,0.4,0.4,1.0,2,26,17,2.63,3.60,2.55


In [62]:
#gen_predictors = ["team_code_x", "team_code_y", "round_x", "hour_x", "day_code_x"]
#x_predictors_home = ["gf_rolling_x", "ga_rolling_x", "sh_rolling_x", "sot_rolling_x", "dist_rolling_x", "xg_rolling_x", "poss_rolling_x", "oppsh_rolling_x", "oppsot_rolling_x", "oppdist_rolling_x", "xga_rolling_x"]
#y_predictors_away = ["gf_rolling_y", "ga_rolling_y", "sh_rolling_y", "sot_rolling_y", "dist_rolling_y", "xg_rolling_y", "poss_rolling_y", "oppsh_rolling_y", "oppsot_rolling_y", "oppdist_rolling_y", "xga_rolling_y"]
#betting_predictors = []
#control_group = []

gen_predictors = ["team_code_x", "team_code_y", "round_x", "hour_x", "day_code_x"]
x_predictors_home = ["gf_rolling_x", "ga_rolling_x", "sh_rolling_x", "sot_rolling_x", "dist_rolling_x", "fk_rolling_x", "pk_rolling_x", "pkatt_rolling_x", "xg_rolling_x", "poss_rolling_x", "oppsh_rolling_x", "oppsot_rolling_x", "oppdist_rolling_x", "opppkatt_rolling_x", "xga_rolling_x"]
y_predictors_away = ["gf_rolling_y", "ga_rolling_y", "sh_rolling_y", "sot_rolling_y", "dist_rolling_y", "fk_rolling_y", "pk_rolling_y", "pkatt_rolling_y", "xg_rolling_y", "poss_rolling_y", "oppsh_rolling_y", "oppsot_rolling_y", "oppdist_rolling_y", "opppkatt_rolling_y", "xga_rolling_y"]
fifa_predictors = ["team_att_x", "team_mid_x", "team_def_x", "team_ovr_x", "team_att_y", "team_mid_y", "team_def_y", "team_ovr_y"]
betting_predictors = ["B365H", "B365D", "B365A"]
control_group = ["gd_rolling_x", "gd_rolling_y", "win_rate_rolling_x", "win_rate_rolling_y", "loss_rate_rolling_x", "loss_rate_rolling_y", "draw_rate_rolling_x", "draw_rate_rolling_y", "points_rolling_x", "points_rolling_y"]

all_predictors = gen_predictors + x_predictors_home + y_predictors_away + fifa_predictors + betting_predictors +  control_group

In [63]:
cols = ["date", "target"] + all_predictors
fixtures = fixtures[cols]
fixtures

Unnamed: 0,date,target,team_code_x,team_code_y,round_x,hour_x,day_code_x,gf_rolling_x,ga_rolling_x,sh_rolling_x,...,gd_rolling_x,gd_rolling_y,win_rate_rolling_x,win_rate_rolling_y,loss_rate_rolling_x,loss_rate_rolling_y,draw_rate_rolling_x,draw_rate_rolling_y,points_rolling_x,points_rolling_y
0,2018-09-23,1,0,9,6,16,6,2.0,1.8,14.0,...,0.2,-0.2,0.6,0.2,0.4,0.2,0.0,0.6,1.8,1.2
1,2018-09-29,1,0,23,7,15,5,2.4,1.4,14.0,...,1.0,0.6,0.8,0.6,0.2,0.2,0.0,0.2,2.4,2.0
2,2018-10-22,1,0,13,9,20,0,2.8,0.8,11.2,...,2.0,0.0,1.0,0.4,0.0,0.6,0.0,0.0,3.0,1.2
3,2018-11-03,0,0,14,11,17,5,2.8,0.8,10.6,...,2.0,1.4,0.8,0.6,0.0,0.0,0.2,0.4,2.6,2.2
4,2018-11-11,0,0,26,12,16,6,2.6,1.0,11.2,...,1.6,-0.2,0.6,0.4,0.0,0.6,0.4,0.0,2.2,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,1,26,5,15,19,1,1.6,2.0,10.6,...,-0.4,-0.2,0.2,0.2,0.6,0.8,0.2,0.0,0.8,0.6
1967,2023-12-24,1,26,7,18,13,6,1.0,1.8,9.2,...,-0.8,-0.6,0.2,0.4,0.6,0.6,0.2,0.0,0.8,1.2
1968,2023-12-30,1,26,9,20,15,5,1.6,1.2,11.2,...,0.4,0.8,0.6,0.6,0.2,0.4,0.2,0.0,2.0,1.8
1969,2024-02-01,2,26,17,22,20,3,1.8,1.0,11.8,...,0.8,-0.4,0.6,0.2,0.2,0.4,0.2,0.4,2.0,1.0


In [64]:
#Boruta goes here
def Boruta(fixtures, all_predictors):
    model = RandomForestClassifier()
    feat_selector = BorutaPy(model, n_estimators = 'auto', verbose = 2, random_state = 1)

    X = fixtures[all_predictors].values
    y = fixtures["target"].values
    y = y.ravel()
    
    feature_names = all_predictors

    print(feature_names)
    
    feat_selector.fit(X, y)

    feat_ranks = list(zip(feature_names, feat_selector.ranking_, feat_selector.support_))

    return feat_ranks

In [65]:
#run this cell for BorutaPy to work with current NumPy version
np.int = np.int_
np.float = np.float_
np.bool = np.bool_

In [66]:
feat_ranks = Boruta(fixtures, all_predictors)
for feat in feat_ranks:
    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))

['team_code_x', 'team_code_y', 'round_x', 'hour_x', 'day_code_x', 'gf_rolling_x', 'ga_rolling_x', 'sh_rolling_x', 'sot_rolling_x', 'dist_rolling_x', 'fk_rolling_x', 'pk_rolling_x', 'pkatt_rolling_x', 'xg_rolling_x', 'poss_rolling_x', 'oppsh_rolling_x', 'oppsot_rolling_x', 'oppdist_rolling_x', 'opppkatt_rolling_x', 'xga_rolling_x', 'gf_rolling_y', 'ga_rolling_y', 'sh_rolling_y', 'sot_rolling_y', 'dist_rolling_y', 'fk_rolling_y', 'pk_rolling_y', 'pkatt_rolling_y', 'xg_rolling_y', 'poss_rolling_y', 'oppsh_rolling_y', 'oppsot_rolling_y', 'oppdist_rolling_y', 'opppkatt_rolling_y', 'xga_rolling_y', 'team_att_x', 'team_mid_x', 'team_def_x', 'team_ovr_x', 'team_att_y', 'team_mid_y', 'team_def_y', 'team_ovr_y', 'B365H', 'B365D', 'B365A', 'gd_rolling_x', 'gd_rolling_y', 'win_rate_rolling_x', 'win_rate_rolling_y', 'loss_rate_rolling_x', 'loss_rate_rolling_y', 'draw_rate_rolling_x', 'draw_rate_rolling_y', 'points_rolling_x', 'points_rolling_y']
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	56
Reje

In [67]:
selected_predictors = []

for feat in feat_ranks:
    if feat[1] < 20:
        selected_predictors.append(feat[0])

print(selected_predictors)

['sh_rolling_x', 'dist_rolling_x', 'xg_rolling_x', 'poss_rolling_x', 'oppsh_rolling_x', 'oppdist_rolling_x', 'sh_rolling_y', 'dist_rolling_y', 'xg_rolling_y', 'poss_rolling_y', 'oppsh_rolling_y', 'oppdist_rolling_y', 'xga_rolling_y', 'team_att_x', 'team_mid_x', 'team_def_x', 'team_ovr_x', 'team_att_y', 'team_def_y', 'B365H', 'B365D', 'B365A']


In [68]:
rf_model = RandomForestClassifier(n_estimators = 19, min_samples_split = 30, random_state = 1) #19, 30, 1

In [69]:
combined, combined_train, report, report_train = make_predictions(fixtures, selected_predictors, rf_model)

In [70]:
print(report)

              precision    recall  f1-score   support

           0     0.5000    0.0488    0.0889        41
           1     0.6486    0.7423    0.6923        97
           2     0.5426    0.7183    0.6182        71

    accuracy                         0.5981       209
   macro avg     0.5637    0.5031    0.4665       209
weighted avg     0.5834    0.5981    0.5488       209



In [71]:
print(report_train)

              precision    recall  f1-score   support

           0     0.9735    0.3779    0.5444       389
           1     0.7200    0.9155    0.8061       781
           2     0.7443    0.7770    0.7603       592

    accuracy                         0.7503      1762
   macro avg     0.8126    0.6901    0.7036      1762
weighted avg     0.7842    0.7503    0.7330      1762



In [72]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,0
96,1,1
...,...,...
1966,1,1
1967,1,2
1968,1,1
1969,2,1


In [73]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    72
2    20
0    19
Name: count, dtype: int64

In [74]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    51
1    23
0    20
Name: count, dtype: int64

In [75]:
combined[combined["predicted"] == 0]["actual"].value_counts()

actual
1    2
0    2
Name: count, dtype: int64

In [76]:
combined_train[combined_train["predicted"] == 1]["actual"].value_counts()

actual
1    715
0    147
2    131
Name: count, dtype: int64

In [77]:
combined_train[combined_train["predicted"] == 2]["actual"].value_counts()

actual
2    460
0     95
1     63
Name: count, dtype: int64

In [78]:
combined_train[combined_train["predicted"] == 0]["actual"].value_counts()

actual
0    147
1      3
2      1
Name: count, dtype: int64

In [79]:
# RandomForest ends here

In [80]:
lr_model = LogisticRegression(max_iter = 100, multi_class = 'ovr', penalty = 'l2', C = 10)

In [81]:
def make_predictions_normalized(data, predictors, model):
    train = data[data["date"] <= "2023-06-10"]
    test = data[data["date"] > "2023-06-10"]

    # Split the DataFrame into training and testing sets
    # train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

    #OG
    train_x = train[predictors]
    train_y = train["target"]
    test_x = test[predictors]
    test_y = test["target"]
    
    
    normalizer = preprocessing.Normalizer()
    normalized_train_x = normalizer.fit_transform(train_x)
    normalized_test_x = normalizer.transform(test_x)
    
    model.fit(normalized_train_x, train_y)
    
    preds = model.predict(normalized_test_x)
    
    combined = pd.DataFrame(dict(actual = test_y, predicted = preds), index = test.index)
    #precision = precision_score(test_y, preds, average = None)
    #accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds, digits = 4, zero_division = 0.0)
    
    preds_train = model.predict(normalized_train_x)
    
    combined_train = pd.DataFrame(dict(actual = train_y, predicted = preds_train), index = train.index)
    #precision_train = precision_score(train_y, preds_train, average = None)
    #accuracy_train = accuracy_score(train_y, preds_train)
    report_train = classification_report(train_y, preds_train, digits = 4, zero_division = 0.0)
    
    return combined, combined_train, report, report_train # precision, accuracy

In [82]:
combined, combined_train, report, report_train = make_predictions_normalized(fixtures, selected_predictors, lr_model)

In [83]:
print(report)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        41
           1     0.6270    0.8144    0.7085        97
           2     0.5301    0.6197    0.5714        71

    accuracy                         0.5885       209
   macro avg     0.3857    0.4781    0.4266       209
weighted avg     0.4711    0.5885    0.5230       209



In [84]:
print(report_train)

              precision    recall  f1-score   support

           0     0.3333    0.0026    0.0051       389
           1     0.5568    0.7849    0.6514       781
           2     0.5182    0.5760    0.5456       592

    accuracy                         0.5420      1762
   macro avg     0.4694    0.4545    0.4007      1762
weighted avg     0.4945    0.5420    0.4732      1762



In [85]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,2
1968,1,1
1969,2,2


In [86]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    79
2    27
0    20
Name: count, dtype: int64

In [87]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    44
0    21
1    18
Name: count, dtype: int64

In [88]:
combined[combined["predicted"] == 0]["actual"].value_counts()

Series([], Name: count, dtype: int64)

In [89]:
#Logistic Regression ends here

In [90]:
svc_model = SVC(kernel = 'rbf', C = 1, gamma = 'scale', random_state = 1)

In [91]:
combined, combined_train, report, report_train = make_predictions_normalized(fixtures, selected_predictors, svc_model)

In [92]:
print(report)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        41
           1     0.6014    0.8557    0.7064        97
           2     0.5634    0.5634    0.5634        71

    accuracy                         0.5885       209
   macro avg     0.3883    0.4730    0.4233       209
weighted avg     0.4705    0.5885    0.5192       209



In [93]:
print(report_train)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       389
           1     0.5394    0.8412    0.6573       781
           2     0.5533    0.5084    0.5299       592

    accuracy                         0.5437      1762
   macro avg     0.3642    0.4499    0.3958      1762
weighted avg     0.4250    0.5437    0.4694      1762



In [94]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,1
1968,1,1
1969,2,1


In [95]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    83
2    31
0    24
Name: count, dtype: int64

In [96]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    40
0    17
1    14
Name: count, dtype: int64

In [97]:
combined[combined["predicted"] == 0]["actual"].value_counts()

Series([], Name: count, dtype: int64)

In [98]:
combined, combined_train, report, report_train = make_predictions(fixtures, selected_predictors, svc_model)

In [99]:
print(report)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        41
           1     0.6058    0.8557    0.7094        97
           2     0.5556    0.5634    0.5594        71

    accuracy                         0.5885       209
   macro avg     0.3871    0.4730    0.4229       209
weighted avg     0.4699    0.5885    0.5193       209



In [100]:
print(report_train)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       389
           1     0.5417    0.8476    0.6610       781
           2     0.5593    0.5101    0.5336       592

    accuracy                         0.5471      1762
   macro avg     0.3670    0.4526    0.3982      1762
weighted avg     0.4280    0.5471    0.4723      1762



In [101]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,1
1968,1,1
1969,2,1


In [102]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    83
2    31
0    23
Name: count, dtype: int64

In [103]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    40
0    18
1    14
Name: count, dtype: int64

In [104]:
combined[combined["predicted"] == 0]["actual"].value_counts()

Series([], Name: count, dtype: int64)

In [105]:
# SVC ends here
# tune hyperparameters

In [106]:
nb_model = GaussianNB()

In [107]:
combined, combined_train, report, report_train = make_predictions_normalized(fixtures, selected_predictors, nb_model)

In [108]:
print(report)

              precision    recall  f1-score   support

           0     0.2727    0.1463    0.1905        41
           1     0.6321    0.6907    0.6601        97
           2     0.5432    0.6197    0.5789        71

    accuracy                         0.5598       209
   macro avg     0.4827    0.4856    0.4765       209
weighted avg     0.5314    0.5598    0.5404       209



In [109]:
print(report_train)

              precision    recall  f1-score   support

           0     0.2764    0.2288    0.2504       389
           1     0.6258    0.6402    0.6329       781
           2     0.5382    0.5828    0.5596       592

    accuracy                         0.5301      1762
   macro avg     0.4801    0.4839    0.4810      1762
weighted avg     0.5192    0.5301    0.5238      1762



In [110]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,2
1968,1,0
1969,2,2


In [111]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    67
2    23
0    16
Name: count, dtype: int64

In [112]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    44
0    19
1    18
Name: count, dtype: int64

In [113]:
combined[combined["predicted"] == 0]["actual"].value_counts()

actual
1    12
0     6
2     4
Name: count, dtype: int64

In [114]:
combined, combined_train, report, report_train = make_predictions(fixtures, selected_predictors, nb_model)

In [115]:
print(report)

              precision    recall  f1-score   support

           0     0.2286    0.1951    0.2105        41
           1     0.6813    0.6392    0.6596        97
           2     0.5542    0.6479    0.5974        71

    accuracy                         0.5550       209
   macro avg     0.4880    0.4941    0.4892       209
weighted avg     0.5493    0.5550    0.5504       209



In [116]:
print(report_train)

              precision    recall  f1-score   support

           0     0.2734    0.2699    0.2717       389
           1     0.6377    0.5723    0.6032       781
           2     0.5244    0.5997    0.5595       592

    accuracy                         0.5148      1762
   macro avg     0.4785    0.4806    0.4781      1762
weighted avg     0.5192    0.5148    0.5153      1762



In [117]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,0
1967,1,2
1968,1,0
1969,2,2


In [118]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    62
0    15
2    14
Name: count, dtype: int64

In [119]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    46
1    19
0    18
Name: count, dtype: int64

In [120]:
combined[combined["predicted"] == 0]["actual"].value_counts()

actual
1    16
2    11
0     8
Name: count, dtype: int64

In [121]:
#GaussianNB ends here

In [122]:
dt_model = DecisionTreeClassifier(max_depth = 4, min_samples_split = 10, random_state = 1)

In [123]:
combined, combined_train, report, report_train = make_predictions_normalized(fixtures, selected_predictors, dt_model)

In [124]:
print(report)

              precision    recall  f1-score   support

           0     0.2222    0.0488    0.0800        41
           1     0.6032    0.7835    0.6816        97
           2     0.5405    0.5634    0.5517        71

    accuracy                         0.5646       209
   macro avg     0.4553    0.4652    0.4378       209
weighted avg     0.5072    0.5646    0.5195       209



In [125]:
print(report_train)

              precision    recall  f1-score   support

           0     0.6111    0.1131    0.1909       389
           1     0.5840    0.7926    0.6725       781
           2     0.5556    0.5912    0.5728       592

    accuracy                         0.5749      1762
   macro avg     0.5835    0.4990    0.4787      1762
weighted avg     0.5804    0.5749    0.5327      1762



In [126]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,0
96,1,1
...,...,...
1966,1,1
1967,1,2
1968,1,1
1969,2,1


In [127]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    76
2    29
0    21
Name: count, dtype: int64

In [128]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    40
0    18
1    16
Name: count, dtype: int64

In [129]:
combined[combined["predicted"] == 0]["actual"].value_counts()

actual
1    5
0    2
2    2
Name: count, dtype: int64

In [130]:
combined, combined_train, report, report_train = make_predictions(fixtures, selected_predictors, dt_model)

In [131]:
print(report)

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        41
           1     0.5739    0.6804    0.6226        97
           2     0.4889    0.6197    0.5466        71

    accuracy                         0.5263       209
   macro avg     0.3543    0.4334    0.3897       209
weighted avg     0.4324    0.5263    0.4747       209



In [132]:
print(report_train)

              precision    recall  f1-score   support

           0     0.5000    0.0154    0.0299       389
           1     0.5713    0.7951    0.6649       781
           2     0.5460    0.6115    0.5769       592

    accuracy                         0.5613      1762
   macro avg     0.5391    0.4740    0.4239      1762
weighted avg     0.5471    0.5613    0.4951      1762



In [133]:
combined

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,2
1968,1,1
1969,2,2


In [134]:
combined[combined["predicted"] == 1]["actual"].value_counts()

actual
1    66
2    27
0    22
Name: count, dtype: int64

In [135]:
combined[combined["predicted"] == 2]["actual"].value_counts()

actual
2    44
1    27
0    19
Name: count, dtype: int64

In [136]:
combined[combined["predicted"] == 0]["actual"].value_counts()

actual
1    4
Name: count, dtype: int64

In [137]:
#DecisionTree ends here
#reduce overfitting

In [138]:
#HeterogenousClassifier
combined1, combined_train1, report1, report_train1 = make_predictions(fixtures, selected_predictors, rf_model)
combined2, combined_train2, report2, report_train2 = make_predictions_normalized(fixtures, selected_predictors, lr_model)
combined3, combined_train3, report3, report_train3 = make_predictions(fixtures, selected_predictors, svc_model)

In [139]:
combined1.index = range(combined1.shape[0])
combined2.index = range(combined2.shape[0])
combined3.index = range(combined3.shape[0])

In [140]:
combined3

Unnamed: 0,actual,predicted
0,0,1
1,1,1
2,0,1
3,1,2
4,1,1
...,...,...
204,1,1
205,1,1
206,1,1
207,2,1


In [141]:
combined_het = pd.merge(combined1, combined2, left_index=True, right_index=True)
combined_het = pd.merge(combined_het, combined3, left_index=True, right_index=True)
combined_het

Unnamed: 0,actual_x,predicted_x,actual_y,predicted_y,actual,predicted
0,0,1,0,1,0,1
1,1,1,1,1,1,1
2,0,1,0,1,0,1
3,1,0,1,2,1,2
4,1,1,1,1,1,1
...,...,...,...,...,...,...
204,1,1,1,1,1,1
205,1,2,1,2,1,1
206,1,1,1,1,1,1
207,2,1,2,2,2,1


In [142]:
combined_het = combined_het.drop(columns = ["actual_x", "actual_y"])
combined_het.rename(columns = {'predicted': 'predicted_z'}, inplace = True)
combined_het

Unnamed: 0,predicted_x,predicted_y,actual,predicted_z
0,1,1,0,1
1,1,1,1,1
2,1,1,0,1
3,0,2,1,2
4,1,1,1,1
...,...,...,...,...
204,1,1,1,1
205,2,2,1,1
206,1,1,1,1
207,1,2,2,1


In [143]:
#HeterogenousClassifier - "selected" approach

In [144]:
combined_het[(combined_het["predicted_x"] == 1) & (combined_het["predicted_y"] == 1) & (combined_het["predicted_z"] == 1)]["actual"].value_counts()

actual
1    70
0    19
2    17
Name: count, dtype: int64

In [145]:
combined_het[(combined_het["predicted_x"] == 2) & (combined_het["predicted_y"] == 2) & (combined_het["predicted_z"] == 2)]["actual"].value_counts()

actual
2    38
0    16
1    11
Name: count, dtype: int64

In [146]:
#Since only the RandomForest model can somewhat detect draws, we will not consider the values of the other classifiers when RF predicts a draw
combined_het[(combined_het["predicted_x"] == 0)]["actual"].value_counts()

actual
1    2
0    2
Name: count, dtype: int64

In [147]:
#HeterogenousClassifier - "all" approach

In [148]:
def most_frequent(row):
    values = row[['predicted_x', 'predicted_y', 'predicted_z']]
    counter = Counter(values)
    most_common_value = counter.most_common(1)[0][0]
    return most_common_value

combined_het['predicted'] = combined_het.apply(most_frequent, axis=1)

In [149]:
combined_het

Unnamed: 0,predicted_x,predicted_y,actual,predicted_z,predicted
0,1,1,0,1,1
1,1,1,1,1,1
2,1,1,0,1,1
3,0,2,1,2,2
4,1,1,1,1,1
...,...,...,...,...,...
204,1,1,1,1,1
205,2,2,1,1,2
206,1,1,1,1,1
207,1,2,2,1,1


In [150]:
combined_het[(combined_het["predicted"] == 1) & (combined_het["predicted_x"] != 0)]["actual"].value_counts()

actual
1    80
2    28
0    20
Name: count, dtype: int64

In [151]:
combined_het[(combined_het["predicted"] == 2) & (combined_het["predicted_x"] != 0)]["actual"].value_counts()

actual
2    43
0    19
1    15
Name: count, dtype: int64

In [152]:
combined_het[combined_het["predicted_x"] == 0]["actual"].value_counts()

actual
1    2
0    2
Name: count, dtype: int64

In [153]:
# Neural Network Model

In [154]:
# check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [155]:
data = fixtures[["date"] + selected_predictors + ["target"]]
data

Unnamed: 0,date,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
0,2018-09-23,14.0,17.52,1.48,55.2,14.4,16.80,11.8,16.86,1.06,...,82,83,80,82,77,79,1.44,5.00,7.50,1
1,2018-09-29,14.0,17.44,1.54,59.2,13.0,16.58,10.8,17.86,1.16,...,82,83,80,82,78,77,1.44,5.10,7.00,1
2,2018-10-22,11.2,16.54,1.20,62.4,12.6,16.90,12.4,18.34,1.44,...,82,83,80,82,78,79,1.53,4.50,6.50,1
3,2018-11-03,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,...,82,83,80,82,86,82,3.90,3.90,1.95,0
4,2018-11-11,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,...,82,83,80,82,75,76,1.66,4.20,5.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,10.6,16.12,1.26,45.8,11.4,19.56,12.0,16.74,1.06,...,75,77,76,76,69,73,1.85,3.60,4.20,1
1967,2023-12-24,9.2,15.78,0.96,51.4,12.4,18.58,11.6,17.74,1.62,...,75,77,76,76,77,78,3.75,3.75,1.91,1
1968,2023-12-30,11.2,17.20,1.28,49.2,12.6,17.12,12.8,13.96,1.74,...,75,77,76,76,79,75,2.63,3.30,2.70,1
1969,2024-02-01,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,...,75,77,76,76,82,80,2.63,3.60,2.55,2


In [156]:
################# Separation from Home Wins ################

data = data[data["target"] != 1]

data

Unnamed: 0,date,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
3,2018-11-03,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,...,82,83,80,82,86,82,3.90,3.9,1.95,0
4,2018-11-11,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,...,82,83,80,82,75,76,1.66,4.2,5.25,0
15,2019-04-21,10.8,16.56,1.54,55.6,12.2,21.50,9.2,19.12,1.16,...,82,83,80,82,79,76,1.53,4.6,6.50,2
16,2019-05-05,11.0,16.44,1.04,59.6,17.0,16.32,8.4,20.20,0.54,...,82,83,80,82,76,76,1.40,5.0,9.00,0
18,2019-09-01,13.6,17.22,1.60,61.4,15.0,16.50,14.6,23.40,0.98,...,84,82,78,80,85,82,2.37,3.6,2.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1961,2023-09-16,13.6,16.02,1.42,51.0,15.2,14.32,18.8,17.00,2.34,...,75,77,76,76,84,84,6.00,5.0,1.45,2
1963,2023-10-08,8.0,14.22,0.78,44.0,17.8,16.00,14.8,15.64,1.76,...,75,77,76,76,82,79,3.50,3.4,2.10,0
1964,2023-10-28,9.2,13.86,1.14,44.4,16.6,17.22,13.8,14.50,2.56,...,75,77,76,76,79,82,4.20,4.0,1.75,0
1969,2024-02-01,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,...,75,77,76,76,82,80,2.63,3.6,2.55,2


In [157]:
train = data[data["date"] <= "2023-06-10"]
test = data[data["date"] > "2023-06-10"]

In [158]:
test = test.drop(columns=['date'])
test

Unnamed: 0,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,poss_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
92,13.4,16.40,1.42,60.8,8.8,16.04,9.4,17.92,1.38,54.6,...,82,84,81,82,76,76,1.20,7.00,13.00,0
94,15.2,16.34,1.86,66.2,9.2,17.92,20.2,17.14,1.88,60.6,...,82,84,81,82,81,79,1.67,4.33,4.33,0
100,18.6,15.46,1.88,56.2,8.2,17.48,10.8,16.70,1.38,39.6,...,82,84,81,82,78,78,1.33,5.50,7.50,2
181,14.6,18.44,1.34,50.4,10.2,12.62,7.4,17.96,0.68,34.2,...,82,79,79,80,72,72,1.22,6.50,12.00,0
183,14.2,15.98,1.68,66.0,8.2,16.72,15.4,14.50,1.84,52.8,...,82,79,79,80,79,82,1.85,4.00,3.80,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1961,13.6,16.02,1.42,51.0,15.2,14.32,18.8,17.00,2.34,54.4,...,75,77,76,76,84,84,6.00,5.00,1.45,2
1963,8.0,14.22,0.78,44.0,17.8,16.00,14.8,15.64,1.76,46.4,...,75,77,76,76,82,79,3.50,3.40,2.10,0
1964,9.2,13.86,1.14,44.4,16.6,17.22,13.8,14.50,2.56,58.0,...,75,77,76,76,79,82,4.20,4.00,1.75,0
1969,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,47.2,...,75,77,76,76,82,80,2.63,3.60,2.55,2


In [159]:
train = train.drop(columns=['date'])
train

Unnamed: 0,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,poss_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
3,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,58.6,...,82,83,80,82,86,82,3.90,3.9,1.95,0
4,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,47.2,...,82,83,80,82,75,76,1.66,4.2,5.25,0
15,10.8,16.56,1.54,55.6,12.2,21.50,9.2,19.12,1.16,44.0,...,82,83,80,82,79,76,1.53,4.6,6.50,2
16,11.0,16.44,1.04,59.6,17.0,16.32,8.4,20.20,0.54,44.0,...,82,83,80,82,76,76,1.40,5.0,9.00,0
18,13.6,17.22,1.60,61.4,15.0,16.50,14.6,23.40,0.98,59.2,...,84,82,78,80,85,82,2.37,3.6,2.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1948,11.8,17.50,1.20,47.6,11.8,16.90,13.8,15.72,1.72,55.8,...,78,80,74,78,81,80,6.00,4.2,1.55,2
1949,11.8,18.60,1.08,42.2,12.4,16.04,13.6,18.70,1.48,55.0,...,78,80,74,78,82,81,4.00,3.6,1.91,2
1952,12.0,17.52,1.08,49.4,16.0,17.92,10.6,16.20,1.04,39.6,...,78,80,74,78,72,73,1.67,3.6,6.00,2
1954,8.4,16.58,0.90,52.6,14.0,17.66,12.2,15.92,1.02,45.0,...,78,80,74,78,77,75,2.25,3.2,3.30,2


In [160]:
X_train = train[selected_predictors].to_numpy()
Y_train = train["target"].to_numpy()

In [161]:
X_test = test[selected_predictors].to_numpy()
Y_test = test["target"].to_numpy()

In [162]:
# standardize the data using Standard sclaer
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [163]:
# convert data to PyTorch tensors and move it to GPU
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

In [164]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        #self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        #x = F.relu(self.fc6(x))
        x = F.sigmoid(self.fc6(x))
        return x

In [165]:
# define hyperparameters
input_size = X_train.shape[1]
hidden_size = 256
learning_rate = 0.00009
num_epochs = 100

# Setting the seed
seed = 42

torch.manual_seed(seed)

# If you are using CUDA
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.

# Set the seed for NumPy
np.random.seed(seed)

# Set the seed for the random module in Python
# random.seed(42)

In [166]:
# initialize the neural network and move it the GPU
model = NeuralNet(input_size, hidden_size).to(device)

In [167]:
# define loss and the optiizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
Y = (Y_train / 2).float().to(device)

In [168]:
X_train.shape

torch.Size([981, 22])

In [169]:
Y.shape

torch.Size([981])

In [170]:
X_test.shape

torch.Size([112, 22])

In [171]:
Y_test.shape

torch.Size([112])

In [172]:
# Get unique values and their counts
unique_values, counts = torch.unique(Y, return_counts=True)

# Display the counts
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0.0, Count: 389
Value: 1.0, Count: 592


In [173]:
# Get unique values and their counts
unique_values, counts = torch.unique(Y_test, return_counts=True)

# Display the counts
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0.0, Count: 41
Value: 2.0, Count: 71


In [174]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    outputs = outputs.view(-1)
    loss = criterion(outputs, Y)
    loss.backward()
    optimizer.step()

    # Calculate accuracy
    with torch.no_grad():
        predicted = (outputs >= 0.5).float()
        correct = (predicted == Y).sum().item()
        accuracy = correct / len(Y)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%")

Epoch [10/100], Loss: 0.6873, Accuracy: 60.35%
Epoch [20/100], Loss: 0.6788, Accuracy: 60.35%
Epoch [30/100], Loss: 0.6665, Accuracy: 60.35%
Epoch [40/100], Loss: 0.6509, Accuracy: 60.35%
Epoch [50/100], Loss: 0.6374, Accuracy: 61.98%
Epoch [60/100], Loss: 0.6286, Accuracy: 64.53%
Epoch [70/100], Loss: 0.6210, Accuracy: 65.55%
Epoch [80/100], Loss: 0.6125, Accuracy: 65.85%
Epoch [90/100], Loss: 0.6019, Accuracy: 67.28%
Epoch [100/100], Loss: 0.5862, Accuracy: 69.32%


In [175]:
# Train
model.eval()
with torch.no_grad():
    outputs = model(X_train)
    outputs = outputs.view(-1)
    predicted = (outputs >= 0.5).float()
    predicted = predicted * 2

    correct = (predicted == Y_train).float().sum()
    accuracy = correct / Y_train.size(0)

    draw_correct = ((predicted == 0) & (Y_train == 0)).float().sum()
    away_win_correct = ((predicted == 2) & (Y_train == 2)).float().sum()

    draw_total = (Y_train == 0).float().sum()
    away_win_total = (Y_train == 2).float().sum()

    draw_accuracy = draw_correct / draw_total if draw_total > 0 else 0
    away_win_accuracy = away_win_correct / away_win_total if away_win_total > 0 else 0

    print(f"Overall Accuracy: {accuracy.item() * 100:.2f}%")
    print(f"Draw Accuracy: {draw_accuracy.item() * 100:.2f}%")

    if isinstance(away_win_accuracy, torch.Tensor):
        print(f"Away Win Accuracy: {away_win_accuracy.item() * 100:.2f}%")
    else:
        print(f"Away Win Accuracy: {away_win_accuracy * 100:.2f}%")

    Y_train_cpu = Y_train.cpu()
    predicted_cpu = predicted.cpu()
    Y_train_numpy = Y_train_cpu.numpy()
    predicted_numpy = predicted_cpu.numpy()

    report = classification_report(Y_train_numpy, predicted_numpy, digits=4, zero_division=0.0)
    print(report)

Overall Accuracy: 69.62%
Draw Accuracy: 49.10%
Away Win Accuracy: 83.11%
              precision    recall  f1-score   support

         0.0     0.6564    0.4910    0.5618       389
         2.0     0.7130    0.8311    0.7676       592

    accuracy                         0.6962       981
   macro avg     0.6847    0.6610    0.6647       981
weighted avg     0.6906    0.6962    0.6859       981



In [176]:
# Test
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    outputs = outputs.view(-1)
    predicted = (outputs >= 0.5).float()
    predicted = predicted * 2

    correct = (predicted == Y_test).float().sum()
    accuracy = correct / Y_test.size(0)

    draw_correct = ((predicted == 0) & (Y_test == 0)).float().sum()
    away_win_correct = ((predicted == 2) & (Y_test == 2)).float().sum()

    draw_total = (Y_test == 0).float().sum()
    away_win_total = (Y_test == 2).float().sum()

    draw_accuracy = draw_correct / draw_total if draw_total > 0 else 0
    away_win_accuracy = away_win_correct / away_win_total if away_win_total > 0 else 0

    print(f"Overall Accuracy: {accuracy.item() * 100:.2f}%")
    print(f"Draw Accuracy: {draw_accuracy.item() * 100:.2f}%")
    print(f"Away Win Accuracy: {away_win_accuracy.item() * 100:.2f}%")

    Y_test_cpu = Y_test.cpu()
    predicted_cpu = predicted.cpu()
    Y_test_numpy = Y_test_cpu.numpy()
    predicted_numpy = predicted_cpu.numpy()
    report = classification_report(Y_test_numpy, predicted_numpy, digits=4, zero_division=0.0)
    print(report)

Overall Accuracy: 68.75%
Draw Accuracy: 36.59%
Away Win Accuracy: 87.32%
              precision    recall  f1-score   support

         0.0     0.6250    0.3659    0.4615        41
         2.0     0.7045    0.8732    0.7799        71

    accuracy                         0.6875       112
   macro avg     0.6648    0.6195    0.6207       112
weighted avg     0.6754    0.6875    0.6633       112



In [177]:
#HeterogenousClassifier
combined1, combined_train1, report1, report_train1 = make_predictions(fixtures, selected_predictors, rf_model)
combined2, combined_train2, report2, report_train2 = make_predictions_normalized(fixtures, selected_predictors, lr_model)
combined3, combined_train3, report3, report_train3 = make_predictions(fixtures, selected_predictors, svc_model)

In [178]:
#combined1.index = range(combined1.shape[0])
#combined2.index = range(combined2.shape[0])
#combined3.index = range(combined3.shape[0])

In [179]:
combined3

Unnamed: 0,actual,predicted
92,0,1
93,1,1
94,0,1
95,1,2
96,1,1
...,...,...
1966,1,1
1967,1,1
1968,1,1
1969,2,1


In [180]:
combined_het = pd.concat([combined1, combined2, combined3], axis=1)
combined_het

Unnamed: 0,actual,predicted,actual.1,predicted.1,actual.2,predicted.2
92,0,1,0,1,0,1
93,1,1,1,1,1,1
94,0,1,0,1,0,1
95,1,0,1,2,1,2
96,1,1,1,1,1,1
...,...,...,...,...,...,...
1966,1,1,1,1,1,1
1967,1,2,1,2,1,1
1968,1,1,1,1,1,1
1969,2,1,2,2,2,1


In [181]:
# Define new column names based on their indices
new_column_names = {0: 'actual', 1: 'predicted_x', 2: 'actual_y', 3: 'predicted_y', 4: 'actual_x', 5: 'predicted_z'}

# Rename columns by their indices
combined_het.columns = [new_column_names.get(i, col) for i, col in enumerate(combined_het.columns)]

In [182]:
combined_het

Unnamed: 0,actual,predicted_x,actual_y,predicted_y,actual_x,predicted_z
92,0,1,0,1,0,1
93,1,1,1,1,1,1
94,0,1,0,1,0,1
95,1,0,1,2,1,2
96,1,1,1,1,1,1
...,...,...,...,...,...,...
1966,1,1,1,1,1,1
1967,1,2,1,2,1,1
1968,1,1,1,1,1,1
1969,2,1,2,2,2,1


In [183]:
combined_het = combined_het.drop(columns = ["actual_x", "actual_y"])
combined_het

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z
92,0,1,1,1
93,1,1,1,1
94,0,1,1,1
95,1,0,2,2
96,1,1,1,1
...,...,...,...,...
1966,1,1,1,1
1967,1,2,2,1
1968,1,1,1,1
1969,2,1,2,1


In [184]:
combined_het2 = combined_het
combined_het2

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z
92,0,1,1,1
93,1,1,1,1
94,0,1,1,1
95,1,0,2,2
96,1,1,1,1
...,...,...,...,...
1966,1,1,1,1
1967,1,2,2,1
1968,1,1,1,1
1969,2,1,2,1


In [185]:
# Majority Protocol Approach

In [186]:
def most_frequent(row):
    values = row[['predicted_x', 'predicted_y', 'predicted_z']]
    counter = Counter(values)
    most_common_value = counter.most_common(1)[0][0]
    return most_common_value

combined_het['predicted'] = combined_het.apply(most_frequent, axis=1)

In [187]:
combined_het

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
92,0,1,1,1,1
93,1,1,1,1,1
94,0,1,1,1,1
95,1,0,2,2,2
96,1,1,1,1,1
...,...,...,...,...,...
1966,1,1,1,1,1
1967,1,2,2,1,2
1968,1,1,1,1,1
1969,2,1,2,1,1


In [188]:
combined_het[combined_het["predicted"] == 1]["actual"].value_counts()

actual
1    80
2    28
0    20
Name: count, dtype: int64

In [189]:
nn_candidates = combined_het[combined_het["predicted"] != 1]
nn_candidates

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
95,1,0,2,2,2
179,1,2,2,2,2
180,1,2,2,2,2
235,0,2,2,1,2
236,2,2,2,2,2
...,...,...,...,...,...
1962,1,2,2,2,2
1963,0,2,2,2,2
1964,0,2,2,2,2
1965,1,2,2,2,2


In [190]:
data1 = fixtures[selected_predictors + ["target"]]
data1

Unnamed: 0,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,poss_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
0,14.0,17.52,1.48,55.2,14.4,16.80,11.8,16.86,1.06,52.4,...,82,83,80,82,77,79,1.44,5.00,7.50,1
1,14.0,17.44,1.54,59.2,13.0,16.58,10.8,17.86,1.16,40.4,...,82,83,80,82,78,77,1.44,5.10,7.00,1
2,11.2,16.54,1.20,62.4,12.6,16.90,12.4,18.34,1.44,55.2,...,82,83,80,82,78,79,1.53,4.50,6.50,1
3,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,58.6,...,82,83,80,82,86,82,3.90,3.90,1.95,0
4,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,47.2,...,82,83,80,82,75,76,1.66,4.20,5.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,10.6,16.12,1.26,45.8,11.4,19.56,12.0,16.74,1.06,54.4,...,75,77,76,76,69,73,1.85,3.60,4.20,1
1967,9.2,15.78,0.96,51.4,12.4,18.58,11.6,17.74,1.62,56.4,...,75,77,76,76,77,78,3.75,3.75,1.91,1
1968,11.2,17.20,1.28,49.2,12.6,17.12,12.8,13.96,1.74,34.6,...,75,77,76,76,79,75,2.63,3.30,2.70,1
1969,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,47.2,...,75,77,76,76,82,80,2.63,3.60,2.55,2


In [191]:
selected_records = data1.loc[nn_candidates.index]

In [192]:
selected_records

Unnamed: 0,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,poss_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
95,14.6,15.72,2.34,60.8,9.4,17.20,18.8,17.00,2.30,68.0,...,82,84,81,82,87,83,2.90,3.30,2.40,1
179,13.6,15.56,1.78,58.8,10.8,17.10,17.6,15.98,2.54,58.8,...,82,79,79,80,87,83,4.75,4.20,1.70,1
180,14.6,16.44,1.78,54.0,9.8,15.12,17.4,16.16,1.80,61.6,...,82,79,79,80,82,81,3.30,3.60,2.10,1
235,8.8,15.42,0.98,44.2,15.2,16.56,12.0,18.00,1.08,44.8,...,75,75,74,75,78,78,2.70,3.40,2.55,0
236,9.8,18.10,0.88,48.4,18.2,16.80,14.4,16.72,1.68,57.0,...,75,75,74,75,81,79,3.80,3.75,1.91,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962,10.6,15.16,1.12,47.0,16.4,15.26,17.0,16.72,2.32,66.2,...,75,77,76,76,87,83,8.00,5.75,1.33,1
1963,8.0,14.22,0.78,44.0,17.8,16.00,14.8,15.64,1.76,46.4,...,75,77,76,76,82,79,3.50,3.40,2.10,0
1964,9.2,13.86,1.14,44.4,16.6,17.22,13.8,14.50,2.56,58.0,...,75,77,76,76,79,82,4.20,4.00,1.75,0
1965,10.6,15.66,1.32,49.6,14.0,18.96,14.4,19.38,1.52,60.0,...,75,77,76,76,81,79,3.10,3.75,2.20,1


In [193]:
X_test = selected_records[selected_predictors].to_numpy()
Y_test = selected_records["target"].to_numpy()

In [194]:
# standardize the data using Standard sclaer

X_test = scaler.transform(X_test)

In [195]:
# convert data to PyTorch tensors and move it to GPU
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

In [196]:
X_test.shape

torch.Size([81, 22])

In [197]:
Y_test.shape

torch.Size([81])

In [198]:
# Get unique values and their counts
unique_values, counts = torch.unique(Y_test, return_counts=True)

# Display the counts
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0.0, Count: 21
Value: 1.0, Count: 17
Value: 2.0, Count: 43


In [199]:
# Test
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    outputs = outputs.view(-1)
    predicted = (outputs >= 0.5).float()
    predicted = predicted * 2

    correct = (predicted == Y_test).float().sum()
    accuracy = correct / Y_test.size(0)

    draw_correct = ((predicted == 0) & (Y_test == 0)).float().sum()
    away_win_correct = ((predicted == 2) & (Y_test == 2)).float().sum()

    draw_total = (Y_test == 0).float().sum()
    away_win_total = (Y_test == 2).float().sum()

    draw_accuracy = draw_correct / draw_total if draw_total > 0 else 0
    away_win_accuracy = away_win_correct / away_win_total if away_win_total > 0 else 0

    print(f"Overall Accuracy: {accuracy.item() * 100:.2f}%")
    print(f"Draw Accuracy: {draw_accuracy.item() * 100:.2f}%")
    print(f"Away Win Accuracy: {away_win_accuracy.item() * 100:.2f}%")

    Y_test_cpu = Y_test.cpu()
    predicted_cpu = predicted.cpu()
    Y_test_numpy = Y_test_cpu.numpy()
    predicted_numpy = predicted_cpu.numpy()
    report = classification_report(Y_test_numpy, predicted_numpy, digits=4, zero_division=0.0)
    print(report)

Overall Accuracy: 53.09%
Draw Accuracy: 4.76%
Away Win Accuracy: 97.67%
              precision    recall  f1-score   support

         0.0     0.5000    0.0476    0.0870        21
         1.0     0.0000    0.0000    0.0000        17
         2.0     0.5316    0.9767    0.6885        43

    accuracy                         0.5309        81
   macro avg     0.3439    0.3415    0.2585        81
weighted avg     0.4119    0.5309    0.3881        81



In [200]:
# Selected Protocol Approach

In [201]:
data = fixtures[["date"] + selected_predictors + ["target"]]
data

Unnamed: 0,date,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
0,2018-09-23,14.0,17.52,1.48,55.2,14.4,16.80,11.8,16.86,1.06,...,82,83,80,82,77,79,1.44,5.00,7.50,1
1,2018-09-29,14.0,17.44,1.54,59.2,13.0,16.58,10.8,17.86,1.16,...,82,83,80,82,78,77,1.44,5.10,7.00,1
2,2018-10-22,11.2,16.54,1.20,62.4,12.6,16.90,12.4,18.34,1.44,...,82,83,80,82,78,79,1.53,4.50,6.50,1
3,2018-11-03,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,...,82,83,80,82,86,82,3.90,3.90,1.95,0
4,2018-11-11,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,...,82,83,80,82,75,76,1.66,4.20,5.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,10.6,16.12,1.26,45.8,11.4,19.56,12.0,16.74,1.06,...,75,77,76,76,69,73,1.85,3.60,4.20,1
1967,2023-12-24,9.2,15.78,0.96,51.4,12.4,18.58,11.6,17.74,1.62,...,75,77,76,76,77,78,3.75,3.75,1.91,1
1968,2023-12-30,11.2,17.20,1.28,49.2,12.6,17.12,12.8,13.96,1.74,...,75,77,76,76,79,75,2.63,3.30,2.70,1
1969,2024-02-01,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,...,75,77,76,76,82,80,2.63,3.60,2.55,2


In [202]:
test = data[data["date"] > "2023-06-10"]
test = test.drop(columns=['date'])
X_test = test[selected_predictors].to_numpy()
Y_test = test["target"].to_numpy()
X_test = scaler.transform(X_test)

In [203]:
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

In [204]:
X_test.shape

torch.Size([209, 22])

In [205]:
unique_values, counts = torch.unique(Y_test, return_counts=True)

# Display the counts
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0.0, Count: 41
Value: 1.0, Count: 97
Value: 2.0, Count: 71


In [206]:
# Test
# Set the model to evaluation mode
model.eval()

# Disable gradient calculation for inference
with torch.no_grad():
    # Get model outputs
    outputs = model(X_test)
    outputs = outputs.view(-1)
    
    # Convert model outputs to binary predictions
    predicted = (outputs >= 0.5).float()
    predicted = predicted * 2

    # Calculate overall accuracy
    correct = (predicted == Y_test).float().sum()
    accuracy = correct / Y_test.size(0)

    # Calculate accuracy for draws
    draw_correct = ((predicted == 0) & (Y_test == 0)).float().sum()
    draw_total = (Y_test == 0).float().sum()
    draw_accuracy = draw_correct / draw_total if draw_total > 0 else 0

    # Calculate accuracy for away wins
    away_win_correct = ((predicted == 2) & (Y_test == 2)).float().sum()
    away_win_total = (Y_test == 2).float().sum()
    away_win_accuracy = away_win_correct / away_win_total if away_win_total > 0 else 0

    # Print accuracy results
    print(f"Overall Accuracy: {accuracy.item() * 100:.2f}%")
    print(f"Draw Accuracy: {draw_accuracy.item() * 100:.2f}%")
    print(f"Away Win Accuracy: {away_win_accuracy.item() * 100:.2f}%")

    # Convert tensors to numpy arrays for classification report
    Y_test_numpy = Y_test.cpu().numpy()
    predicted_numpy = predicted.cpu().numpy()

    # Generate and print classification report
    report = classification_report(Y_test_numpy, predicted_numpy, digits=4, zero_division=0.0)
    print(report)

    # Get the indices of records predicted to be 0 (draw)
    draw_indices = torch.nonzero(predicted == 0).squeeze().cpu().numpy()

# Return the indices
draw_indices

Overall Accuracy: 36.84%
Draw Accuracy: 36.59%
Away Win Accuracy: 87.32%
              precision    recall  f1-score   support

         0.0     0.2027    0.3659    0.2609        41
         1.0     0.0000    0.0000    0.0000        97
         2.0     0.4593    0.8732    0.6019        71

    accuracy                         0.3684       209
   macro avg     0.2207    0.4130    0.2876       209
weighted avg     0.1958    0.3684    0.2557       209



array([  0,   2,   4,   5,   6,   7,   8,   9,  11,  12,  15,  16,  19,
        20,  35,  37,  40,  42,  48,  49,  51,  72,  73,  74,  82,  85,
        86,  91, 106, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120,
       131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 143, 144, 145,
       146, 148, 149, 151, 152, 154, 155, 156, 160, 162, 176, 178, 179,
       183, 184, 185, 189, 192, 193, 195, 196, 204], dtype=int64)

In [207]:
draw_indices = [  3,  10,  18,  22,  23,  34,  63,  66,  69,  80,  84,  95,  98,
       108, 109, 149, 150, 153, 168, 173, 180, 181, 187, 195, 196]

In [208]:
combined_het2

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
92,0,1,1,1,1
93,1,1,1,1,1
94,0,1,1,1,1
95,1,0,2,2,2
96,1,1,1,1,1
...,...,...,...,...,...
1966,1,1,1,1,1
1967,1,2,2,1,2
1968,1,1,1,1,1
1969,2,1,2,1,1


In [209]:
combined_het2.reset_index(drop=True, inplace=True)

In [210]:
combined_het2

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
3,1,0,2,2,2
4,1,1,1,1,1
...,...,...,...,...,...
204,1,1,1,1,1
205,1,2,2,1,2
206,1,1,1,1,1
207,2,1,2,1,1


In [211]:
combined_het2.drop(draw_indices)

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
4,1,1,1,1,1
5,1,1,1,1,1
...,...,...,...,...,...
204,1,1,1,1,1
205,1,2,2,1,2
206,1,1,1,1,1
207,2,1,2,1,1


In [212]:
combined_het2[(combined_het2["predicted_x"] == 1) & (combined_het2["predicted_y"] == 1) & (combined_het2["predicted_z"] == 1)]["actual"].value_counts()

actual
1    70
0    19
2    17
Name: count, dtype: int64

In [213]:
combined_het2[(combined_het2["predicted_x"] == 2) & (combined_het2["predicted_y"] == 2) & (combined_het2["predicted_z"] == 2)]["actual"].value_counts()

actual
2    38
0    16
1    11
Name: count, dtype: int64

In [214]:
home = combined_het2[(combined_het2["predicted_x"] == 1) & (combined_het2["predicted_y"] == 1) & (combined_het2["predicted_z"] == 1)]
home

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
4,1,1,1,1,1
5,1,1,1,1,1
...,...,...,...,...,...
192,0,1,1,1,1
193,1,1,1,1,1
194,1,1,1,1,1
204,1,1,1,1,1


In [215]:
away = combined_het2[(combined_het2["predicted_x"] == 2) & (combined_het2["predicted_y"] == 2) & (combined_het2["predicted_z"] == 2)]
away

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
17,1,2,2,2,2
18,1,2,2,2,2
24,2,2,2,2,2
25,0,2,2,2,2
26,2,2,2,2,2
...,...,...,...,...,...
199,2,2,2,2,2
200,1,2,2,2,2
201,0,2,2,2,2
202,0,2,2,2,2


In [216]:
het2_filtered = combined_het2.drop(home.index, errors='ignore')
het2_filtered = het2_filtered.drop(away.index, errors='ignore')
het2_filtered

Unnamed: 0,actual,predicted_x,predicted_y,predicted_z,predicted
3,1,0,2,2,2
22,2,2,1,1,1
23,0,2,2,1,2
31,1,2,1,1,1
34,0,0,2,2,2
45,1,2,2,1,2
52,1,2,2,1,2
60,2,2,2,1,2
63,2,2,2,1,2
81,2,2,1,1,1


In [208]:
data2 = fixtures[["date"] + selected_predictors + ["target"]]
data2

Unnamed: 0,date,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
0,2018-09-23,14.0,17.52,1.48,55.2,14.4,16.80,11.8,16.86,1.06,...,82,83,80,82,77,79,1.44,5.00,7.50,1
1,2018-09-29,14.0,17.44,1.54,59.2,13.0,16.58,10.8,17.86,1.16,...,82,83,80,82,78,77,1.44,5.10,7.00,1
2,2018-10-22,11.2,16.54,1.20,62.4,12.6,16.90,12.4,18.34,1.44,...,82,83,80,82,78,79,1.53,4.50,6.50,1
3,2018-11-03,10.6,17.56,1.38,60.4,13.4,19.36,12.4,18.12,1.34,...,82,83,80,82,86,82,3.90,3.90,1.95,0
4,2018-11-11,11.2,16.96,1.50,60.2,14.0,19.40,14.0,18.96,1.42,...,82,83,80,82,75,76,1.66,4.20,5.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,2023-12-05,10.6,16.12,1.26,45.8,11.4,19.56,12.0,16.74,1.06,...,75,77,76,76,69,73,1.85,3.60,4.20,1
1967,2023-12-24,9.2,15.78,0.96,51.4,12.4,18.58,11.6,17.74,1.62,...,75,77,76,76,77,78,3.75,3.75,1.91,1
1968,2023-12-30,11.2,17.20,1.28,49.2,12.6,17.12,12.8,13.96,1.74,...,75,77,76,76,79,75,2.63,3.30,2.70,1
1969,2024-02-01,11.8,17.40,1.48,41.6,12.8,17.18,9.8,16.88,0.90,...,75,77,76,76,82,80,2.63,3.60,2.55,2


In [209]:
selected_records2 = data2.loc[het2_filtered.index]
selected_records2

Unnamed: 0,date,sh_rolling_x,dist_rolling_x,xg_rolling_x,poss_rolling_x,oppsh_rolling_x,oppdist_rolling_x,sh_rolling_y,dist_rolling_y,xg_rolling_y,...,team_att_x,team_mid_x,team_def_x,team_ovr_x,team_att_y,team_def_y,B365H,B365D,B365A,target
95,2023-10-08,14.6,15.72,2.34,60.8,9.4,17.2,18.8,17.0,2.3,...,82,84,81,82,87,83,2.9,3.3,2.4,1
184,2024-02-11,14.0,15.16,1.94,62.0,11.2,14.5,13.0,17.72,1.34,...,82,79,79,80,82,80,2.15,3.75,3.1,2
235,2023-08-12,8.8,15.42,0.98,44.2,15.2,16.56,12.0,18.0,1.08,...,75,75,74,75,78,78,2.7,3.4,2.55,0
243,2023-12-26,15.4,13.88,1.96,46.2,13.6,16.58,12.4,17.54,1.52,...,75,75,74,75,76,76,2.15,3.6,3.2,1
281,2023-08-26,14.4,15.34,2.38,40.6,14.2,18.62,16.2,16.86,1.48,...,75,76,76,76,75,76,2.15,3.4,3.5,0
380,2023-09-02,19.0,17.54,2.26,60.0,12.2,15.42,16.6,16.96,1.88,...,77,77,78,77,79,82,2.5,3.8,2.55,1
387,2023-12-28,17.8,16.68,1.42,63.6,11.4,15.54,15.6,16.18,1.82,...,77,77,78,77,81,79,2.5,3.6,2.6,1
468,2023-11-04,8.8,18.58,0.86,48.2,16.2,16.34,10.6,16.52,0.76,...,69,74,73,74,75,76,2.75,3.25,2.6,2
471,2023-12-16,11.0,17.44,0.98,45.4,13.8,15.18,14.8,16.38,1.82,...,69,74,73,74,79,75,3.6,3.4,2.1,2
683,2023-11-11,10.0,15.52,0.96,34.8,14.4,15.38,14.8,19.44,1.38,...,75,76,76,76,79,75,2.45,3.2,3.0,2


In [210]:
X_test = selected_records2[selected_predictors].to_numpy()
Y_test = selected_records2["target"].to_numpy()

In [211]:
# standardize the data using Standard sclaer

X_test = scaler.transform(X_test)

In [212]:
# convert data to PyTorch tensors and move it to GPU
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

In [213]:
# Get unique values and their counts
unique_values, counts = torch.unique(Y_test, return_counts=True)

# Display the counts
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0.0, Count: 6
Value: 1.0, Count: 16
Value: 2.0, Count: 16


In [214]:
#Test
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    outputs = outputs.view(-1)
    predicted = (outputs >= 0.5).float()
    predicted = predicted * 2

    correct = (predicted == Y_test).float().sum()
    accuracy = correct / Y_test.size(0)

    draw_correct = ((predicted == 0) & (Y_test == 0)).float().sum()
    away_win_correct = ((predicted == 2) & (Y_test == 2)).float().sum()

    draw_total = (Y_test == 0).float().sum()
    away_win_total = (Y_test == 2).float().sum()

    draw_accuracy = draw_correct / draw_total if draw_total > 0 else 0
    away_win_accuracy = away_win_correct / away_win_total if away_win_total > 0 else 0

    print(f"Overall Accuracy: {accuracy.item() * 100:.2f}%")
    print(f"Draw Accuracy: {draw_accuracy.item() * 100:.2f}%")
    print(f"Away Win Accuracy: {away_win_accuracy.item() * 100:.2f}%")

    Y_test_cpu = Y_test.cpu()
    predicted_cpu = predicted.cpu()
    Y_test_numpy = Y_test_cpu.numpy()
    predicted_numpy = predicted_cpu.numpy()
    report = classification_report(Y_test_numpy, predicted_numpy, digits=4, zero_division=0.0)
    print(report)

Overall Accuracy: 39.47%
Draw Accuracy: 16.67%
Away Win Accuracy: 87.50%
              precision    recall  f1-score   support

         0.0     0.2500    0.1667    0.2000         6
         1.0     0.0000    0.0000    0.0000        16
         2.0     0.4118    0.8750    0.5600        16

    accuracy                         0.3947        38
   macro avg     0.2206    0.3472    0.2533        38
weighted avg     0.2128    0.3947    0.2674        38

