In [124]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn.functional as F

%matplotlib inline

In [125]:
class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [126]:
def build_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    
    X = df_copy.drop(['result'], axis=1).to_numpy()
    Y = np.array(df_copy['result']) / 0.5
    
    X = torch.tensor(X).float()
    Y = torch.tensor(Y).long()
    
    return X, Y

In [127]:
def add_past_to_row(df, i):
    df_past = df.copy()
    df_past.index += i
    df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                               'date',
                                                                               'team',
                                                                               'opponent']}, inplace=True)
    return df_past

In [128]:
def build_matches_dataset(df, past_matches, team):
    dfs_past = []
    df_team = df[df['team']==team]
    for i in range(1, past_matches+1):
        df_past = add_past_to_row(df_team, i)
        dfs_past.append(df_past)

    df_team_joined = df_team.copy()
    for df_past in dfs_past:
        df_team_joined = pd.concat([df_team_joined, df_past],
                                    axis=1,
                                    )
    df_team_joined = df_team_joined[past_matches:-past_matches]

    return df_team_joined

In [129]:
def build_teams_dataset(df, past_matches):
    dfs = []
    for team in df['team'].unique():
        df_team_joined = build_matches_dataset(df, past_matches, team)
        dfs.append(df_team_joined)
    dfs = pd.concat(dfs)
    dfs.insert(5, 'result_0', 0)

    return dfs

In [130]:
def build_wavenet_dataset_past_future(df, future_matches, past_matches=7):
    df_copy = df.copy()
    df_copy.sort_values(by=['team', 'date'], inplace=True)
    df_copy.reset_index(inplace=True, drop=True)
    dfs = build_teams_dataset(df_copy, past_matches)
    dfs_future = add_stats_to_future(dfs, future_matches)
    dfs = dfs.loc[:,~dfs.columns.duplicated()].copy()
    dfs = dfs[dfs['date']<future_date]
    X, Y = build_dataset(dfs)
    
    return X, Y, dfs, dfs_future

#     return dfs

In [131]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
data = loader.get_data()
data.shape

(75212, 39)

In [132]:
data.drop(['team_goals_scored',
           'opponent_goals_scored',
           'team_goals_conceded',
           'opponent_goals_conceded'], axis=1, inplace=True)

In [133]:
def load_future_matches():
    df = pd.read_csv('../../data/future_matches.csv', parse_dates=True, dayfirst=True)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df = duplicate_to_team_and_opponent(df)
    df.sort_values(by='date', inplace=True)
    return df


def add_stats_to_future(stats, future):
    stats = get_final_entry(stats, 'team')
    stats_opp = team_to_opponent(stats)

    df_future = pd.merge(future, stats, how='left', on='team')
    df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
    df_future['elo_diff'] = df_future['elo_team'] - df_future['elo_opponent']
    df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
    df_future['date'] = df_future['date'].dt.date

    df_future.sort_values(by='date', inplace=True)

    return df_future


def get_final_entry(df, team_or_opponent):
    df = df.copy()
    df = df.loc[:,~df.columns.duplicated()].copy()
    df.sort_values(by='date', inplace=True)
    df.reset_index(inplace=True, drop=True)
    df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
    df = df.loc[:, df.columns.str.contains(team_or_opponent) | df.columns.str.contains('league_') |
               df.columns.str.contains('elo_diff') | df.columns.str.contains('^home_\\d', regex=True) |
               df.columns.str.contains('result')]

    return df


def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches


def team_to_opponent(df):
    df_opponent = df.copy()
    df_opponent = df_opponent.loc[:, df_opponent.columns.str.contains("team")]
    df_opponent.columns = df_opponent.columns.str.replace("team", "opponent")

    return df_opponent

def build_future_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    
    X = df_copy.drop(['result'], axis=1).to_numpy()
    X = torch.tensor(X).float()
    
    return X

In [134]:
future_data = load_future_matches()
future_data.reset_index(inplace=True, drop=True)
future_date = future_data['date'][0]

In [135]:
with open("../columns", "rb") as fp:   # Unpickling
    columns = pickle.load(fp)
data = data[columns]

In [136]:
data.columns

Index(['league', 'date', 'team', 'opponent', 'result', 'elo_team',
       'elo_opponent', 'elo_diff', 'home', 'team_goals_scored_avg',
       'team_goals_conceded_avg', 'team_goals_scored_avg_home',
       'team_goals_conceded_avg_home', 'team_goals_scored_avg_away',
       'team_goals_conceded_avg_away', 'opponent_goals_scored_avg',
       'opponent_goals_conceded_avg', 'opponent_goals_scored_avg_home',
       'opponent_goals_conceded_avg_home', 'opponent_goals_scored_avg_away',
       'opponent_goals_conceded_avg_away', 'league_home_goals_scored',
       'league_away_goals_scored', 'league_home_goals_scored_avg',
       'league_away_goals_scored_avg', 'league_home_goals_conceded',
       'league_away_goals_conceded', 'league_home_goals_conceded_avg',
       'league_away_goals_conceded_avg', 'team_attack_strength',
       'team_defense_strength', 'opponent_attack_strength',
       'opponent_defense_strength', 'team_lambda', 'opponent_lambda'],
      dtype='object')

In [137]:
Xall, Yall, dfs, dfs_future = build_wavenet_dataset_past_future(data, future_data, 7)

  dfs = dfs[dfs['date']<future_date]


## Predicting Matches

In [138]:
PATH = "../../src/model/trained_models/wavenet_4.pt"
model = torch.load(PATH)
model.train()

Sequential(
  (0): Conv1d(1, 32, kernel_size=(31,), stride=(31,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [139]:
@torch.no_grad()
def predict(x):
    x = x[:, None, :]
    logits = model(x)
    preds = torch.softmax(logits, dim=1)
    
    return preds

In [140]:
predictions = predict(Xall)

In [141]:
dfs.sort_values(by='date', inplace=True)
dfs.reset_index(inplace=True, drop=True)
predictions_df = pd.DataFrame(predictions, columns=['loss', 'draw', 'win'])
dfs_preds = pd.concat([dfs, predictions_df], axis=1)

In [142]:
dfs_preds.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win
0,Serie A,1997-11-09,empoli,parma,0.0,0,1472.917207,1551.843017,-78.925811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.82818,0.092926,0.078894
1,Serie A,1997-11-09,brescia,milan,0.0,0,1498.152602,1479.411525,18.741076,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.641593,0.182618,0.175789
2,Serie A,1997-11-09,bari,roma,0.0,0,1464.290814,1525.651095,-61.360281,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280833,0.196538,0.522629
3,Serie A,1997-11-09,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322649,0.2229,0.45445
4,Serie A,1997-11-09,piacenza,udinese,0.0,0,1433.150516,1503.364301,-70.213785,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.836011,0.108415,0.055573


In [143]:
dfs_preds['prediction'] = dfs_preds[['loss', 'draw', 'win']].idxmax(axis=1)
dfs_preds['prediction'] = dfs_preds['prediction'].replace({'win': 1, 'draw': 0.5, 'loss': 0})
dfs_preds['correct'] = np.where((dfs_preds['result'] == dfs_preds['prediction']), 1, 0)
dfs_preds['correct'].mean()

0.582699087805697

In [144]:
#0.582699087805697
dfs_preds['prediction'].value_counts()

0.0    29456
1.0    29265
0.5    12755
Name: prediction, dtype: int64

In [145]:
dfs_preds.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win,prediction,correct
0,Serie A,1997-11-09,empoli,parma,0.0,0,1472.917207,1551.843017,-78.925811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.82818,0.092926,0.078894,0.0,1
1,Serie A,1997-11-09,brescia,milan,0.0,0,1498.152602,1479.411525,18.741076,0.0,...,0.0,0.0,0.0,0.0,0.0,0.641593,0.182618,0.175789,0.0,1
2,Serie A,1997-11-09,bari,roma,0.0,0,1464.290814,1525.651095,-61.360281,1.0,...,0.0,0.0,0.0,0.0,0.0,0.280833,0.196538,0.522629,1.0,0
3,Serie A,1997-11-09,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,1.0,...,0.0,0.0,0.0,0.0,0.0,0.322649,0.2229,0.45445,1.0,0
4,Serie A,1997-11-09,piacenza,udinese,0.0,0,1433.150516,1503.364301,-70.213785,0.0,...,0.0,0.0,0.0,0.0,0.0,0.836011,0.108415,0.055573,0.0,1


In [146]:
def transform_to_home_and_away(df):
    df['date'] = pd.to_datetime(df['date'])
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    if 'result' in df_away.columns:
        df_away.drop('result', axis=1, inplace=True)

    df_home.rename(columns={'team': 'home_team', 'opponent': 'away_team', 'elo_team': 'elo_home', 'elo_opponent': 'elo_away',
                            'loss': 'A', 'draw': 'D', 'win': 'H'}, inplace=True)
    df_away.rename(columns={'team': 'away_team', 'opponent': 'home_team', 'elo_team': 'elo_away', 'elo_opponent': 'elo_home',
                            'loss': 'H', 'draw': 'D', 'win': 'A'}, inplace=True)

    df_combined = pd.concat([df_home, df_away])
    df_combined = df_combined.groupby(['date', 'home_team', 'away_team', 'elo_home', 'elo_away']).mean()
    df_combined.reset_index(inplace=True, drop=False)
    if 'result' in df_combined.columns:
        df_combined.drop(['result'], axis=1, inplace=True)
    df_combined['elo_diff'] = df_combined['elo_home'] - df_combined['elo_away']

    df_ftr = df_home.drop(['A', 'D', 'H', 'elo_diff', 'elo_home', 'elo_away', 'home'], axis=1)
    df_ftr['date'] = pd.to_datetime(df_ftr['date'])

    df_combined = df_combined.merge(df_ftr, on=['date', 'home_team', 'away_team'], how='outer'
                                    )

    return df_combined

In [147]:
dfs_preds_h = transform_to_home_and_away(dfs_preds)
dfs_preds_h.head();

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_away.drop('result', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_home.rename(columns={'team': 'home_team', 'opponent': 'away_team', 'elo_team': 'elo_home', 'elo_opponent': 'elo_away',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_away.rename(columns={'team': 'away_team', 'opponent': 'home_team', 'elo_team': 'elo_away', 'elo_opponent': 'elo_home',
  df_combined = df_combined.groupby(['date', 'home_team', 'away_team', 'elo_home', 'elo

In [148]:
# dfs_preds_h['home_team'] = dfs_preds_h['home_team'].str.title().replace('_', ' ', regex=True)
# dfs_preds_h['away_team'] = dfs_preds_h['away_team'].str.title().replace('_', ' ', regex=True)
dfs_preds_h = dfs_preds_h[['date', 'home_team', 'away_team', 'elo_home', 'elo_away', 'A', 'D', 'H', 'result']]
# dfs_preds_h = dfs_preds_h[['date', 'home_team', 'away_team', 'elo_home', 'elo_away',
#                                    'team_goals_scored', 'opponent_goals_scored', 'result', 'A', 'D', 'H']]



In [149]:
dfs_preds.shape

(71476, 258)

In [150]:
dfs_preds_h.shape

(36572, 9)

In [151]:
dfs_preds_h[dfs_preds_h['home_team']=='inter_milan'].tail()

Unnamed: 0,date,home_team,away_team,elo_home,elo_away,A,D,H,result
36028,2022-11-09,inter_milan,bologna,1673.271702,1588.045314,0.021835,0.083817,0.894348,1.0
36440,2023-01-04,inter_milan,napoli,1697.311158,1743.528272,0.265642,0.297743,0.436616,1.0
36498,2023-01-10,inter_milan,parma,1688.162897,1455.373852,0.025826,0.105381,0.868793,1.0
36499,2023-01-10,inter_milan,parma_calcio_1913,1681.282421,1470.738258,0.036337,0.13777,0.825892,1.0
36514,2023-01-14,inter_milan,hellas_verona,1677.164931,1506.506627,0.089172,0.210158,0.70067,1.0


In [152]:
dfs_preds[dfs_preds['team']=='inter_milan'].tail()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win,prediction,correct
71217,Serie A,2023-01-04,inter_milan,napoli,1.0,0,1697.311158,1743.528272,-46.217114,1.0,...,0.7242,0.97767,1.37031,2.380012,1.029127,0.252636,0.414322,0.333042,0.5,0
71232,Serie A,2023-01-07,inter_milan,monza,0.5,0,1714.294825,1577.291645,137.00318,0.0,...,0.646465,0.565657,1.553398,3.433827,0.476342,0.307626,0.255403,0.436971,1.0,0
71330,Coppa Italia,2023-01-10,inter_milan,parma,1.0,0,1688.162897,1455.373852,232.789045,1.0,...,0.716846,1.003584,0.865574,1.548921,1.056404,0.038625,0.151233,0.810141,1.0,1
71331,Coppa Italia,2023-01-10,inter_milan,parma_calcio_1913,1.0,0,1681.282421,1470.738258,210.544163,1.0,...,0.59498,0.371862,1.262399,2.857007,0.313147,0.055481,0.208223,0.736296,1.0,1
71384,Serie A,2023-01-14,inter_milan,hellas_verona,1.0,0,1677.164931,1506.506627,170.658305,1.0,...,0.889653,1.276459,0.59782,1.132712,1.545187,0.090334,0.261825,0.647841,1.0,1


In [153]:
dfs_preds[dfs_preds['team']=='hellas_verona'].tail()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win,prediction,correct
70475,Serie A,2022-11-13,hellas_verona,spezia,0.0,0,1483.9122,1518.539717,-34.627517,1.0,...,1.792531,1.792531,0.640449,0.910112,3.396375,0.132616,0.278864,0.58852,1.0,0
71227,Serie A,2023-01-04,hellas_verona,torino,0.5,0,1470.40226,1614.022762,-143.620502,0.0,...,1.373737,1.454545,0.737864,1.009709,2.602871,0.598715,0.307678,0.093608,0.0,0
71322,Serie A,2023-01-09,hellas_verona,cremonese,1.0,0,1476.272254,1508.817815,-32.545561,1.0,...,1.14456,0.965723,1.080039,1.477948,1.626481,0.167927,0.368565,0.463508,1.0,1
71349,Serie A,2023-01-14,hellas_verona,inter_milan,0.0,0,1506.506627,1677.164931,-170.658305,0.0,...,1.455035,1.050859,0.884173,1.256457,1.991101,0.753498,0.158491,0.088011,0.0,1
71363,Italian Serie A,2023-01-14,hellas_verona,internazionale,0.0,0,1509.731755,1728.202066,-218.470311,0.0,...,1.667954,1.204633,0.802139,1.139882,2.282463,0.855681,0.112239,0.03208,0.0,1


In [154]:
dfs_preds_h['result'].value_counts()

1.0    16085
0.5     9960
0.0     9765
Name: result, dtype: int64

In [155]:
dfs_preds_h['prediction'] = dfs_preds_h[['A', 'D', 'H']].idxmax(axis=1)
dfs_preds_h['prediction'] = dfs_preds_h['prediction'].replace({'H': 1, 'D': 0.5, 'A': 0})
dfs_preds_h['correct'] = np.where((dfs_preds_h['result'] == dfs_preds_h['prediction']), 1, 0)
dfs_preds_h['correct'].mean()

0.5769167669255167

In [156]:
#0.5769167669255167