In [69]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn.functional as F

%matplotlib inline

In [70]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F


class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])
        # self.set_up_data(df_join)

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [71]:
class Wavenet:

    def __init__(self, df, future=None, past_matches=7):
        self.df = df
        self.future = future
        self.X = None
        self.Y = None
        self.dfs = None
        self.dfs_future = None
        self.past_matches = past_matches

    def set_up_data(self, df):
        df.drop(['team_goals_scored',
            'opponent_goals_scored',
            'team_goals_conceded',
            'opponent_goals_conceded'], axis=1, inplace=True)

    def build_dataset(self, df):
        df_copy = df.copy()
        df_copy.reset_index(inplace=True, drop=True)
        date = df_copy[['date']].iloc[:,0]
        df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
        df_copy['date'] = date
        df_copy.sort_values(by=['date'], inplace=True)
        df_copy = df_copy[df_copy['date']<future_date]
        df_copy.drop(['date'], axis=1, inplace=True)
        
        self.X = df_copy.drop(['result'], axis=1).to_numpy()
        self.Y = np.array(df_copy['result']) / 0.5
        
        self.X = torch.tensor(self.X).float()
        self.Y = torch.tensor(self.Y).long()

    def add_past_to_row(self, df, i):
        df_past = df.copy()
        df_past.index += i
        df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                                'date',
                                                                                'team',
                                                                                'opponent']}, inplace=True)
        return df_past

    def build_matches_dataset(self, df, past_matches, team):
        dfs_past = []
        df_team = df[df['team']==team]
        for i in range(1, past_matches+1):
            df_past = self.add_past_to_row(df_team, i)
            dfs_past.append(df_past)

        df_team_joined = df_team.copy()
        for df_past in dfs_past:
            df_team_joined = pd.concat([df_team_joined, df_past],
                                        axis=1,
                                        )

        df_team_joined.reset_index(inplace=True, drop=True)
        df_team_joined = df_team_joined[past_matches:-past_matches]

        return df_team_joined

    def build_teams_dataset(self, df, past_matches):
        dfs = []
        for team in df['team'].unique():
            df_team_joined = self.build_matches_dataset(df, past_matches, team)
            dfs.append(df_team_joined)
        dfs = pd.concat(dfs)
        dfs.reset_index(inplace=True, drop=True)
        dfs.insert(5, 'result_0', 0)

        return dfs
    
    def team_to_opponent(self, df):
        df_opponent = df.copy()
        df_opponent = df_opponent.loc[:, df_opponent.columns.str.contains("team")]
        df_opponent.columns = df_opponent.columns.str.replace("team", "opponent")

        return df_opponent
    
    def add_stats_to_future(self, stats, future):
        stats = get_final_entry(stats, 'team')
#         stats_opp = self.team_to_opponent(stats)

        df_future = pd.merge(future, stats, how='left', on='team')
#         df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
        df_future['elo_diff'] = df_future['elo_team'] - df_future['elo_opponent']
        df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
        df_future['date'] = df_future['date'].dt.date
        df_future.sort_values(by='date', inplace=True)

        return df_future
    
    def remove_duplicate_columns(self, df):
        df = df.loc[:,~df.columns.duplicated()].copy()
        return df

    def build_wavenet_dataset(self):
        df_copy = self.df.copy()
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.build_dataset(self.dfs)

    def build_wavenet_dataset_past_future(self):
        df_copy = self.df.copy()
        self.set_up_data(df_copy)
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.dfs_future = self.add_stats_to_future(self.dfs, self.future)
        self.dfs_future = self.dfs_future[self.dfs.columns]
        self.dfs = self.dfs.loc[:,~self.dfs.columns.duplicated()].copy()
        self.dfs = self.dfs[self.dfs['date']<future_date]
        self.build_dataset(self.dfs)
        self.dfs = self.remove_duplicate_columns(self.dfs)

In [72]:
def load_future_matches():
    df = pd.read_csv('../data/future_matches.csv', parse_dates=True, dayfirst=True)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df = duplicate_to_team_and_opponent(df)
    df.reset_index(inplace=True, drop=True)
    return df


def get_final_entry(df, team_or_opponent):
    df = df.copy()
    df = df.loc[:,~df.columns.duplicated()].copy()
    df.sort_values(by='date', inplace=True)
    df.reset_index(inplace=True, drop=True)
    df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
    df = df.loc[:, df.columns.str.contains('team') | df.columns.str.contains('opponent') |
                df.columns.str.contains('league_') |
                df.columns.str.contains('elo_diff') | df.columns.str.contains('^home_\\d', regex=True) |
                df.columns.str.contains('result')]
    df.drop(['opponent'], axis=1, inplace=True)


    return df


def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches


def build_future_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    
    X = df_copy.drop(['result'], axis=1).to_numpy()
    X = torch.tensor(X).float()
    
    return X

In [73]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
future = load_future_matches()
future_date = future['date'][0]
data = loader.get_data()
wavenet = Wavenet(data, future, 7)
wavenet.build_wavenet_dataset_past_future()
# wavenet.dfs.to_csv('../../data/joined_matches.csv')
# wavenet.dfs_future.to_csv('../../data/future_matches_processed.csv')

  self.dfs = self.dfs[self.dfs['date']<future_date]
  df_copy = df_copy[df_copy['date']<future_date]


In [74]:
data.tail()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
75207,"Serie C, Girone B",2023-01-22,san_donato_tavarnelle,recanatese,0.5,1410.248814,1412.067558,-1.818744,0,0.0,...,0.0,0.0,1.149708,1.139766,1.190234,1.292971,1.246793,1.052899,1.44081,1.837379
75208,"Serie C, Girone B",2023-01-22,siena,aquila_montevarchi,0.5,1424.000617,1343.25748,80.743138,0,0.0,...,0.0,0.0,1.149708,1.139766,0.915565,1.062083,1.200616,0.869786,0.915565,1.453377
75209,"Serie C, Girone B",2023-01-22,torres,imolese,0.5,1417.746454,1285.820827,131.925627,0,0.0,...,0.0,0.0,1.149708,1.139766,1.007121,0.73884,0.600308,0.961343,1.113134,0.505522
75210,"Serie C, Girone B",2023-01-22,virtus_entella,carrarese,0.5,1521.618685,1400.476992,121.141693,1,0.0,...,0.0,0.0,1.149708,1.139766,1.708568,1.190234,1.007121,1.985634,3.86676,1.378166
75211,"Serie C, Girone B",2023-01-22,vis_pesaro,pontedera,0.5,1340.018519,1473.729367,-133.710847,1,0.0,...,0.0,0.0,1.149708,1.139766,0.646485,1.327569,0.732452,1.062083,0.782588,1.117953


In [75]:
data[data['team']=='cesena'].tail()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
74782,"Serie C, Girone B",2022-12-17,cesena,alessandria,1.0,1488.548301,1377.185394,111.362908,1,3.0,...,0.8,1.5,0.969883,0.940789,2.237762,1.031052,0.651191,1.566434,3.297755,0.651191
74874,"Serie C, Girone B",2022-12-23,cesena,carrarese,0.5,1498.898535,1430.102738,68.795796,0,1.0,...,1.3,1.5,0.906725,0.967105,0.986778,0.870748,1.632653,0.928733,0.830971,1.374866
75041,"Serie C, Girone B",2023-01-08,cesena,rimini,1.0,1495.9666,1447.183825,48.782775,1,1.0,...,1.777778,1.888889,0.935673,0.973684,2.162162,1.125,1.06875,0.972973,2.048364,1.125
75158,"Serie C, Girone B",2023-01-15,cesena,torres,1.0,1508.87421,1429.372779,79.501431,0,1.0,...,1.0,0.777778,1.154971,1.130409,0.683544,0.791516,0.977755,0.820253,0.647568,0.874833
75196,"Serie C, Girone B",2023-01-22,cesena,fermana,0.5,1520.500535,1405.862576,114.63796,1,0.0,...,0.0,0.0,1.149708,1.139766,1.847101,0.869786,0.915565,1.431503,3.013691,0.915565


In [76]:
Xfu = build_future_dataset(wavenet.dfs_future)

In [77]:
Xfu.shape

torch.Size([320, 248])

In [78]:
PATH = "../src/model/trained_models/wavenet_4.pt"
model = torch.load(PATH)
model.train()

Sequential(
  (0): Conv1d(1, 32, kernel_size=(31,), stride=(31,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [79]:
@torch.no_grad()
def predict(x):
    x = x[:, None, :]
    logits = model(x)
    preds = torch.softmax(logits, dim=1)
    
    return preds

In [80]:
predictions = predict(Xfu)

In [81]:
dfs_future = wavenet.dfs_future
dfs_future = dfs_future.loc[:,~dfs_future.columns.duplicated()].copy()
dfs_future.sort_values(by='date', inplace=True)
dfs_future.reset_index(inplace=True, drop=True)
predictions_df = pd.DataFrame(predictions, columns=['loss', 'draw', 'win'])
dfs_future = pd.concat([dfs_future, predictions_df], axis=1)

In [82]:
dfs_future.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win
0,"Serie C, Girone B",2023-01-22,imolese,torres,0.5,0,1285.820827,1417.746454,-131.925627,1,...,0.856579,0.731936,2.150538,2.211982,1.126056,0.770459,4.074703,0.359166,0.400409,0.240425
1,"Serie C, Girone B",2023-01-22,carrarese,virtus_entella,0.5,0,1400.476992,1521.618685,-121.141693,0,...,0.856579,1.966206,0.900845,1.182358,1.781874,3.001051,0.99567,0.399443,0.478823,0.121734
2,"Serie C, Girone B",2023-01-22,fiorenzuola,lucchese,0.5,0,1390.44469,1436.26287,-45.818181,0,...,0.856579,0.957147,1.290323,1.474654,1.182358,1.0579,1.629881,0.255645,0.446792,0.297563
3,"Serie C, Girone B",2023-01-22,ancona,gubbio,0.5,0,1474.100863,1447.731989,26.368874,0,...,0.856579,1.182358,1.781874,1.966206,0.900845,0.99567,3.001051,0.183312,0.592608,0.22408
4,"Serie C, Girone B",2023-01-22,rimini,olbia,0.5,0,1422.487676,1363.237419,59.250257,0,...,0.856579,2.211982,1.126056,0.731936,2.150538,4.074703,0.770459,0.262938,0.312872,0.42419


In [83]:
def transform_to_home_and_away(df):
    df['date'] = pd.to_datetime(df['date'])
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    if 'result' in df_away.columns:
        df_away.drop('result', axis=1, inplace=True)

    df_home.rename(columns={'team': 'home_team', 'opponent': 'away_team', 'elo_team': 'elo_home', 'elo_opponent': 'elo_away',
                            'loss': 'A', 'draw': 'D', 'win': 'H'}, inplace=True)
    df_away.rename(columns={'team': 'away_team', 'opponent': 'home_team', 'elo_team': 'elo_away', 'elo_opponent': 'elo_home',
                            'loss': 'H', 'draw': 'D', 'win': 'A'}, inplace=True)

    df_combined = pd.concat([df_home, df_away])
    df_combined = df_combined.groupby(['date', 'home_team', 'away_team', 'elo_home', 'elo_away']).mean()
    df_combined.reset_index(inplace=True, drop=False)
    if 'result' in df_combined.columns:
        df_combined.drop(['result'], axis=1, inplace=True)
    df_combined['elo_diff'] = df_combined['elo_home'] - df_combined['elo_away']

    df_ftr = df_home.drop(['A', 'D', 'H', 'elo_diff', 'elo_home', 'elo_away', 'home'], axis=1)
    df_ftr['date'] = pd.to_datetime(df_ftr['date'])

    df_combined = df_combined.merge(df_ftr, on=['date', 'home_team', 'away_team'], how='outer'
                                    )

    return df_combined

In [84]:
dfs_future_test = transform_to_home_and_away(dfs_future)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_away.drop('result', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_home.rename(columns={'team': 'home_team', 'opponent': 'away_team', 'elo_team': 'elo_home', 'elo_opponent': 'elo_away',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_away.rename(columns={'team': 'away_team', 'opponent': 'home_team', 'elo_team': 'elo_away', 'elo_opponent': 'elo_home',
  df_combined = df_combined.groupby(['date', 'home_team', 'away_team', 'elo_home', 'elo

In [85]:
dfs_future_test['home_team'] = dfs_future_test['home_team'].str.title().replace('_', ' ', regex=True)
dfs_future_test['away_team'] = dfs_future_test['away_team'].str.title().replace('_', ' ', regex=True)
dfs_future_test = dfs_future_test[['date', 'home_team', 'away_team', 'elo_home', 'elo_away', 'A', 'D', 'H']]
dfs_future_test[['team_goals_scored', 'opponent_goals_scored', 'result']] = np.nan
dfs_future_test = dfs_future_test[['date', 'home_team', 'away_team', 'elo_home', 'elo_away',
                                   'team_goals_scored', 'opponent_goals_scored', 'result', 'A', 'D', 'H']]

In [86]:
dfs_future_test.to_csv("../data/predictions/wavenet_4_h_a_c_20220123.csv")

In [87]:
dfs_future[dfs_future['team']=='cesena'].iloc[0:1, :]

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win
9,"Serie C, Girone B",2023-01-22,cesena,fermana,0.5,0,1520.500535,1405.862576,114.63796,1,...,0.856579,0.900845,1.167435,1.597542,1.069753,0.900845,1.597542,0.101801,0.660155,0.238044


In [88]:
dfs_future[dfs_future['team']=='fermana'].iloc[0:1, :]

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7,loss,draw,win
5,"Serie C, Girone B",2023-01-22,fermana,cesena,0.5,0,1405.862576,1520.500535,-114.63796,0,...,0.856579,0.731936,1.72043,1.044547,0.731936,0.500798,1.539332,0.340193,0.56542,0.094387
