In [2]:
import code
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

%matplotlib inline

In [3]:
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())
torch.device("mps")

True
True


device(type='mps')

In [4]:
class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [5]:
class Wavenet:

    def __init__(self, df, future=None, past_matches=7, future_date=None):
        self.df = df
        self.future = future
        self.X = None
        self.Y = None
        self.dfs = None
        self.dfs_future = None
        self.past_matches = past_matches
        self.future_date = future_date
        self.index_columns = ['league', 'date', 'team', 'opponent', 'result']

    def set_up_data(self, df):
        df.drop(['team_goals_scored',
            'opponent_goals_scored',
            'team_goals_conceded',
            'opponent_goals_conceded'], axis=1, inplace=True)

    def build_dataset(self, df):
        df_copy = df.copy()
        df_copy.reset_index(inplace=True, drop=True)
        date = df_copy[['date']].iloc[:,0]
        df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
        df_copy['date'] = date
        df_copy.sort_values(by=['date'], inplace=True)
        df_copy = df_copy[df_copy['date']<self.future_date]
        df_copy.drop(['date'], axis=1, inplace=True)
        
        self.X = df_copy.drop(['result'], axis=1).to_numpy()
        self.Y = np.array(df_copy['result']) / 0.5
        
        self.X = torch.tensor(self.X).float()
        self.Y = torch.tensor(self.Y).long()

    def add_past_to_row(self, df, i):
        df_past = df.copy()
        df_past.index += i
        df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                                'date',
                                                                                'team',
                                                                                'opponent']}, inplace=True)
        return df_past

    def build_matches_dataset(self, df, past_matches, team):
        dfs_past = []
        df_team = df[df['team']==team]
        for i in range(1, past_matches+1):
            df_past = self.add_past_to_row(df_team, i)
            dfs_past.append(df_past)

        df_team_joined = df_team.copy()
        for df_past in dfs_past:
            df_team_joined = pd.concat([df_team_joined, df_past],
                                        axis=1,
                                        )
        df_team_joined = df_team_joined[past_matches:-past_matches]

        return df_team_joined

    def build_teams_dataset(self, df, past_matches):
        dfs = []
        for team in df['team'].unique():
            df_team_joined = self.build_matches_dataset(df, past_matches, team)
            dfs.append(df_team_joined)
        dfs = pd.concat(dfs)
        dfs.insert(5, 'result_0', 0)
        dfs = self.add_opponent_past_matches(dfs)
        dfs = self.ordering_columns(dfs)

        return dfs
    
    def add_opponent_past_matches(self, df):
        df = df.loc[:,~df.columns.duplicated()].copy()
        df_copy = df.copy()
        keep_same = {'league', 'date', 'team', 'opponent', 'result'}
        df_copy.columns = ['{}{}'.format(c, '' if c in keep_same else '_y') for c in df_copy.columns]
        df_copy['result'] = 1 - df_copy['result']
        df_copy.rename(columns={'team': 'opponent', 'opponent': 'team'}, inplace=True)
        df_combined = pd.merge(df, df_copy, how='left',
                              left_on=['league', 'date', 'team', 'opponent', 'result'],
                              right_on=['league', 'date', 'team', 'opponent', 'result'])
        return df_combined

    def ordering_columns(self, df):
        index_columns = self.index_columns
        template_columns = ['result', 'elo_team', 'elo_opponent', 'elo_diff', 'home', 'team_goals_scored_avg',
                           'team_goals_conceded_avg', 'team_goals_scored_avg_home',
                           'team_goals_conceded_avg_home', 'team_goals_scored_avg_away',
                           'team_goals_conceded_avg_away', 'opponent_goals_scored_avg',
                           'opponent_goals_conceded_avg', 'opponent_goals_scored_avg_home',
                           'opponent_goals_conceded_avg_home', 'opponent_goals_scored_avg_away',
                           'opponent_goals_conceded_avg_away', 'league_home_goals_scored',
                           'league_away_goals_scored', 'league_home_goals_scored_avg',
                           'league_away_goals_scored_avg', 'league_home_goals_conceded',
                           'league_away_goals_conceded', 'league_home_goals_conceded_avg',
                           'league_away_goals_conceded_avg', 'team_attack_strength',
                           'team_defense_strength', 'opponent_attack_strength',
                           'opponent_defense_strength', 'team_lambda', 'opponent_lambda']
        template_columns_y = [s + f'_y' for s in template_columns]
        columns = [[index_columns + ['result_0'] + template_columns[1:] + ['result_0_y'] + template_columns_y[1:]]]
        for i in range(1, self.past_matches+1):
            team_cols = [s + f'_{i}' for s in template_columns]
            opp_cols = [s + f'_{i}_y' for s in template_columns]
            columns.append([team_cols, opp_cols])

        columns = [subitem for sublist in columns for item in sublist for subitem in item]
        df = df[columns]
        df.dropna(inplace=True)
        df.reset_index(inplace=True, drop=True)
        return df
    
    def get_final_entry(self, df, team_or_opponent):
        df = df.copy()
        df = df.loc[:,~df.columns.duplicated()].copy()
        df.sort_values(by='date', inplace=True)
        df.reset_index(inplace=True, drop=True)
        df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
        df = df.loc[:, ~df.columns.str.contains('_y')]
        df = df.drop(['home'], axis=1)
        return df
    
    def team_to_opponent(self, df):
        df_opponent = df.copy()     
        keep_same = {'league', 'date', 'team', 'opponent', 'result'}
        df_opponent.columns = ['{}{}'.format(c, '' if c in keep_same else '_y') for c in df_opponent.columns]
        df_opponent['result'] = 1 - df_opponent['result']
        df_opponent = self.drop_common_columns(df_opponent, 'opponent')
        return df_opponent
    
    def drop_common_columns(self, df, team_or_opp):
        columns_to_drop = [item for item in self.index_columns if item not in [team_or_opp]]
        df.drop(columns=columns_to_drop, axis=1, inplace=True)
        return df
    
    def add_stats_to_future(self, stats, future):
        stats = self.get_final_entry(stats, 'team')
        stats_opp = self.team_to_opponent(stats)
        stats = self.drop_common_columns(stats, 'team')

        df_future = pd.merge(future, stats, how='left', on='team')
        df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
        df_future['home_y'] = 1 - df_future['home']
        df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
        df_future['date'] = df_future['date'].dt.date
        df_future.sort_values(by='date', inplace=True)
#         df_future.drop_duplicates(subset=['team', 'opponent'], inplace=True)

        return df_future
    
    def remove_duplicate_columns(self, df):
        df = df.loc[:,~df.columns.duplicated()].copy()
        return df

    def build_wavenet_dataset(self):
        df_copy = self.df.copy()
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.build_dataset(self.dfs)
        
    def order_date(self, df):
        df = df.sort_values(by=['team', 'date'])
        df = df.reset_index(drop=True)
        return df

    def build_wavenet_dataset_past_future(self):
        df_copy = self.df.copy()
        self.set_up_data(df_copy)
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.dfs_future = self.add_stats_to_future(self.dfs, self.future)
        self.dfs_future = self.dfs_future[self.dfs.drop(['result'], axis=1).columns]
        self.dfs_future = self.order_date(self.dfs_future)
        self.dfs = self.dfs.loc[:,~self.dfs.columns.duplicated()].copy()
#         self.dfs = self.dfs[self.dfs['date']<future_date]
#         self.build_dataset(self.dfs)
#         self.dfs = self.remove_duplicate_columns(self.dfs)

In [6]:
def load_future_matches():
    df = pd.read_csv('../../data/future_matches_serie_a.csv', parse_dates=True, dayfirst=True)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df = duplicate_to_team_and_opponent(df)
    df.reset_index(inplace=True, drop=True)
    return df

def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches


def build_future_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)

    X = df_copy.to_numpy()
    X = torch.tensor(X).float()
    
    return X

In [7]:
def add_stats_to_future(stats, future):
    columns = stats.drop(['result'], axis=1).columns
    stats = get_final_entry(stats, 'team')
    stats_opp = team_to_opponent(stats)

    df_future = pd.merge(future, stats, how='left', on='team')
    df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
    df_future['elo_diff'] = df_future['elo_team'] - df_future['elo_opponent']
    df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
    df_future['date'] = df_future['date'].dt.date

    df_future.sort_values(by='date', inplace=True)
    df_future = df_future[columns]

    return df_future


def get_final_entry(df, team_or_opponent):
    df.sort_values(by='date', inplace=True)
    df.reset_index(inplace=True, drop=True)
    df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
    df = df.loc[:, df.columns.str.contains(team_or_opponent) | df.columns.str.contains('league_')]

    return df


def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches


def team_to_opponent(df):
    df_opponent = df.copy()
    df_opponent = df_opponent.loc[:, df_opponent.columns.str.contains("team")]
    df_opponent.columns = df_opponent.columns.str.replace("team", "opponent")

    return df_opponent


In [8]:
# future_data_combined.to_csv("../../data/predictions/future_test.csv")

In [9]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
future = load_future_matches()
future_date = future['date'][0]
data = loader.get_data()
wavenet = Wavenet(data, future, 15, future_date)
wavenet.build_wavenet_dataset_past_future()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [10]:
future_data = add_stats_to_future(data, future)
future_data.drop(['team_goals_conceded',
                 'opponent_goals_conceded',
                 'opponent_goals_scored',
                 'team_goals_scored'], axis=1, inplace=True)
current_date = pd.to_datetime("2023-01-25")
next_match = wavenet.dfs[wavenet.dfs['date']>=current_date]
next_match.reset_index(inplace=True, drop=True)
next_match_team = next_match.loc[:, next_match.columns.str.contains('\d$', regex=True) | 
                                    next_match.columns.str.contains('^team$', regex=True)]
next_match_opp = next_match.loc[:, next_match.columns.str.contains('_y$', regex=True) | 
                                   next_match.columns.str.contains('^opponent$', regex=True)]
future_data_combined = pd.merge(future_data, next_match_team, how='left',
                               left_on='team',
                               right_on='team')
future_data_combined = pd.merge(future_data_combined, next_match_opp, how='left',
                               left_on='opponent',
                               right_on='opponent')
columns_list = wavenet.dfs.drop(['result'], axis=1).columns
future_data_combined = future_data_combined[columns_list]

  next_match = wavenet.dfs[wavenet.dfs['date']>=current_date]


In [11]:
next_match[next_match['league']=='Serie A'].head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_home_goals_conceded_15_y,league_away_goals_conceded_15_y,league_home_goals_conceded_avg_15_y,league_away_goals_conceded_avg_15_y,team_attack_strength_15_y,team_defense_strength_15_y,opponent_attack_strength_15_y,opponent_defense_strength_15_y,team_lambda_15_y,opponent_lambda_15_y
0,Serie A,2023-01-29,ac_milan,sassuolo,0.5,0,1643.549693,1471.441369,172.108324,1.0,...,1.333333,1.5,1.157895,1.403509,0.9375,1.318182,1.454545,1.2375,1.628289,2.220096
4,Serie A,2023-01-29,as_roma,napoli,0.5,0,1634.23881,1725.112191,-90.873381,0.0,...,1.375,1.0,1.048246,1.425439,1.656904,0.590769,1.144615,0.65272,1.133671,0.963887
5,Serie A,2023-01-28,atalanta,sampdoria,0.5,0,1606.610311,1411.488424,195.121887,1.0,...,0.666667,2.333333,1.052193,1.357018,0.892049,1.2005,1.500625,1.241112,1.502399,1.895527
6,Serie A,2023-01-27,bologna,spezia,0.5,0,1515.317379,1471.277445,44.039935,1.0,...,1.428571,1.714286,1.200439,1.45,0.701498,1.306715,1.23412,0.745342,0.627656,2.338332
9,Serie A,2023-01-28,cremonese,inter_milan,0.5,0,1467.498921,1633.776448,-166.277527,1.0,...,2.2,1.4,1.067982,1.414912,1.6739,0.73922,1.330595,1.00434,2.378699,1.05047


In [12]:
future_data_combined.head()

Unnamed: 0,league,date,team,opponent,result_0,elo_team,elo_opponent,elo_diff,home,team_goals_scored_avg,...,league_home_goals_conceded_15_y,league_away_goals_conceded_15_y,league_home_goals_conceded_avg_15_y,league_away_goals_conceded_avg_15_y,team_attack_strength_15_y,team_defense_strength_15_y,opponent_attack_strength_15_y,opponent_defense_strength_15_y,team_lambda_15_y,opponent_lambda_15_y
0,Serie A,2023-01-27,bologna,spezia,0,1515.317379,1471.277445,44.039935,1,1.157895,...,1.428571,1.714286,1.200439,1.45,0.701498,1.306715,1.23412,0.745342,0.627656,2.338332
1,Serie A,2023-01-27,lecce,salernitana,0,1514.600223,1445.195351,69.404871,1,1.052632,...,1.333333,1.5,1.157895,1.403509,0.590909,1.275,1.2375,0.681818,0.466507,2.214474
2,Serie A,2023-01-27,salernitana,lecce,0,1445.195351,1514.600223,-69.404871,0,1.263158,...,2.0,1.0,1.109649,1.469298,0.901186,0.752239,0.716418,1.660079,1.660079,0.79183
3,Serie A,2023-01-27,spezia,bologna,0,1471.277445,1515.317379,-44.039935,0,1.052632,...,1.0,0.666667,1.100877,1.495614,0.703812,0.908367,1.003984,1.055718,1.111283,1.003984
4,Serie A,2023-01-28,empoli,torino,0,1506.57965,1573.607783,-67.028133,1,0.947368,...,1.428571,1.714286,1.200439,1.45,0.76225,0.833029,0.876872,1.016334,1.123316,0.876872


In [13]:
future_data_combined[future_data_combined.isnull().any(axis=1)]

Unnamed: 0,league,date,team,opponent,result_0,elo_team,elo_opponent,elo_diff,home,team_goals_scored_avg,...,league_home_goals_conceded_15_y,league_away_goals_conceded_15_y,league_home_goals_conceded_avg_15_y,league_away_goals_conceded_avg_15_y,team_attack_strength_15_y,team_defense_strength_15_y,opponent_attack_strength_15_y,opponent_defense_strength_15_y,team_lambda_15_y,opponent_lambda_15_y


In [14]:
Xfu = build_future_dataset(future_data_combined)

In [15]:
wavenet.dfs_future.shape

(551, 996)

In [16]:
wavenet.dfs.shape

(63958, 997)

In [17]:
Xfu.shape

torch.Size([380, 992])

In [18]:
# wavenet.dfs_future.to_csv("../../data/predictions/wavenet_7_test.csv")

In [19]:
# dfs_test = wavenet.dfs[(wavenet.dfs['team']=='alessandria') | (wavenet.dfs['team']=='reggiana')]
# dfs_test[dfs_test['date']>=pd.to_datetime('2022-09-01')].to_csv("../../data/predictions/past_test.csv")

## Model Predictions

In [24]:
@torch.no_grad()
def get_predictions(x, df):
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.softmax(logits, dim=1)
    print(preds)
    df[['loss', 'draw', 'win']] = pd.DataFrame(preds.numpy())
    
    return df

In [25]:
PATH = "../../src/model/trained_models/wavenet_9.pt"
model = torch.load(PATH)
model.train().to("cpu")

Sequential(
  (0): Conv1d(1, 32, kernel_size=(62,), stride=(62,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [26]:
Xfu.shape

torch.Size([380, 992])

In [27]:
dfs_preds = future_data_combined.copy()
dfs_preds = dfs_preds[['date', 'team', 'opponent',
                       'elo_team', 'elo_opponent', 'elo_diff', 'home',
                       ]]
dfs_preds.sort_values('date', inplace=True)
dfs_preds.reset_index(inplace=True, drop=True)
dfs_preds = get_predictions(Xfu, dfs_preds)

torch.Size([380, 1, 992])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (380x512 and 256x3)

In [None]:
dfs_preds_cut = dfs_preds.copy()
dfs_preds_cut['prediction'] = dfs_preds_cut[['loss', 'draw', 'win']].idxmax(axis=1)
dfs_preds_cut['prediction'] = dfs_preds_cut['prediction'].replace({'win': 1, 'draw': 0.5, 'loss': 0})

In [None]:
dfs_preds_cut['prediction'].value_counts()

In [None]:
# dfs_preds_cut[dfs_preds_cut['date']<=pd.to_datetime('2023-01-29')]

In [None]:
dfs_preds_cut.head()

In [None]:
def transform_to_home_and_away(df):
    df['date'] = pd.to_datetime(df['date'])
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    if 'result' in df_away.columns:
        df_away.drop('result', axis=1, inplace=True)

    df_home.rename(columns={'team': 'home_team', 'opponent': 'away_team', 'elo_team': 'elo_home', 'elo_opponent': 'elo_away',
                            'loss': 'A', 'draw': 'D', 'win': 'H'}, inplace=True)
    df_away.rename(columns={'team': 'away_team', 'opponent': 'home_team', 'elo_team': 'elo_away', 'elo_opponent': 'elo_home',
                            'loss': 'H', 'draw': 'D', 'win': 'A'}, inplace=True)

    df_combined = pd.concat([df_home, df_away])
    df_combined = df_combined.groupby(['date', 'home_team', 'away_team', 'elo_home', 'elo_away']).mean()
    df_combined.reset_index(inplace=True, drop=False)
    if 'result' in df_combined.columns:
        df_combined.drop(['result'], axis=1, inplace=True)
    df_combined['elo_diff'] = df_combined['elo_home'] - df_combined['elo_away']

    if 'team_goals_scored' not in df_home.columns:
        df_ftr = df_home.drop(['A', 'D', 'H', 'elo_diff', 'elo_home', 'elo_away', 'home'], axis=1)
        df_ftr['date'] = pd.to_datetime(df_ftr['date'])
    else:
        df_ftr = df_home.drop(['loss', 'draw', 'win', 'rest_days', 'team_goals_scored', 'opponent_goals_scored', 'elo_home', 'elo_away', 'home'], axis=1)
        df_ftr['date'] = pd.to_datetime(df_ftr['date'])

    df_combined = df_combined.merge(df_ftr, on=['date', 'home_team', 'away_team'], how='outer'
                                    )

    return df_combined

In [None]:
dfs_preds_h_a = transform_to_home_and_away(dfs_preds_cut)
dfs_preds_h_a = dfs_preds_h_a.loc[:, ~dfs_preds_h_a.columns.str.contains('_x')]
dfs_preds_h_a = dfs_preds_h_a.loc[:, ~dfs_preds_h_a.columns.str.contains('_y')]

In [None]:
dfs_preds_h_a.head()

In [None]:
dfs_preds_h_a['prediction'] = dfs_preds_h_a[['A', 'D', 'H']].idxmax(axis=1)
dfs_preds_h_a['prediction'] = dfs_preds_h_a['prediction'].replace({'H': 1, 'D': 0.5, 'A': 0})

In [None]:
dfs_preds_h_a['prediction'].value_counts()