In [148]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

%matplotlib inline

In [149]:
class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [150]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
data = loader.get_data()

In [151]:
data.head()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
0,Serie A,1997-08-31,atalanta,bologna,1.0,1500.0,1500.0,0.0,1,4,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Serie A,1997-08-31,bari,parma,0.0,1500.0,1500.0,0.0,1,0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Serie A,1997-08-31,bologna,atalanta,0.0,1500.0,1500.0,-0.0,0,2,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Serie A,1997-08-31,brescia,inter_milan,0.0,1500.0,1500.0,-0.0,0,1,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Serie A,1997-08-31,empoli,roma,0.0,1500.0,1500.0,0.0,1,1,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
data.drop(['team_goals_scored',
           'opponent_goals_scored',
           'team_goals_conceded',
           'opponent_goals_conceded'], axis=1, inplace=True)

In [155]:
# start_date = pd.datetime(2021, 8, 1).date()
# end_date = pd.datetime(2022, 8, 1).date()

# test_index = data[(data['date']>=start_date) & (data['date']<=end_date)].index
data = data[data['league']=='Serie A']

In [156]:
data.head()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored_avg,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
0,Serie A,1997-08-31,atalanta,bologna,1.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Serie A,1997-08-31,bari,parma,0.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Serie A,1997-08-31,bologna,atalanta,0.0,1500.0,1500.0,-0.0,0,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Serie A,1997-08-31,brescia,inter_milan,0.0,1500.0,1500.0,-0.0,0,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Serie A,1997-08-31,empoli,roma,0.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
def build_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    
    X = df_copy.drop(['result'], axis=1).to_numpy()
    Y = np.array(df_copy['result']) / 0.5
    
    X = torch.tensor(X).float()
    Y = torch.tensor(Y).long()
    
    return X, Y

In [158]:
def add_past_to_row(df, i):
    df_past = df.copy()
    df_past.index += i
    df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                               'date',
                                                                               'team',
                                                                               'opponent']}, inplace=True)
    return df_past

In [159]:
def build_matches_dataset(df, past_matches, team):
    dfs_past = []
    df_team = df[df['team']==team]
    for i in range(1, past_matches+1):
        df_past = add_past_to_row(df_team, i)
        dfs_past.append(df_past)

    df_team_joined = df_team.copy()
    for df_past in dfs_past:
        df_team_joined = pd.concat([df_team_joined, df_past],
                                    axis=1,
                                    )
    df_team_joined = df_team_joined[past_matches:-past_matches]

    return df_team_joined

In [160]:
def build_teams_dataset(df, past_matches):
    dfs = []
    for team in df['team'].unique():
        df_team_joined = build_matches_dataset(df, past_matches, team)
        dfs.append(df_team_joined)
    dfs = pd.concat(dfs)
    dfs.insert(5, 'result_0', 0)

    return dfs

In [220]:
def build_wavenet_dataset(df, past_matches=7, date=False):
    df_copy = df.copy()
    df_copy.sort_values(by=['team', 'date'], inplace=True)
    df_copy.reset_index(inplace=True, drop=True)
    dfs = build_teams_dataset(df_copy, past_matches)
    if date:
        dfs = dfs[(dfs.iloc[:, 1]>=start_date) & (dfs.iloc[:, 1]<=end_date)].reset_index(drop=True)
        
    dfs.reset_index(inplace=True, drop=True)
    X, Y = build_dataset(dfs)
    
    return X, Y, dfs

#     return dfs

In [221]:
start_date = pd.datetime(2021, 8, 1).date();
end_date = pd.datetime(2022, 8, 1).date();

test_index = data[(data['date']>=start_date) & (data['date']<=end_date)]

  start_date = pd.datetime(2021, 8, 1).date();
  end_date = pd.datetime(2022, 8, 1).date();


In [282]:
X, Y, dfs = build_wavenet_dataset(data, 7, date=False)

In [283]:
X.shape

torch.Size([11877, 248])

In [284]:
dfs.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_home_goals_conceded_7,league_away_goals_conceded_7,league_home_goals_conceded_avg_7,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7
0,Serie A,2003-11-02,ancona_matelica,siena,0.5,0,1442.574926,1530.365639,-87.790713,1.0,...,2.0,0.0,0.985798,1.555973,0.507383,1.334746,1.334746,0.710336,0.560791,1.756244
1,Serie A,2003-11-09,ancona_matelica,inter_milan,0.0,0,1446.286498,1659.564857,-213.278359,0.0,...,0.857143,1.857143,0.982038,1.480785,0.696725,1.706065,0.781946,1.39345,0.953413,1.975444
2,Serie A,2003-11-23,ancona_matelica,brescia,0.5,0,1439.489123,1517.338957,-77.849834,1.0,...,1.0,1.0,0.997076,1.533417,0.514846,1.319648,0.686217,1.201308,0.948401,0.902917
3,Serie A,2003-11-30,ancona_matelica,sampdoria,0.0,0,1442.795028,1543.763852,-100.968824,0.0,...,0.333333,3.333333,1.025272,1.356725,0.667346,1.862069,1.706897,0.975351,0.667346,4.31216
4,Serie A,2003-12-07,ancona_matelica,bologna,0.0,0,1432.035493,1478.964197,-46.928704,0.0,...,1.555556,0.666667,1.01274,1.419382,0.55621,1.299237,0.883481,0.889935,0.70258,1.162475


In [252]:
@torch.no_grad()
def accuracy(x, y):
#     x, y = {
#         'train': [Xtr, Ytr],
#         'val'  : [Xdev, Ydev],
#         'test' : [Xte, Yte]
#     }[split]
    
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.argmax(logits, dim=1)

    i = 0
    for pred, true in zip(preds, y):
        if pred == true:
            i += 1
    
#     print(f"----{split}----")
    print(f"Correctly predicted {i} out of {y.shape[0]}.")
    print(f"{i / y.shape[0]:.4f}")
    print(f"Guessing would give an accuracy of {1 / len(torch.unique(y))}")

In [253]:
PATH = "../../src/model/trained_models/wavenet_3.pt"
# model.load_state_dict(torch.load(PATH))
model = torch.load(PATH)
model.train()

Sequential(
  (0): Conv1d(1, 32, kernel_size=(31,), stride=(31,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [254]:
accuracy(X, Y)

Correctly predicted 8108 out of 11877.
0.6827
Guessing would give an accuracy of 0.3333333333333333


In [255]:
model.train()

Sequential(
  (0): Conv1d(1, 32, kernel_size=(31,), stride=(31,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [293]:
@torch.no_grad()
def get_single_prediction(x, y):
#     x, y = {
#         'train': [Xtr, Ytr],
#         'val'  : [Xdev, Ydev],
#         'test' : [Xte, Yte]
#     }[split]
    
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.argmax(logits, dim=1)

    i = 0
    for pred, true in zip(preds, y):
        if pred == true:
            i += 1
    
#     print(f"----{split}----")
    print(f"Correctly predicted {i} out of {y.shape[0]}.")
    print(f"{i / y.shape[0]:.4f}")
    print(f"Guessing would give an accuracy of {1 / len(torch.unique(y))}")
    
    return preds

In [294]:
pred_single = get_single_prediction(X, Y)
pred_single

Correctly predicted 8108 out of 11877.
0.6827
Guessing would give an accuracy of 0.3333333333333333


tensor([0, 0, 2,  ..., 2, 0, 2])

In [300]:
@torch.no_grad()
def get_predictions(x, df):
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.softmax(logits, dim=1)
    print(preds)
#     print(preds.numpy())
#     print(pd.DataFrame(preds.numpy()))
    df[['loss', 'draw', 'win']] = pd.DataFrame(preds.numpy())
    
    return df

In [303]:
dfs_preds = dfs.copy()
date = dfs_preds.iloc[:, 1]
dfs_preds.drop('date', axis=1, inplace=True)
dfs_preds['date'] = date
dfs_preds.sort_values('date', inplace=True)
dfs_preds.reset_index(inplace=True)
dfs_preds = get_predictions(X, dfs_preds)

tensor([[0.5598, 0.2679, 0.1723],
        [0.6650, 0.2253, 0.1097],
        [0.0808, 0.2052, 0.7140],
        ...,
        [0.1744, 0.3127, 0.5129],
        [0.7300, 0.2232, 0.0467],
        [0.2424, 0.3295, 0.4281]])


In [304]:
dfs_preds_cut = dfs_preds.iloc[:, 0:9]
dfs_preds_cut[['loss', 'draw', 'win']] = dfs_preds[['loss', 'draw', 'win']]
dfs_preds_cut.head()

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win
0,1208,Serie A,brescia,milan,0.0,0,1498.152602,1479.411525,18.741076,0.559795,0.267868,0.172337
1,9825,Serie A,sampdoria,lazio,0.0,0,1514.867941,1517.31356,-2.44562,0.664995,0.225331,0.109674
2,6350,Serie A,milan,brescia,1.0,0,1479.411525,1498.152602,-18.741076,0.080786,0.205187,0.714027
3,7281,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.173399,0.176076,0.650525
4,5995,Serie A,lecce,fiorentina,0.0,0,1451.653661,1482.888788,-31.235127,0.73132,0.197683,0.070997


In [305]:
dfs_preds_cut['prediction'] = pred_single.numpy()
dfs_preds_cut

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win,prediction
0,1208,Serie A,brescia,milan,0.0,0,1498.152602,1479.411525,18.741076,0.559795,0.267868,0.172337,0
1,9825,Serie A,sampdoria,lazio,0.0,0,1514.867941,1517.313560,-2.445620,0.664995,0.225331,0.109674,0
2,6350,Serie A,milan,brescia,1.0,0,1479.411525,1498.152602,-18.741076,0.080786,0.205187,0.714027,2
3,7281,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.173399,0.176076,0.650525,2
4,5995,Serie A,lecce,fiorentina,0.0,0,1451.653661,1482.888788,-31.235127,0.731320,0.197683,0.070997,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11872,5336,Serie A,juventus,lazio,1.0,0,1673.363534,1718.407595,-45.044061,0.153014,0.351694,0.495291,2
11873,7217,Serie A,milan,fiorentina,1.0,0,1738.790841,1604.339712,134.451129,0.069785,0.345728,0.584488,2
11874,7280,Serie A,monza,salernitana,1.0,0,1557.090581,1563.193522,-6.102941,0.174398,0.312750,0.512852,2
11875,3410,Serie A,fiorentina,milan,0.0,0,1604.339712,1738.790841,-134.451129,0.730028,0.223227,0.046745,0


In [306]:
dfs_preds_cut['prediction'] = dfs_preds_cut['prediction'].replace({2: 1, 1: 0.5})
dfs_preds_cut['correct'] = np.where((dfs_preds_cut['result'] == dfs_preds_cut['prediction']), 1, 0)
dfs_preds_cut['correct'].mean()

0.6826639723835985

In [307]:
dfs_preds_cut['prediction'] = dfs_preds_cut[['loss', 'draw', 'win']].idxmax(axis=1)
dfs_preds_cut['prediction'] = dfs_preds_cut['prediction'].replace({'win': 1, 'draw': 0.5, 'loss': 0})
dfs_preds_cut['correct'] = np.where((dfs_preds_cut['result'] == dfs_preds_cut['prediction']), 1, 0)
dfs_preds_cut

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win,prediction,correct
0,1208,Serie A,brescia,milan,0.0,0,1498.152602,1479.411525,18.741076,0.559795,0.267868,0.172337,0.0,1
1,9825,Serie A,sampdoria,lazio,0.0,0,1514.867941,1517.313560,-2.445620,0.664995,0.225331,0.109674,0.0,1
2,6350,Serie A,milan,brescia,1.0,0,1479.411525,1498.152602,-18.741076,0.080786,0.205187,0.714027,1.0,1
3,7281,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.173399,0.176076,0.650525,1.0,0
4,5995,Serie A,lecce,fiorentina,0.0,0,1451.653661,1482.888788,-31.235127,0.731320,0.197683,0.070997,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11872,5336,Serie A,juventus,lazio,1.0,0,1673.363534,1718.407595,-45.044061,0.153014,0.351694,0.495291,1.0,1
11873,7217,Serie A,milan,fiorentina,1.0,0,1738.790841,1604.339712,134.451129,0.069785,0.345728,0.584488,1.0,1
11874,7280,Serie A,monza,salernitana,1.0,0,1557.090581,1563.193522,-6.102941,0.174398,0.312750,0.512852,1.0,1
11875,3410,Serie A,fiorentina,milan,0.0,0,1604.339712,1738.790841,-134.451129,0.730028,0.223227,0.046745,0.0,1


In [308]:
dfs_preds_cut['correct'].mean()

0.6826639723835985

In [309]:
dfs_preds_cut['correct'].value_counts()

1    8108
0    3769
Name: correct, dtype: int64