In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

%matplotlib inline

In [8]:
class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [9]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
data = loader.get_data()

In [10]:
data.head()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
0,Serie A,1997-08-31,atalanta,bologna,1.0,1500.0,1500.0,0.0,1,4.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Serie A,1997-08-31,bari,parma,0.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Serie A,1997-08-31,bologna,atalanta,0.0,1500.0,1500.0,-0.0,0,2.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Serie A,1997-08-31,brescia,inter_milan,0.0,1500.0,1500.0,-0.0,0,1.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Serie A,1997-08-31,empoli,roma,0.0,1500.0,1500.0,0.0,1,1.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
data.drop(['team_goals_scored',
           'opponent_goals_scored',
           'team_goals_conceded',
           'opponent_goals_conceded'], axis=1, inplace=True)

In [12]:
# start_date = pd.datetime(2021, 8, 1).date()
# end_date = pd.datetime(2022, 8, 1).date()

# test_index = data[(data['date']>=start_date) & (data['date']<=end_date)].index
data = data[data['league']=='Serie A']

In [13]:
data.head()

Unnamed: 0,league,date,team,opponent,result,elo_team,elo_opponent,elo_diff,home,team_goals_scored_avg,...,league_home_goals_conceded,league_away_goals_conceded,league_home_goals_conceded_avg,league_away_goals_conceded_avg,team_attack_strength,team_defense_strength,opponent_attack_strength,opponent_defense_strength,team_lambda,opponent_lambda
0,Serie A,1997-08-31,atalanta,bologna,1.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Serie A,1997-08-31,bari,parma,0.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Serie A,1997-08-31,bologna,atalanta,0.0,1500.0,1500.0,-0.0,0,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Serie A,1997-08-31,brescia,inter_milan,0.0,1500.0,1500.0,-0.0,0,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Serie A,1997-08-31,empoli,roma,0.0,1500.0,1500.0,0.0,1,0.0,...,1.444444,1.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
def build_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    
    X = df_copy.drop(['result'], axis=1).to_numpy()
    Y = np.array(df_copy['result']) / 0.5
    
    X = torch.tensor(X).float()
    Y = torch.tensor(Y).long()
    
    return X, Y

In [15]:
def add_past_to_row(df, i):
    df_past = df.copy()
    df_past.index += i
    df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                               'date',
                                                                               'team',
                                                                               'opponent']}, inplace=True)
    return df_past

In [16]:
def build_matches_dataset(df, past_matches, team):
    dfs_past = []
    df_team = df[df['team']==team]
    for i in range(1, past_matches+1):
        df_past = add_past_to_row(df_team, i)
        dfs_past.append(df_past)

    df_team_joined = df_team.copy()
    for df_past in dfs_past:
        df_team_joined = pd.concat([df_team_joined, df_past],
                                    axis=1,
                                    )
    df_team_joined = df_team_joined[past_matches:-past_matches]

    return df_team_joined

In [17]:
def build_teams_dataset(df, past_matches):
    dfs = []
    for team in df['team'].unique():
        df_team_joined = build_matches_dataset(df, past_matches, team)
        dfs.append(df_team_joined)
    dfs = pd.concat(dfs)
    dfs.insert(5, 'result_0', 0)

    return dfs

In [18]:
def build_wavenet_dataset(df, past_matches=7, date=False):
    df_copy = df.copy()
    df_copy.sort_values(by=['team', 'date'], inplace=True)
    df_copy.reset_index(inplace=True, drop=True)
    dfs = build_teams_dataset(df_copy, past_matches)
    if date:
        dfs = dfs[(dfs.iloc[:, 1]>=start_date) & (dfs.iloc[:, 1]<=end_date)].reset_index(drop=True)
        
    dfs.reset_index(inplace=True, drop=True)
    X, Y = build_dataset(dfs)
    
    return X, Y, dfs

#     return dfs

In [19]:
start_date = pd.datetime(2021, 8, 1).date();
end_date = pd.datetime(2022, 8, 1).date();

test_index = data[(data['date']>=start_date) & (data['date']<=end_date)]

  start_date = pd.datetime(2021, 8, 1).date();
  end_date = pd.datetime(2022, 8, 1).date();


In [20]:
X, Y, dfs = build_wavenet_dataset(data, 7, date=False)

In [21]:
X.shape

torch.Size([9446, 248])

In [22]:
dfs.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_home_goals_conceded_7,league_away_goals_conceded_7,league_home_goals_conceded_avg_7,league_away_goals_conceded_avg_7,team_attack_strength_7,team_defense_strength_7,opponent_attack_strength_7,opponent_defense_strength_7,team_lambda_7,opponent_lambda_7
0,Serie A,2003-11-02,ancona_matelica,siena,0.5,0,1435.920241,1522.633802,-86.71356,1.0,...,2.0,0.0,0.985798,1.555973,0.507383,1.334746,1.334746,0.710336,0.560791,1.756244
1,Serie A,2003-11-09,ancona_matelica,inter_milan,0.0,0,1439.588122,1641.224976,-201.636853,0.0,...,0.857143,1.857143,0.982038,1.480785,0.696725,1.706065,0.781946,1.39345,0.953413,1.975444
2,Serie A,2003-11-23,ancona_matelica,brescia,0.5,0,1432.432001,1501.641748,-69.209747,1.0,...,1.0,1.0,0.997076,1.533417,0.514846,1.319648,0.739003,1.166984,0.921304,0.972372
3,Serie A,2003-11-30,ancona_matelica,sampdoria,0.0,0,1435.38112,1532.285537,-96.904416,0.0,...,0.333333,3.333333,1.025272,1.356725,0.667346,1.862069,1.706897,0.975351,0.667346,4.31216
4,Serie A,2003-12-07,ancona_matelica,bologna,0.0,0,1424.459606,1466.965813,-42.506207,0.0,...,1.555556,0.666667,1.01274,1.419382,0.55621,1.299237,0.883481,0.889935,0.70258,1.162475


In [23]:
@torch.no_grad()
def accuracy(x, y):
#     x, y = {
#         'train': [Xtr, Ytr],
#         'val'  : [Xdev, Ydev],
#         'test' : [Xte, Yte]
#     }[split]
    
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.argmax(logits, dim=1)

    i = 0
    for pred, true in zip(preds, y):
        if pred == true:
            i += 1
    
#     print(f"----{split}----")
    print(f"Correctly predicted {i} out of {y.shape[0]}.")
    print(f"{i / y.shape[0]:.4f}")
    print(f"Guessing would give an accuracy of {1 / len(torch.unique(y))}")

In [24]:
PATH = "../../src/model/trained_models/wavenet_3.pt"
# model.load_state_dict(torch.load(PATH))
model = torch.load(PATH)
model.train()

Sequential(
  (0): Conv1d(1, 32, kernel_size=(31,), stride=(31,))
  (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Conv1d(32, 64, kernel_size=(2,), stride=(2,))
  (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Conv1d(64, 128, kernel_size=(2,), stride=(2,))
  (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Tanh()
  (9): Flatten(start_dim=1, end_dim=-1)
  (10): Linear(in_features=256, out_features=3, bias=True)
)

In [25]:
accuracy(X, Y)

Correctly predicted 5703 out of 9446.
0.6037
Guessing would give an accuracy of 0.3333333333333333


In [26]:
# model.train()

In [27]:
@torch.no_grad()
def get_single_prediction(x, y):
#     x, y = {
#         'train': [Xtr, Ytr],
#         'val'  : [Xdev, Ydev],
#         'test' : [Xte, Yte]
#     }[split]
    
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.argmax(logits, dim=1)

    i = 0
    for pred, true in zip(preds, y):
        if pred == true:
            i += 1
    
#     print(f"----{split}----")
    print(f"Correctly predicted {i} out of {y.shape[0]}.")
    print(f"{i / y.shape[0]:.4f}")
    print(f"Guessing would give an accuracy of {1 / len(torch.unique(y))}")
    
    return preds

In [28]:
pred_single = get_single_prediction(X, Y)
pred_single

Correctly predicted 5703 out of 9446.
0.6037
Guessing would give an accuracy of 0.3333333333333333


tensor([0, 2, 2,  ..., 0, 0, 0])

In [29]:
@torch.no_grad()
def get_predictions(x, df):
    x = x[:, None, :]
    logits = model(x)
    preds = []
    preds = torch.softmax(logits, dim=1)
    print(preds)
#     print(preds.numpy())
#     print(pd.DataFrame(preds.numpy()))
    df[['loss', 'draw', 'win']] = pd.DataFrame(preds.numpy())
    
    return df

In [30]:
dfs_preds = dfs.copy()
date = dfs_preds.iloc[:, 1]
dfs_preds.drop('date', axis=1, inplace=True)
dfs_preds['date'] = date
dfs_preds.sort_values('date', inplace=True)
dfs_preds.reset_index(inplace=True)
dfs_preds = get_predictions(X, dfs_preds)

tensor([[0.5848, 0.2407, 0.1745],
        [0.2581, 0.1975, 0.5444],
        [0.1976, 0.1742, 0.6282],
        ...,
        [0.5820, 0.1258, 0.2922],
        [0.6151, 0.2132, 0.1718],
        [0.4008, 0.2720, 0.3273]])


In [31]:
dfs_preds_cut = dfs_preds.iloc[:, 0:9]
dfs_preds_cut[['loss', 'draw', 'win']] = dfs_preds[['loss', 'draw', 'win']]
dfs_preds_cut.head()

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win
0,3775,Serie A,juventus,napoli,1.0,0,1575.166302,1441.459893,133.706409,0.5848,0.240748,0.174451
1,43,Serie A,atalanta,inter_milan,0.0,0,1503.323008,1594.043514,-90.720506,0.25815,0.19749,0.54436
2,5862,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.197606,0.174188,0.628206
3,2033,Serie A,empoli,parma,0.0,0,1472.917207,1551.843017,-78.925811,0.748667,0.171795,0.079538
4,7056,Serie A,roma,bari,1.0,0,1525.651095,1464.290814,61.360281,0.617181,0.246166,0.136652


In [32]:
dfs_preds_cut['prediction'] = pred_single.numpy()
dfs_preds_cut

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win,prediction
0,3775,Serie A,juventus,napoli,1.0,0,1575.166302,1441.459893,133.706409,0.584800,0.240748,0.174451,0
1,43,Serie A,atalanta,inter_milan,0.0,0,1503.323008,1594.043514,-90.720506,0.258150,0.197490,0.544360,2
2,5862,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.197606,0.174188,0.628206,2
3,2033,Serie A,empoli,parma,0.0,0,1472.917207,1551.843017,-78.925811,0.748667,0.171795,0.079538,0
4,7056,Serie A,roma,bari,1.0,0,1525.651095,1464.290814,61.360281,0.617181,0.246166,0.136652,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9441,4669,Serie A,lazio,sassuolo,1.0,0,1655.913856,1527.800163,128.113693,0.060677,0.135545,0.803777,2
9442,8532,Serie A,spezia,torino,1.0,0,1536.936675,1605.247055,-68.310380,0.313443,0.204234,0.482323,2
9443,7926,Serie A,roma,fiorentina,0.0,0,1644.545349,1591.578409,52.966940,0.582009,0.125801,0.292189,0
9444,8375,Serie A,sassuolo,lazio,0.0,0,1527.800163,1655.913856,-128.113693,0.615067,0.213165,0.171768,0


In [33]:
dfs_preds_cut['prediction'] = dfs_preds_cut['prediction'].replace({2: 1, 1: 0.5})
dfs_preds_cut['correct'] = np.where((dfs_preds_cut['result'] == dfs_preds_cut['prediction']), 1, 0)
dfs_preds_cut['correct'].mean()

0.6037476180393817

In [34]:
dfs_preds_cut['prediction'] = dfs_preds_cut[['loss', 'draw', 'win']].idxmax(axis=1)
dfs_preds_cut['prediction'] = dfs_preds_cut['prediction'].replace({'win': 1, 'draw': 0.5, 'loss': 0})
dfs_preds_cut['correct'] = np.where((dfs_preds_cut['result'] == dfs_preds_cut['prediction']), 1, 0)
dfs_preds_cut

Unnamed: 0,index,league,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,loss,draw,win,prediction,correct
0,3775,Serie A,juventus,napoli,1.0,0,1575.166302,1441.459893,133.706409,0.584800,0.240748,0.174451,0.0,0
1,43,Serie A,atalanta,inter_milan,0.0,0,1503.323008,1594.043514,-90.720506,0.258150,0.197490,0.544360,1.0,0
2,5862,Serie A,napoli,juventus,0.0,0,1441.459893,1575.166302,-133.706409,0.197606,0.174188,0.628206,1.0,0
3,2033,Serie A,empoli,parma,0.0,0,1472.917207,1551.843017,-78.925811,0.748667,0.171795,0.079538,0.0,1
4,7056,Serie A,roma,bari,1.0,0,1525.651095,1464.290814,61.360281,0.617181,0.246166,0.136652,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9441,4669,Serie A,lazio,sassuolo,1.0,0,1655.913856,1527.800163,128.113693,0.060677,0.135545,0.803777,1.0,1
9442,8532,Serie A,spezia,torino,1.0,0,1536.936675,1605.247055,-68.310380,0.313443,0.204234,0.482323,1.0,1
9443,7926,Serie A,roma,fiorentina,0.0,0,1644.545349,1591.578409,52.966940,0.582009,0.125801,0.292189,0.0,1
9444,8375,Serie A,sassuolo,lazio,0.0,0,1527.800163,1655.913856,-128.113693,0.615067,0.213165,0.171768,0.0,1


In [35]:
dfs_preds_cut['correct'].mean()

0.6037476180393817

In [36]:
dfs_preds_cut['prediction'].value_counts()

0.0    4064
1.0    3990
0.5    1392
Name: prediction, dtype: int64