In [45]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import torch

In [9]:
class Loader:

    def __init__(self, files):
        self.files = files

    def get_data(self):
        dfs = []
        for file in self.files:
            df = self.load_past_matches(file)
            dfs.append(df)
        
        df_join = self.join_data(dfs[0], dfs[1])

        return df_join

    def load_past_matches(self, file):
        df = pd.read_csv(f'../data/{file}')
        df.drop('Unnamed: 0', axis=1, inplace=True)
        df['date'] = pd.to_datetime(df['date']).dt.date

        return df

    def join_data(self, df1, df2):
        df = pd.merge(df1, df2,  how='inner',
            left_on=['league', 'date','team', 'opponent', 'home'],
            right_on=['league', 'date','team', 'opponent', 'home'])
        df.sort_values(by=['date', 'league', 'team', 'opponent'], inplace=True)
        df.reset_index(inplace=True, drop=True)
              
        return df

In [10]:
FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
data = loader.get_data()
data.shape

(74422, 39)

In [11]:
data.drop(['team_goals_scored',
           'opponent_goals_scored',
           'team_goals_conceded',
           'opponent_goals_conceded'], axis=1, inplace=True)

In [12]:
class Wavenet:

    def __init__(self, df, future=None, past_matches=7, future_date=None):
        self.df = df
        self.future = future
        self.X = None
        self.Y = None
        self.dfs = None
        self.dfs_future = None
        self.dfs_all = None
        self.past_matches = past_matches
        self.future_date = future_date
        self.index_columns = ['league', 'date', 'team', 'opponent', 'result']

    def set_up_data(self, df):
        df.drop(['team_goals_scored',
            'opponent_goals_scored',
            'team_goals_conceded',
            'opponent_goals_conceded'], axis=1, inplace=True)

    def build_dataset(self, df):
        df_copy = df.copy()
        df_copy.reset_index(inplace=True, drop=True)
        date = df_copy[['date']].iloc[:,0]
        df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
        df_copy['date'] = date
        df_copy.sort_values(by=['date'], inplace=True)
        df_copy = df_copy[df_copy['date']<self.future_date]
        df_copy.drop(['date'], axis=1, inplace=True)
        
        self.X = df_copy.drop(['result'], axis=1).to_numpy()
        self.Y = np.array(df_copy['result']) / 0.5
        
        self.X = torch.tensor(self.X).float().to("mps")
        self.Y = torch.tensor(self.Y).long().to("mps")

    def add_past_to_row(self, df, i):
        df_past = df.copy()
        df_past.index += i
        df_past.rename(columns={c: c+f'_{i}' for c in df_past.columns if c not in ['league',
                                                                                'date',
                                                                                'team',
                                                                                'opponent']}, inplace=True)
        return df_past

    def build_matches_dataset(self, df, past_matches, team):
        dfs_past = []
        df_team = df[df['team']==team]
        for i in range(1, past_matches+1):
            df_past = self.add_past_to_row(df_team, i)
            dfs_past.append(df_past)

        df_team_joined = df_team.copy()
        for df_past in dfs_past:
            df_team_joined = pd.concat([df_team_joined, df_past],
                                        axis=1,
                                        )
        df_team_joined = df_team_joined[past_matches:-past_matches]

        return df_team_joined

    def build_teams_dataset(self, df, past_matches):
        dfs = []
        for team in df['team'].unique():
            df_team_joined = self.build_matches_dataset(df, past_matches, team)
            dfs.append(df_team_joined)
        dfs = pd.concat(dfs)
        dfs.insert(5, 'result_0', 0)
        dfs = self.add_opponent_past_matches(dfs)
        dfs = self.ordering_columns(dfs)

        return dfs
    
    def add_opponent_past_matches(self, df):
        df = df.loc[:,~df.columns.duplicated()].copy()
        df_copy = df.copy()
        keep_same = {'league', 'date', 'team', 'opponent', 'result'}
        df_copy.columns = ['{}{}'.format(c, '' if c in keep_same else '_y') for c in df_copy.columns]
        df_copy['result'] = 1 - df_copy['result']
        df_copy.rename(columns={'team': 'opponent', 'opponent': 'team'}, inplace=True)
        df_combined = pd.merge(df, df_copy, how='left',
                              left_on=['league', 'date', 'team', 'opponent', 'result'],
                              right_on=['league', 'date', 'team', 'opponent', 'result'])
        return df_combined

    def ordering_columns(self, df):
        index_columns = self.index_columns
        template_columns = ['result', 'elo_team', 'elo_opponent', 'elo_diff', 'home', 'team_goals_scored_avg',
                           'team_goals_conceded_avg', 'team_goals_scored_avg_home',
                           'team_goals_conceded_avg_home', 'team_goals_scored_avg_away',
                           'team_goals_conceded_avg_away', 'opponent_goals_scored_avg',
                           'opponent_goals_conceded_avg', 'opponent_goals_scored_avg_home',
                           'opponent_goals_conceded_avg_home', 'opponent_goals_scored_avg_away',
                           'opponent_goals_conceded_avg_away', 'league_home_goals_scored',
                           'league_away_goals_scored', 'league_home_goals_scored_avg',
                           'league_away_goals_scored_avg', 'league_home_goals_conceded',
                           'league_away_goals_conceded', 'league_home_goals_conceded_avg',
                           'league_away_goals_conceded_avg', 'team_attack_strength',
                           'team_defense_strength', 'opponent_attack_strength',
                           'opponent_defense_strength', 'team_lambda', 'opponent_lambda']
        template_columns_y = [s + f'_y' for s in template_columns]
        columns = [[index_columns + ['result_0'] + template_columns[1:] + ['result_0_y'] + template_columns_y[1:]]]
        for i in range(1, self.past_matches+1):
            team_cols = [s + f'_{i}' for s in template_columns]
            opp_cols = [s + f'_{i}_y' for s in template_columns]
            columns.append([team_cols, opp_cols])

        columns = [subitem for sublist in columns for item in sublist for subitem in item]
        df = df[columns]
        df.dropna(inplace=True)
        df.reset_index(inplace=True, drop=True)
        return df
    
    def get_final_entry(self, df, team_or_opponent):
        df = df.copy()
        df = df.loc[:,~df.columns.duplicated()].copy()
        df.sort_values(by='date', inplace=True)
        df.reset_index(inplace=True, drop=True)
        df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
        df = df.loc[:, ~df.columns.str.contains('_y')]
        df = df.drop(['home'], axis=1)
        return df
    
    def team_to_opponent(self, df):
        df_opponent = df.copy()     
        keep_same = {'league', 'date', 'team', 'opponent', 'result'}
        df_opponent.columns = ['{}{}'.format(c, '' if c in keep_same else '_y') for c in df_opponent.columns]
        df_opponent['result'] = 1 - df_opponent['result']
        df_opponent = self.drop_common_columns(df_opponent, 'opponent')
        return df_opponent
    
    def drop_common_columns(self, df, team_or_opp):
        columns_to_drop = [item for item in self.index_columns if item not in [team_or_opp]]
        df.drop(columns=columns_to_drop, axis=1, inplace=True)
        return df
    
    def add_stats_to_future(self, stats, future):
        stats = self.get_final_entry(stats, 'team')
        stats_opp = self.team_to_opponent(stats)
        stats = self.drop_common_columns(stats, 'team')

        df_future = pd.merge(future, stats, how='left', on='team')
        df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
        df_future['home_y'] = 1 - df_future['home']
        df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
        df_future['date'] = df_future['date'].dt.date
        df_future.sort_values(by='date', inplace=True)
        return df_future
    
    def remove_duplicate_columns(self, df):
        df = df.loc[:,~df.columns.duplicated()].copy()
        return df

    def build_wavenet_dataset(self):
        df_copy = self.df.copy()
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.build_dataset(self.dfs)
        
    def order_date(self, df):
        df = df.sort_values(by=['team', 'date'])
        df = df.reset_index(drop=True)
        return df

    def build_wavenet_dataset_past_future(self):
        df_copy = self.df.copy()
        self.set_up_data(df_copy)
        df_copy.sort_values(by=['team', 'date'], inplace=True)
        df_copy.reset_index(inplace=True, drop=True)
        self.dfs = self.build_teams_dataset(df_copy, self.past_matches)
        self.dfs_future = self.add_stats_to_future(self.dfs, self.future)
        self.dfs_future = self.dfs_future[self.dfs.drop(['result'], axis=1).columns]
        self.dfs_future = self.order_date(self.dfs_future)
        self.dfs = self.dfs.loc[:,~self.dfs.columns.duplicated()].copy()
        self.dfs = self.dfs.drop_duplicates(subset=['date', 'team', 'opponent'])
        self.dfs_all = self.dfs.copy()
        self.dfs = self.dfs[self.dfs['date']<future_date]
        self.build_dataset(self.dfs)
        self.dfs = self.remove_duplicate_columns(self.dfs)
    

def load_future_matches():
    df = pd.read_csv('../data/future_matches.csv', parse_dates=True, dayfirst=True)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df = duplicate_to_team_and_opponent(df)
    df.reset_index(inplace=True, drop=True)
    return df

def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches

def build_future_dataset(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True, drop=True)
    date = df_copy[['date']].iloc[:,0]
    df_copy.drop(['league', 'date', 'team', 'opponent'], axis=1, inplace=True)
    df_copy['date'] = date
    df_copy.sort_values(by=['date'], inplace=True)
    df_copy.drop(['date'], axis=1, inplace=True)
    df_copy.to_csv("trained_models/future_data.csv")
    X = df_copy.to_numpy()
    X = torch.tensor(X).float().to("mps")
    return X

def add_stats_to_future(stats, future):
    columns = stats.drop(['result'], axis=1).columns
    stats = get_final_entry(stats, 'team')
    stats_opp = team_to_opponent(stats)

    df_future = pd.merge(future, stats, how='left', on='team')
    df_future = pd.merge(df_future, stats_opp, how='left', on='opponent')
    df_future['elo_diff'] = df_future['elo_team'] - df_future['elo_opponent']
    df_future['date'] = pd.to_datetime(df_future['date'], dayfirst=True)
    df_future['date'] = df_future['date'].dt.date

    df_future.sort_values(by='date', inplace=True)
    df_future = df_future[columns]

    return df_future


def get_final_entry(df, team_or_opponent):
    df.sort_values(by='date', inplace=True)
    df.reset_index(inplace=True, drop=True)
    df.drop_duplicates(subset=team_or_opponent, keep='last', inplace=True)
    df = df.loc[:, df.columns.str.contains(team_or_opponent) | df.columns.str.contains('league_')]

    return df


def duplicate_to_team_and_opponent(df_matches):
    df_matches_copy = df_matches.copy()
    df_matches = df_matches.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                            })
    df_matches_copy = df_matches_copy.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                                    })
    df_matches_copy = df_matches_copy[['league', 'date', 'team', 'opponent' 
                                        ]]
    df_matches.loc[:, 'home'] = 1
    df_matches_copy.loc[:, 'home'] = 0
    df_matches = pd.concat([df_matches, df_matches_copy])
    df_matches.sort_values(by='date', inplace=True)

    return df_matches


def team_to_opponent(df):
    df_opponent = df.copy()
    df_opponent = df_opponent.loc[:, df_opponent.columns.str.contains("team")]
    df_opponent.columns = df_opponent.columns.str.replace("team", "opponent")

    return df_opponent

def define_split(X, Y, tr_split, val_split):
    n1 = int(0.8 * X.shape[0])
    n2 = int(0.9 * X.shape[0])

    Xtr, Ytr = X[:n1], Y[:n1]
    Xdev, Ydev = X[n1:n2], Y[n1:n2]
    Xte, Yte = X[n2:], Y[n2:]


FILES = ["elos_matches.csv", "goals_matches.csv"]
loader = Loader(FILES)
future = load_future_matches()
future_date = future['date'][0]
data = loader.get_data()
wavenet = Wavenet(data, future, 15, future_date)
wavenet.build_wavenet_dataset_past_future()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
  self.dfs = self.dfs[self.dfs['date']<future_date]
  df_copy = df_copy[df_copy['date']<self.future_date]


In [14]:
wavenet.dfs.head()

Unnamed: 0,league,date,team,opponent,result,result_0,elo_team,elo_opponent,elo_diff,home,...,league_home_goals_conceded_15_y,league_away_goals_conceded_15_y,league_home_goals_conceded_avg_15_y,league_away_goals_conceded_avg_15_y,team_attack_strength_15_y,team_defense_strength_15_y,opponent_attack_strength_15_y,opponent_defense_strength_15_y,team_lambda_15_y,opponent_lambda_15_y
0,Serie B,2012-10-30,a_s_d_lanciano_calcio_1920,u_s_livorno_calcio_1915,0.5,0,1402.793496,1527.718173,-124.924677,1.0,...,1.615385,1.846154,1.395872,1.438273,0.829514,0.768465,0.695278,0.980334,1.135124,0.768465
1,Serie B,2012-11-03,a_s_d_lanciano_calcio_1920,l_r_vicenza,1.0,0,1407.96592,1435.081701,-27.115781,0.0,...,1.222222,2.777778,1.544737,2.189474,0.432692,0.749574,0.749574,0.697115,0.660425,0.867928
2,Serie B,2012-11-10,a_s_d_lanciano_calcio_1920,juve_stabia,0.0,0,1424.134231,1486.203601,-62.06937,0.0,...,1.090909,1.272727,1.155077,1.540118,1.196081,1.139135,1.048005,0.751822,1.384936,1.378953
3,Serie B,2012-11-17,a_s_d_lanciano_calcio_1920,bari,0.0,0,1411.785832,1458.981092,-47.195261,1.0,...,1.25,1.583333,1.10723,1.50184,1.235895,1.156476,0.911163,0.665482,0.91066,1.582546
4,Serie B,2012-11-24,a_s_d_lanciano_calcio_1920,ternana,0.5,0,1398.810974,1470.502541,-71.691567,0.0,...,1.25,1.583333,1.10723,1.50184,1.121431,0.950689,0.903154,1.156476,1.947749,0.950689


In [62]:
wavenet.dfs.sort_values(by='date', inplace=True)
wavenet.dfs.reset_index(inplace=True, drop=True)
n1 = round(wavenet.dfs.shape[0] * 0.8)
n2 = round(wavenet.dfs.shape[0] * 0.9)

train_set = wavenet.dfs[:n1]
X_train = train_set.drop(['league', 'date', 'team', 'opponent', 'result'], axis=1)
Y_train = (train_set['result'] / 0.5).astype(int)

val_set = wavenet.dfs[n1:n2]
X_val = val_set.drop(['league', 'date', 'team', 'opponent', 'result'], axis=1)
Y_val = (val_set['result'] / 0.5).astype(int)

test_set = wavenet.dfs[n2:]
X_test = test_set.drop(['league', 'date', 'team', 'opponent', 'result'], axis=1)
Y_test= (test_set['result'] / 0.5).astype(int)

In [55]:
X_train.shape

(51394, 992)

In [56]:
Y_train.shape

(51394,)

In [57]:
print(Y_train.unique())
print(Y_val.unique())

[0 1 2]
[0 2 1]


In [63]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

In [64]:
classifier = LogisticRegression(random_state = 10)
classifier.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
Y_pred = classifier.predict(X_train)
test_acc = accuracy_score(Y_train, Y_pred)
print("The Accuracy for Test Set is {}".format(test_acc*100))

The Accuracy for Test Set is 52.383546717515664


In [66]:
Y_pred = classifier.predict(X_val)
test_acc = accuracy_score(Y_val, Y_pred)
print("The Accuracy for Test Set is {}".format(test_acc*100))

The Accuracy for Test Set is 48.50560398505604


In [67]:
Y_pred = classifier.predict(X_test)
test_acc = accuracy_score(Y_test, Y_pred)
print("The Accuracy for Test Set is {}".format(test_acc*100))

The Accuracy for Test Set is 46.03051058530511
