In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [None]:
# Result indicator columns; a later cell moves them to the end of the frame and
# converts them to 0/1 flags.
# NOTE(review): presumably Away win / Draw / Home win — confirm against the data.
result_cols = ['result_A', 'result_D', 'result_H']

# Layer-1 per-match statistics table.
df = pd.read_csv('../data/preprocessed/layer1/matches_stats_data_1.csv')

In [None]:
# NOTE: this cell rewrites `df` in place and is not idempotent: on a second run
# `season` already holds ints (so slicing it fails) and `round` is already
# dropped. Re-run the loading cell first.

# Move the result columns to the end, keeping all other columns in their
# original relative order.
feature_cols = [name for name in df.columns if name not in result_cols]
target_cols = [name for name in result_cols if name in df.columns]
df = df[feature_cols + target_cols]

# Encode the season label as a small integer: characters 2-3 of the label,
# minus 19.
df['season'] = df['season'].apply(lambda label: int(label[2:4]) - 19)

# Collapse each truthy/falsy result flag to 1/0.
for flag_col in ('result_H', 'result_A', 'result_D'):
    df[flag_col] = df[flag_col].apply(lambda flag: 1 if flag else 0)

df = df.drop(columns='round')

In [None]:
# Per-match columns taken from the match table for the home side.
# Note that 'gf'/'ga' appear in the opposite order in `away_columns`.
# NOTE(review): presumably goals for / goals against from the home side's
# point of view — confirm against the data dictionary.
home_columns = [
    'gf',
    'ga',
    # expected-goals figures
    'h_xg',
    'h_xga',
    # attacking / possession stats
    'h_standard sot',
    'h_kp',
    'h_xa',
    'h_poss_x',
    'h_touches att pen',
    'h_carries prgdist',
    'h_progressive passing dist',
    # defensive stats
    'h_tackles tklw',
    'h_challenges tkl%',
    'h_saves',
    # strength / rating columns
    'h_strength_overall_home',
    'h_overall_rating',
    'h_attack_rating',
    'h_midfield_rating',
    'h_defence_rating',
    'h_avg_age',
    'soh',
]

# Away-side counterpart of `home_columns` (same positions, 'a_' prefix,
# 'ga'/'gf' swapped, 'soa' instead of 'soh').
away_columns = [
    'ga',
    'gf',
    # expected-goals figures
    'a_xg',
    'a_xga',
    # attacking / possession stats
    'a_standard sot',
    'a_kp',
    'a_xa',
    'a_poss_x',
    'a_touches att pen',
    'a_carries prgdist',
    'a_progressive passing dist',
    # defensive stats
    'a_tackles tklw',
    'a_challenges tkl%',
    'a_saves',
    # strength / rating columns
    'a_strength_overall_away',
    'a_overall_rating',
    'a_attack_rating',
    'a_midfield_rating',
    'a_defence_rating',
    'a_avg_age',
    'soa',
]

# NOTE(review): looks like bookmaker odds for home / draw / away — confirm.
bet_columns = [
    'b365h',
    'b365d',
    'b365a',
]

In [None]:
def _load_team_features(csv_path):
    """Read one layer-2 feature CSV and index it by (team, opponent_team).

    The `season` label is encoded as an integer (characters 2-3 of the label,
    minus 19), and the result is sorted by its index.
    """
    frame = pd.read_csv(csv_path)
    frame['season'] = frame['season'].apply(lambda label: int(label[2:4]) - 19)
    return frame.set_index(['team', 'opponent_team']).sort_index()


# The `*_run` frames come from the `_0` files and the `*_sub` frames from the
# `_1` files, for the home and away side respectively.
Home_run = _load_team_features('../data/preprocessed/layer2/fts_home_0.csv')
Away_run = _load_team_features('../data/preprocessed/layer2/fts_away_0.csv')
Home_sub = _load_team_features('../data/preprocessed/layer2/fts_home_1.csv')
Away_sub = _load_team_features('../data/preprocessed/layer2/fts_away_1.csv')

In [None]:
# Keep only rows with season code 1. With the `int(label[2:4]) - 19` encoding
# used when loading, that is the season whose label has "20" at characters 2-3.
ss20 = df.loc[df['season'].eq(1)]

# The same season slice of each layer-2 frame, without the now-constant
# `season` column.
hr20, hs20, ar20, as20 = (
    frame.loc[frame['season'].eq(1)].drop(columns='season')
    for frame in (Home_run, Home_sub, Away_run, Away_sub)
)

In [None]:
def _match_feature_row(run, sub, team, opponent, match_stats):
    """Build the flat feature vector for `team`'s match against `opponent`.

    Concatenates, in this order: the column means of `run` over column
    positions 1 up to (excluding) the last two, the column means of `sub` over
    positions 2 up to (excluding) the last two, and `match_stats`.
    """
    run_rows = run.loc[team, opponent]
    sub_rows = sub.loc[team, opponent]
    return np.hstack([
        run_rows.iloc[:, 1:-2].mean().values,
        sub_rows.iloc[:, 2:-2].mean().values,
        match_stats,
    ])


def _prematch_windows(matches, prematch):
    """Map each opponent to the stacked feature rows of the preceding matches.

    `matches` is a list of `(opponent, feature_row)` pairs for a single team.
    Element 0 of every feature row is treated as the round number: it orders
    the matches and is then stripped from the returned arrays.
    """
    # Order by round so that "previous matches" really are earlier fixtures,
    # whatever order the caller's index happened to be in.
    ordered = sorted(matches, key=lambda item: item[1][0])
    windows = {}
    # The first match has no history, so it gets no entry.
    for idx in range(1, len(ordered)):
        start = max(0, idx - prematch + 1)
        stacked = np.vstack([row for _, row in ordered[start:idx + 1]])
        opponent = ordered[idx][0]
        # Drop the current match (last row) and the round column (column 0).
        windows[opponent] = stacked[:-1, 1:]
    return windows


def prepare_data(org_df: pd.DataFrame,hr:pd.DataFrame, ar:pd.DataFrame,
                hsub: pd.DataFrame, asub: pd.DataFrame, prematch: int =5):
    """Collect, for every match, the features of the team's preceding matches.

    Parameters
    ----------
    org_df : pd.DataFrame
        Match table with `hometeam` and `awayteam` columns plus the columns
        named in the module-level `home_columns` and `away_columns` lists.
        The first row matching a (hometeam, awayteam) pair is used.
    hr, ar : pd.DataFrame
        Home / away frames indexed by (team, opponent_team). Column means over
        positions 1 up to (excluding) the last two are used as features.
        Assumes position 1 is the round number: the first feature is used to
        order matches and is dropped from the output. TODO confirm the column
        layout against the layer-2 files.
    hsub, asub : pd.DataFrame
        Home / away frames with the same index. Column means over positions 2
        up to (excluding) the last two are used as features.
    prematch : int, default 5
        Window length including the current match, so each entry holds at
        most `prematch - 1` earlier matches.

    Returns
    -------
    dict
        `{team: {'HOME': {opponent: ndarray}, 'AWAY': {opponent: ndarray}}}`.
        Each array has one row per earlier match (oldest first) and one column
        per feature. A team's first home / away match has no entry.

    Raises
    ------
    ValueError
        If `prematch` is smaller than 1.
    """
    if prematch < 1:
        raise ValueError(f"prematch must be >= 1, got {prematch}")

    match_list = hr.index.drop_duplicates().values.tolist()

    # Every home team gets an entry up front, even if it ends up with no windows.
    final_dict = {}
    for hometeam, _ in match_list:
        final_dict.setdefault(hometeam, {'HOME': {}, 'AWAY': {}})

    # team -> list of (opponent, feature_row). Carrying the opponent with the
    # row avoids looking it up again from the round number later.
    h_team_data = {}
    a_team_data = {}
    for hometeam, awayteam in match_list:
        fixture = org_df[(org_df['hometeam']==hometeam) & (org_df['awayteam']==awayteam)]
        home_row = _match_feature_row(hr, hsub, hometeam, awayteam,
                                      fixture[home_columns].values[0])
        away_row = _match_feature_row(ar, asub, awayteam, hometeam,
                                      fixture[away_columns].values[0])
        h_team_data.setdefault(hometeam, []).append((awayteam, home_row))
        # setdefault: an away side need not appear as a home team in `hr`.
        a_team_data.setdefault(awayteam, []).append((hometeam, away_row))

    for team, matches in h_team_data.items():
        final_dict[team]['HOME'] = _prematch_windows(matches, prematch)

    for team, matches in a_team_data.items():
        team_entry = final_dict.setdefault(team, {'HOME': {}, 'AWAY': {}})
        team_entry['AWAY'] = _prematch_windows(matches, prematch)

    return final_dict