In [336]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from GRUconfig import GRUconfig

import torch
import torch.nn as nn
from torch.nn import GRU, CrossEntropyLoss
from torch import optim
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
from GRUconfig import GRUconfig

In [337]:
import pandas as pd 
import numpy as np 
import json
import matplotlib.pyplot as plt 

In [338]:
with open('../data/team_map.json', 'r') as file:
    team_map_dict = json.load(file)
result_cols = ['result_A', 'result_D', 'result_H']

In [339]:
df = pd.read_csv('../data/preprocessed/layer1/matches_stats_data_1.csv')

In [340]:
df = df[[col for col in df if col not in result_cols]+[col for col in result_cols if col in df]]
df['season'] = df['season'].apply(lambda x:  int(x[2:4]) - 19)

In [341]:
df['result_H'] = df['result_H'].apply(lambda x: 1 if x else 0)
df['result_A'] = df['result_A'].apply(lambda x: 1 if x else 0)
df['result_D'] = df['result_D'].apply(lambda x: 1 if x else 0)

In [342]:
df.columns

Index(['hometeam', 'awayteam', 'date', 'season', 'round', 'gf', 'ga', 'h_xg',
       'h_xga', 'h_standard sot', 'h_kp', 'h_xa', 'h_poss_x',
       'h_touches att pen', 'h_carries prgdist', 'h_progressive passing dist',
       'h_tackles tklw', 'h_challenges tkl%', 'h_saves', 'a_xg', 'a_xga',
       'a_standard sot', 'a_kp', 'a_xa', 'a_poss_x', 'a_touches att pen',
       'a_carries prgdist', 'a_progressive passing dist', 'a_tackles tklw',
       'a_challenges tkl%', 'a_saves', 'b365h', 'b365d', 'b365a',
       'h_strength_overall_home', 'h_overall_rating', 'h_attack_rating',
       'h_midfield_rating', 'h_defence_rating', 'h_avg_age',
       'a_strength_overall_away', 'a_overall_rating', 'a_attack_rating',
       'a_midfield_rating', 'a_defence_rating', 'a_avg_age', 'soh', 'soa',
       'result_A', 'result_D', 'result_H'],
      dtype='object')

In [343]:
home_columns = ['result_H',
                'gf','ga', 
                'h_xg',
                'h_xga', 'h_standard sot', 'h_kp', 'h_xa', 'h_poss_x',
                'h_touches att pen', 'h_carries prgdist', 'h_progressive passing dist',
                'h_tackles tklw', 'h_challenges tkl%', 'h_saves',
                'h_strength_overall_home', 'h_overall_rating', 'h_attack_rating',
                'h_midfield_rating', 'h_defence_rating', 'h_avg_age','soh']
away_columns = ['result_A',
                'ga','gf',
                'a_xg', 
                'a_xga','a_standard sot', 'a_kp', 'a_xa', 'a_poss_x', 
                'a_touches att pen','a_carries prgdist', 'a_progressive passing dist', 
                'a_tackles tklw','a_challenges tkl%', 'a_saves',
                'a_strength_overall_away', 'a_overall_rating', 'a_attack_rating',
                'a_midfield_rating', 'a_defence_rating', 'a_avg_age', 'soa']
bet_columns = ['b365h', 'b365d', 'b365a']

In [344]:
df = df.drop(columns='date')
df

Unnamed: 0,hometeam,awayteam,season,round,gf,ga,h_xg,h_xga,h_standard sot,h_kp,...,a_overall_rating,a_attack_rating,a_midfield_rating,a_defence_rating,a_avg_age,soh,soa,result_A,result_D,result_H
0,CRY,SOU,1,1,1.0,0.0,0.157143,0.128571,0.1875,0.166667,...,0.250000,0.333333,0.3750,0.333333,0.562586,0.353518,0.349016,0,0,1
1,FUL,ARS,1,1,0.0,3.0,0.014286,0.271429,0.1250,0.100000,...,0.583333,0.666667,0.5000,0.533333,0.375516,0.353518,0.491616,1,0,0
2,LIV,LEE,1,1,4.0,3.0,0.385714,0.042857,0.2500,0.466667,...,0.250000,0.388889,0.3125,0.266667,0.425034,0.514222,0.415563,0,0,1
3,WHU,NEW,1,1,0.0,2.0,0.142857,0.228571,0.1875,0.366667,...,0.250000,0.333333,0.2500,0.200000,0.675378,0.494134,0.320495,1,0,0
4,TOT,EVE,1,1,0.0,1.0,0.157143,0.171429,0.3125,0.300000,...,0.500000,0.555556,0.5000,0.600000,0.662999,0.453958,0.377536,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,SOU,EVE,5,10,1.0,0.0,0.100000,0.228571,0.1250,0.300000,...,0.250000,0.277778,0.3125,0.333333,1.000000,0.270891,0.277075,0,0,1
1616,WOL,CRY,5,10,2.0,2.0,0.214286,0.342857,0.3750,0.300000,...,0.416667,0.388889,0.4375,0.400000,0.449794,0.817935,0.571101,0,1,0
1617,MUN,CHE,5,10,1.0,1.0,0.285714,0.157143,0.1875,0.233333,...,0.666667,0.555556,0.6250,0.533333,0.088033,0.515520,0.722152,0,1,0
1618,TOT,AVL,5,10,4.0,1.0,0.342857,0.257143,0.3750,0.333333,...,0.583333,0.777778,0.5625,0.600000,0.562586,0.559673,0.480735,0,0,1


In [345]:
def prepare_data(ssdf: pd.DataFrame, pre_match: int = 5) -> dict:
    teams_data = {}
    team_idx = sorted(df['hometeam'].unique())
    final = {}
    for idx in team_idx:
        teams_data[idx] = []
        final[idx] = {}
    for idx, match in ssdf.iterrows():
        hometeam = match['hometeam']
        awayteam = match['awayteam']
        
        tmp_h_data = match[['hometeam', 'awayteam']+home_columns+bet_columns]
        tmp_h_data['home'] = 1
        tmp_h_data['opponent'] = awayteam
        teams_data[hometeam].append(tmp_h_data.values)
        
        tmp_a_data = match[['hometeam', 'awayteam']+away_columns+bet_columns]
        tmp_a_data['home'] = 0
        tmp_a_data['opponent'] = hometeam
        teams_data[awayteam].append(tmp_a_data.values)
    
    for team,matches in teams_data.items():
        for idx in range(len(matches)-pre_match):
            pre_match_data = matches[idx:idx+pre_match]
            pre_match_data_key = pre_match_data[-1][0] + pre_match_data[-1][1]
            pre_match_data = np.vstack(pre_match_data)
            final[team][pre_match_data_key] = [pre_match_data[:, 2:-1], pre_match_data[-1][-1]]
    return final

In [346]:
"""_structure_
    [This team win: 1/0][goal score][goal scored][rate-bet x3][home] = 26
    -------------------- TEAM 1__________________------------------- = 21 -> 42 +[this team home| other team home][rate-bet x3] = 47
"""

'_structure_\n    [This team win: 1/0][goal score][goal scored][rate-bet x3][home] = 26\n    -------------------- TEAM 1__________________------------------- = 21 -> 42 +[this team home| other team home][rate-bet x3] = 47\n'

In [347]:
def create_X_y(final_dict: dict, seq: bool = False):
    checked_match = []
    X = np.empty((0,5,42)) if seq else np.empty((0,47))
    y = []
    data_dict = {}
    for _, match_dict in final_dict.items():
        for match_key, match_data in match_dict.items():
            match_dict = {}
            if match_key in checked_match:
                continue
            checked_match.append(match_key)
            team1_data = match_data[0]
            try:
                team2_data = final_dict[match_data[1]][match_key][0]
            except KeyError:

                continue
            
            team1_home = (team1_data[-1][-1] == 1)
            h,d,a = team1_data[-1][-4:-1].tolist()
            if team1_home:
                bet_rate = np.array([1.,0.,h,d,a])
            else:
                bet_rate = np.array([0.,1.,a,d,h])
            weights = np.array([0.25, 0.25, 0.25, 0.25])
            if not seq:
                team1_prematch = np.dot(weights, team1_data[:-1,1:-4]).reshape(-1)
                team2_prematch = np.dot(weights, team2_data[:-1,1:-4]).reshape(-1)
                prematch = np.concatenate((team1_prematch, team2_prematch))
                prematch = np.concatenate((prematch, bet_rate))
            else:
                team1_prematch =  team1_data[:-1,1:-4]
                team2_prematch = team2_data[:-1,1:-4]
                place_holder = np.zeros((1,42))
                place_holder[:,:5] = bet_rate
                prematch = np.concatenate((team1_prematch, team2_prematch), axis=1)
                prematch = np.concatenate((prematch, place_holder), axis=0).reshape(1,5,42)
            
            this_match_res = 0 if team1_data[-1][0] == 1 else 2 if team2_data[-1][0] == 1 else 0
            X = np.vstack((X, prematch))
            y.append(this_match_res)
            
            match_dict['x'] = prematch
            match_dict['y'] = this_match_res
            data_dict[match_key] = match_dict
    y = np.asarray(y)
    return data_dict, X, y

In [348]:
def handle_data(ss: int, prematch:int = 5, seq: bool = False):
    global df
    tmpss=ss-19
    sscheck = df[df['season']==tmpss]
    finalss = prepare_data(sscheck, prematch)
    _, X, y = create_X_y(finalss, seq)
    print(X.shape, y.shape)
    if seq:
        np.save(f'inputs{ss}_{ss+1}_seq.npy',X)
        np.save(f'outputs{ss}_{ss+1}_seq.npy',y)
    else:
        np.save(f'inputs{ss}_{ss+1}.npy',X)
        np.save(f'outputs{ss}_{ss+1}.npy',y)

In [349]:
handle_data(20)
handle_data(21)
handle_data(22)

(328, 47) (328,)
(330, 47) (330,)
(330, 47) (330,)


In [350]:
input_20_1 = np.load('inputs20_21.npy', allow_pickle=True)
y_20_1 = np.load('outputs20_21.npy', allow_pickle=True)
input_21_1 = np.load('inputs21_22.npy', allow_pickle=True)
y_21_1 = np.load('outputs21_22.npy', allow_pickle=True)
input_22_1_1 = np.load('inputs22_23.npy', allow_pickle=True)
y_22_1_1 = np.load('outputs22_23.npy', allow_pickle=True)

INPUTS = np.vstack((input_20_1,input_21_1))
INPUTS = np.vstack((INPUTS, input_22_1_1))
LABELS = np.concatenate((y_20_1,y_21_1))
LABELS = np.concatenate((LABELS, y_22_1_1))
train_X, test_X, train_y, test_y = train_test_split(INPUTS, LABELS, test_size = 0.3, shuffle = True)

In [351]:
svc_ovo = SVC(decision_function_shape='ovo')
svc_ovo.fit(train_X, train_y)
y_predict = svc_ovo.predict(test_X)
accuracy_score(y_predict, test_y)

0.734006734006734

In [352]:
rf_clf = RandomForestClassifier(criterion='entropy')
rf_clf.fit(train_X, train_y)
y_predict = rf_clf.predict(test_X)
accuracy_score(y_predict, test_y)

0.734006734006734

In [353]:
xGboost_clf = GradientBoostingClassifier()
xGboost_clf.fit(train_X, train_y)
y_predict = xGboost_clf.predict(test_X)
accuracy_score(y_predict, test_y)

0.7239057239057239

In [354]:
handle_data(23)
inputs = np.load('inputs23_24.npy', allow_pickle=True)
outputs = np.load('outputs23_24.npy', allow_pickle=True)

(329, 47) (329,)


In [355]:
y_predict = svc_ovo.predict(inputs)
accuracy_score(y_predict, outputs)

0.7142857142857143

In [356]:
# import pickle

# with open('svc_ovo.pkl','wb') as f:
#     pickle.dump(svc_ovo,f)

# # # load
# # with open('svc_model.pkl', 'rb') as f:
# #     clf2 = pickle.load(f)

# with open('rf.pkl','wb') as f:
#     pickle.dump(rf_clf, f)

# with open('xGboost.pkl','wb') as f:
#     pickle.dump(xGboost_clf, f)

In [357]:
handle_data(20, seq=True)
handle_data(21, seq=True)
handle_data(22, seq=True)

(328, 5, 42) (328,)
(330, 5, 42) (330,)
(330, 5, 42) (330,)


In [358]:
class GRU_model(nn.Module):
    def __init__(self, config: GRUconfig):
        super(GRU_model, self).__init__()
        self.layers = nn.ModuleList([
            nn.Linear(5, config.input_size),
            GRU(input_size=config.input_size, 
                hidden_size=config.hidden_size, 
                num_layers=config.num_layer, 
                batch_first=True, 
                dropout=config.drop_out_rate),
            nn.Linear(config.hidden_size, 3)  # Output layer: 3 classes
        ])
    def forward(self, X, Y: None):
        bet_vectors = X[:,-1,:5]
        X = X[:,:-1]
        bet_vectors = self.layers[0](bet_vectors).view(-1,1,42)
        X = torch.concat([X, bet_vectors], dim=1)
        _, hidden = self.layers[1](X)  # GRU Layer
        
        # Use the last hidden state from the GRU
        last_hidden = hidden[-1]  # Take the last layer's hidden state
        
        # Pass through the Fully Connected Layer
        out_logits = self.layers[2](last_hidden)  # Linear Layer
        loss = 0
        if Y is not None:
            logits = out_logits.contiguous()
            loss_fct = CrossEntropyLoss()
            logits = logits.view(-1, 3)
            targets = Y.view(-1)
            loss = loss_fct(logits, targets)
        return out_logits, loss

In [359]:
input_20_seq = np.load('inputs20_21_seq.npy', allow_pickle=True)
y_20_seq = np.load('outputs20_21_seq.npy', allow_pickle=True)
input_21_seq = np.load('inputs21_22_seq.npy', allow_pickle=True)
y_21_seq = np.load('outputs21_22_seq.npy', allow_pickle=True)
input_22_seq = np.load('inputs22_23_seq.npy', allow_pickle=True)
y_22_seq = np.load('outputs22_23_seq.npy', allow_pickle=True)

INPUTS = np.vstack((input_20_seq,input_21_seq))
INPUTS = np.vstack((INPUTS, input_21_seq))
LABELS = np.concatenate((y_20_seq,y_21_seq))
LABELS = np.concatenate((LABELS, y_22_seq))
train_X, test_X, train_y, test_y = train_test_split(INPUTS, LABELS, test_size = 0.3, shuffle = True)

train_X = train_X.astype(np.float32)
test_X  = test_X.astype(np.float32)
train_y = train_y
test_y = test_y

train_X = torch.from_numpy(train_X).to(device=device)
test_X = torch.from_numpy(test_X).to(device=device)
train_y = torch.from_numpy(train_y).to(dtype=torch.long, device=device)
test_y = torch.from_numpy(test_y).to(dtype=torch.long, device=device)

In [371]:
config = GRUconfig(input_size=train_X.size(-1), hidden_size=512, num_layer=4 ,drop_out_rate=0.2)
model = GRU_model(config=config).to(device=device)
optimizer = optim.AdamW(model.parameters(), lr=0.0004)
batch = 32
train_losses = []
test_losses = []
test_accu = []
for epoch in range(6):
    for i in range(train_X.shape[0] // batch):
        optimizer.zero_grad()
        out, loss = model.forward(train_X[i * batch:(i + 1) * batch], train_y[i * batch:(i + 1) * batch])
        test_out, test_loss = model.forward(test_X, test_y)
        test_out = torch.argmax(test_out, dim=1)
        correct = (test_out == test_y)
        accuracy = correct.float().mean()
        test_accu.append(accuracy)
        
        if i%10 == 0:
            print(f"epoch:{epoch}, iter:{i},loss_train: {loss.item():.4f}, loss_test: {test_loss.item():.4f}, accuracy: {accuracy}")
        loss.backward()
        optimizer.step()

epoch:0, iter:0,loss_train: 1.1039, loss_test: 1.1035, accuracy: 0.07744107395410538
epoch:0, iter:10,loss_train: 0.7003, loss_test: 0.6147, accuracy: 0.6936026811599731
epoch:0, iter:20,loss_train: 0.6805, loss_test: 0.6802, accuracy: 0.616161584854126
epoch:1, iter:0,loss_train: 0.7228, loss_test: 0.7171, accuracy: 0.32996633648872375
epoch:1, iter:10,loss_train: 0.6261, loss_test: 0.6094, accuracy: 0.6969696879386902
epoch:1, iter:20,loss_train: 0.7175, loss_test: 0.5937, accuracy: 0.7070707082748413
epoch:2, iter:0,loss_train: 0.5508, loss_test: 0.6087, accuracy: 0.6868686676025391
epoch:2, iter:10,loss_train: 0.6209, loss_test: 0.5862, accuracy: 0.7003366947174072
epoch:2, iter:20,loss_train: 0.7070, loss_test: 0.5861, accuracy: 0.6902356743812561
epoch:3, iter:0,loss_train: 0.5352, loss_test: 0.5891, accuracy: 0.6936026811599731
epoch:3, iter:10,loss_train: 0.6150, loss_test: 0.5774, accuracy: 0.6936026811599731
epoch:3, iter:20,loss_train: 0.6714, loss_test: 0.5852, accuracy: 0.

In [372]:
torch.save(model.state_dict(), 'GRU_Layer1')
# model_load = GRU_model(config)
# model_load.load_state_dict(torch.load('GRU_Layer1', weights_only=True))