In [83]:
import torch
import torch.nn as nn
from torch.nn import GRU, CrossEntropyLoss
from torch import optim
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
from GRUconfig import GRUconfig

In [84]:
import pandas as pd 
import numpy as np 
import json
import matplotlib.pyplot as plt 

In [85]:
# Read the team-abbreviation -> integer-id lookup table from JSON,
# then display it as the cell output.
with open('../data/team_map.json', mode='r') as team_map_file:
    team_map_dict = json.load(team_map_file)
team_map_dict

{'MCI': 0,
 'LIV': 1,
 'ARS': 2,
 'MUN': 3,
 'TOT': 4,
 'CHE': 5,
 'AVL': 6,
 'NEW': 7,
 'WHU': 8,
 'BHA': 9,
 'LEI': 10,
 'BRE': 11,
 'CRY': 12,
 'BOU': 13,
 'EVE': 14,
 'WOL': 15,
 'FUL': 16,
 'LEE': 17,
 'NFO': 18,
 'SOU': 19,
 'BUR': 20,
 'WBA': 21,
 'LUT': 22,
 'WAT': 23,
 'NOR': 24,
 'SHU': 25,
 'IPS': 26}

In [86]:
# Load the layer-1 per-match statistics table from CSV into a DataFrame.
data_layer_1 = pd.read_csv('../data/preprocessed/layer1/matches_stats_data.csv')

In [87]:
# Preview the first rows of the loaded table.
data_layer_1.head()

Unnamed: 0,hometeam,awayteam,date,season,round,gf,ga,h_form_gf,h_form_ga,h_form_xg,...,h_defence_rating,h_avg_age,a_strength_overall_away,a_overall_rating,a_attack_rating,a_midfield_rating,a_defence_rating,a_avg_age,soh,soa
0,CRY,SOU,2020-09-12,2020-2021,1,1.0,0.0,0.111111,0.0,0.157143,...,0.266667,0.900963,0.202899,0.25,0.333333,0.375,0.333333,0.562586,0.353518,0.349016
1,FUL,ARS,2020-09-12,2020-2021,1,0.0,3.0,0.0,0.333333,0.014286,...,0.2,0.337001,0.637681,0.583333,0.666667,0.5,0.533333,0.375516,0.353518,0.491616
2,LIV,LEE,2020-09-12,2020-2021,1,4.0,3.0,0.444444,0.333333,0.385714,...,0.6,0.537827,0.405797,0.25,0.388889,0.3125,0.266667,0.425034,0.514222,0.415563
3,WHU,NEW,2020-09-12,2020-2021,1,0.0,2.0,0.0,0.222222,0.142857,...,0.4,0.788171,0.115942,0.25,0.333333,0.25,0.2,0.675378,0.494134,0.320495
4,TOT,EVE,2020-09-13,2020-2021,1,0.0,1.0,0.0,0.111111,0.157143,...,0.666667,0.662999,0.289855,0.5,0.555556,0.5,0.6,0.662999,0.453958,0.377536


In [88]:
def sequence_data(X: pd.DataFrame, time_step = 1):
    """Group matches per team and cut each team's history into sliding windows.

    Every row of ``X`` is one match. The row (minus its first two entries,
    which are expected to be the ``hometeam`` and ``awayteam`` columns) is
    appended to the history of both the home team and the away team, in the
    row order of ``X``.

    Args:
        X: Match table whose first two columns are ``hometeam`` and
            ``awayteam``; the remaining columns are carried into the windows.
        time_step: Number of consecutive matches per window.

    Returns:
        dict mapping each team id to a list of windows, where each window is
        a list of ``time_step`` consecutive match rows (pandas Series).
    """
    # Collect teams from BOTH columns: a team that only ever plays away in the
    # given slice would otherwise have no bucket and raise KeyError below.
    all_teams = set(X['hometeam'].unique()) | set(X['awayteam'].unique())
    teams_data = {team: [] for team in sorted(all_teams)}

    for _, match in X.iterrows():
        # Positional slice: drops the two leading team-id entries.
        features = match[2:]
        teams_data[match['hometeam']].append(features)
        teams_data[match['awayteam']].append(features)

    # NOTE(review): range(len(matches) - time_step) yields one window fewer
    # than the number of full windows available, so each team's most recent
    # match never ends a window. Kept as-is so the generated datasets do not
    # change size; confirm whether the last window should be included.
    return {
        team: [matches[start:start + time_step]
               for start in range(len(matches) - time_step)]
        for team, matches in teams_data.items()
    }


In [89]:
# Replace team abbreviations with their integer ids from the lookup table
# (an abbreviation missing from the table raises KeyError).
for side in ('hometeam', 'awayteam'):
    data_layer_1[side] = data_layer_1[side].apply(lambda abbr: team_map_dict[abbr])

# Turn a season label into a small integer: characters 2-3 of the label
# minus 19, e.g. '2020-2021' -> 20 - 19 = 1.
data_layer_1['season'] = data_layer_1['season'].apply(lambda label: int(label[2:4]) - 19)

# Remove the goals-for / goals-against columns from the table.
data_layer_1 = data_layer_1.drop(['gf', 'ga'], axis=1)

In [90]:
# One-hot match-result columns. Other cells move these to the end of the
# frame, and the position of the set flag within this order is used as the
# class label.
result_cols = ['result_A', 'result_D', 'result_H']

In [91]:
def handle_data_ss(ss: int, lower: int = 4, upper: int = 38, part: int = 0):
    """Build and save the model inputs/labels for one slice of a season.

    Selects the rows of the global ``data_layer_1`` whose encoded season is
    ``ss - 19`` and whose round lies in ``(lower, upper]``, turns them into
    single-match windows with ``sequence_data``, de-duplicates the windows,
    derives a class label from the trailing one-hot result columns and writes
    both arrays to ``inputs_{ss}_{ss+1}_{part}.npy`` and
    ``outputs_{ss}_{ss+1}_{part}.npy`` in the working directory.

    Args:
        ss: Two-digit start year of the season (e.g. 20 for 2020-2021).
        lower: Exclusive lower bound on ``round``.
        upper: Inclusive upper bound on ``round``.
        part: Suffix used to tell apart several slices of the same season.

    Raises:
        ValueError: If a result column is missing from the data, or a sample
            has no result flag equal to 1.
    """
    in_season = data_layer_1['season'] == ss - 19
    in_rounds = (data_layer_1['round'] > lower) & (data_layer_1['round'] <= upper)
    df = data_layer_1[in_season & in_rounds].drop(columns=['date', 'season', 'round'])

    # The labels are read from the trailing columns, so every result column
    # must be present; otherwise a feature would be mistaken for a label.
    missing = [col for col in result_cols if col not in df]
    if missing:
        raise ValueError(f"result columns missing from data: {missing}")
    df = df[[col for col in df if col not in result_cols] + list(result_cols)]

    n_labels = len(result_cols)
    # sequence_data drops the two leading team-id columns from every row.
    n_columns = df.shape[1] - 2

    x_seq = sequence_data(df)
    # Stack all teams' windows once; skip teams that produced no window.
    windows = [np.asarray(val) for val in x_seq.values() if len(val) > 0]
    if windows:
        input_data = np.vstack(windows)
    else:
        input_data = np.empty((0, 1, n_columns))
    input_data = input_data.astype(np.float32)
    # Each match is listed under both of its teams; keep one copy.
    input_data = np.unique(input_data, axis=0)

    # Label = position of the set flag among the trailing result columns of
    # the last match in the window (list.index raises ValueError if none is 1).
    y = np.asarray([
        list(map(int, sample[-1][-n_labels:])).index(1)
        for sample in input_data
    ])

    # Strip the result columns so the labels do not leak into the features.
    label_positions = list(range(n_columns - n_labels, n_columns))
    input_data = np.delete(arr=input_data, obj=label_positions, axis=2)
    print(input_data.shape, y.shape)

    np.save(f'inputs_{ss}_{ss+1}_{part}.npy', input_data)
    np.save(f'outputs_{ss}_{ss+1}_{part}.npy', y)


In [92]:
# Generate the saved arrays: two seasons with the default round range,
# then season 22 split into two round ranges (parts 1 and 2).
handle_data_ss(ss=20)
handle_data_ss(ss=21)
handle_data_ss(ss=22, lower=5, upper=24, part=1)
handle_data_ss(ss=22, lower=24, upper=38, part=2)

(330, 1, 45) (330,)
(330, 1, 45) (330,)
(183, 1, 45) (183,)
(130, 1, 45) (130,)


In [93]:
from sklearn.model_selection import train_test_split

In [94]:
# Fixed seed so re-running the notebook reproduces the same train/test split.
SPLIT_SEED = 42

# File names follow the pattern written by handle_data_ss:
# inputs_{ss}_{ss+1}_{part}.npy / outputs_{ss}_{ss+1}_{part}.npy.
# The arrays are plain numeric, so pickle support is not needed to load them.
input_20_1 = np.load('inputs_20_21_0.npy')
y_20_1 = np.load('outputs_20_21_0.npy')
input_21_1 = np.load('inputs_21_22_0.npy')
y_21_1 = np.load('outputs_21_22_0.npy')
input_22_1_1 = np.load('inputs_22_23_1.npy')
y_22_1_1 = np.load('outputs_22_23_1.npy')

# Pool the three slices into one dataset.
INPUTS = np.vstack((input_20_1, input_21_1, input_22_1_1))
LABELS = np.concatenate((y_20_1, y_21_1, y_22_1_1))
train_X, test_X, train_y, test_y = train_test_split(
    INPUTS, LABELS, test_size=0.3, shuffle=True, random_state=SPLIT_SEED
)

# Move everything to the selected device; labels must be int64 for
# CrossEntropyLoss.
train_X = torch.from_numpy(train_X).to(device=device)
test_X = torch.from_numpy(test_X).to(device=device)
train_y = torch.from_numpy(train_y).to(dtype=torch.long, device=device)
test_y = torch.from_numpy(test_y).to(dtype=torch.long, device=device)

In [95]:
# Size of the last dimension of the training inputs; passed to the model
# config as input_size in a later cell.
train_X.size(-1)

45

In [96]:
class GRU_model(nn.Module):
    """GRU encoder followed by a linear classifier over three match results.

    The config object must expose ``input_size``, ``hidden_size``,
    ``num_layer`` and ``drop_out_rate``.
    """

    # Number of output classes (one logit per result).
    NUM_CLASSES = 3

    def __init__(self, config: "GRUconfig"):
        super().__init__()
        # Kept as a two-entry ModuleList (GRU, Linear) so parameter names stay
        # `layers.0.*` / `layers.1.*` and existing checkpoints still load.
        self.layers = nn.ModuleList([
            GRU(input_size=config.input_size,
                hidden_size=config.hidden_size,
                num_layers=config.num_layer,
                batch_first=True,
                dropout=config.drop_out_rate),
            nn.Linear(config.hidden_size, self.NUM_CLASSES),
        ])

    def forward(self, X, Y=None):
        """Return ``(logits, loss)`` for a batch.

        Args:
            X: Input batch laid out batch-first, i.e. (batch, seq, features).
            Y: Optional class indices. When omitted, no loss is computed.

        Returns:
            Tuple of the logits with shape (batch, NUM_CLASSES) and the
            cross-entropy loss tensor, or ``0`` for the loss when ``Y`` is
            ``None``.
        """
        _, hidden = self.layers[0](X)

        # hidden stacks the final state of every GRU layer; use the top one.
        last_hidden = hidden[-1]
        out_logits = self.layers[1](last_hidden)

        loss = 0
        if Y is not None:
            loss = CrossEntropyLoss()(
                out_logits.reshape(-1, self.NUM_CLASSES), Y.reshape(-1)
            )
        return out_logits, loss

In [97]:
config = GRUconfig(input_size=train_X.size(-1), hidden_size=512, num_layer=3, drop_out_rate=0.2)
model = GRU_model(config=config).to(device=device)
optimizer = optim.AdamW(model.parameters(), lr=0.0004)
batch = 32
# NOTE(review): training stops as soon as accuracy on test_X exceeds this
# value, so the test split takes part in model selection; a separate
# validation split would give an unbiased stopping criterion.
target_accuracy = 0.60
train_losses = []
test_losses = []
test_accu = []
reached_target = False

for epoch in range(20):
    # Only full batches are used; a trailing partial batch is skipped.
    for i in range(train_X.shape[0] // batch):
        # Score the current weights on the test split with dropout disabled
        # and without building an autograd graph.
        model.eval()
        with torch.no_grad():
            test_out, test_loss = model(test_X, test_y)
        test_out = torch.argmax(test_out, dim=1)
        correct = (test_out == test_y)
        accuracy = correct.float().mean().item()

        # Forward pass on the training batch with the same weights.
        model.train()
        optimizer.zero_grad()
        out, loss = model(train_X[i * batch:(i + 1) * batch], train_y[i * batch:(i + 1) * batch])

        # Keep plain floats so the histories can be plotted directly.
        train_losses.append(loss.item())
        test_losses.append(test_loss.item())
        test_accu.append(accuracy)

        if accuracy > target_accuracy:
            print(f"Accuracy exceeded 60% at epoch {epoch}, iter {i}. Stopping training.")
            reached_target = True
            break

        if i % 10 == 0:
            print(f"epoch: {epoch}, iter: {i}, loss_train: {loss.item():.4f}, loss_test: {test_loss.item():.4f}, accuracy: {accuracy}")

        loss.backward()
        optimizer.step()

    if reached_target:
        break


epoch: 0, iter: 0, loss_train: 1.0990, loss_test: 1.0974, accuracy: 0.3754940629005432
epoch: 0, iter: 10, loss_train: 1.0826, loss_test: 1.0878, accuracy: 0.35573121905326843
epoch: 1, iter: 0, loss_train: 1.0980, loss_test: 1.0929, accuracy: 0.3754940629005432
epoch: 1, iter: 10, loss_train: 1.0674, loss_test: 1.0739, accuracy: 0.45059287548065186
epoch: 2, iter: 0, loss_train: 1.0890, loss_test: 1.0555, accuracy: 0.4901185631752014
epoch: 2, iter: 10, loss_train: 1.0458, loss_test: 1.0208, accuracy: 0.5256916880607605
epoch: 3, iter: 0, loss_train: 1.1726, loss_test: 1.0327, accuracy: 0.5059288740158081
epoch: 3, iter: 10, loss_train: 1.0549, loss_test: 1.0120, accuracy: 0.5177865624427795
epoch: 4, iter: 0, loss_train: 1.1354, loss_test: 1.0317, accuracy: 0.49802371859550476
epoch: 4, iter: 10, loss_train: 1.0328, loss_test: 1.0083, accuracy: 0.52173912525177
epoch: 5, iter: 0, loss_train: 1.1167, loss_test: 1.0025, accuracy: 0.5256916880607605
epoch: 5, iter: 10, loss_train: 1.022

!! =====


In [98]:
# Hold-out slice (season 22, part 2). File names follow the pattern written
# by handle_data_ss: inputs_{ss}_{ss+1}_{part}.npy / outputs_{ss}_{ss+1}_{part}.npy.
true_test_X = torch.from_numpy(np.load('inputs_22_23_2.npy').astype(np.float32)).to(device=device)
# Labels are class indices; convert straight to int64 without a float detour.
true_test_Y = torch.from_numpy(np.load('outputs_22_23_2.npy')).to(device=device, dtype=torch.long)

In [99]:
# Evaluate on the hold-out slice with dropout disabled and autograd off, so
# the reported accuracy is deterministic for the trained weights.
model.eval()
with torch.no_grad():
    test_out, test_loss = model(true_test_X, true_test_Y)
test_out = torch.argmax(test_out, dim=1)
correct = (test_out == true_test_Y)
accuracy = correct.float().mean()
accuracy

tensor(0.6154)

In [100]:
# Persist only the model's parameters (state_dict) to the file 'GRU_Layer1'.
torch.save(model.state_dict(), 'GRU_Layer1')

In [101]:
# Rebuild the architecture and restore the saved parameters.
# map_location='cpu' lets a checkpoint saved from an accelerator device be
# deserialized on a machine without it; load_state_dict then copies the
# values into model_load's own parameters.
model_load = GRU_model(config)
model_load.load_state_dict(torch.load('GRU_Layer1', map_location='cpu', weights_only=True))


<All keys matched successfully>

In [102]:
# Load the second layer-1 match statistics CSV into a DataFrame.
new_dfl1 = pd.read_csv('../data/preprocessed/layer1/matches_stats_data_1.csv')

In [103]:
# Reorder columns so the result columns that are present come last,
# in the order given by result_cols.
leading_cols = [col for col in new_dfl1 if col not in result_cols]
trailing_result_cols = [col for col in result_cols if col in new_dfl1]
new_dfl1 = new_dfl1[leading_cols + trailing_result_cols]

In [104]:
# Map each result flag to the integer 1 when truthy and 0 otherwise.
for result_col in ('result_H', 'result_A', 'result_D'):
    new_dfl1[result_col] = new_dfl1[result_col].apply(lambda flag: 1 if flag else 0)

In [105]:
# Display the reordered table as the cell output.
new_dfl1

Unnamed: 0,hometeam,awayteam,date,season,round,gf,ga,h_form_gf,h_form_ga,h_form_xg,...,a_overall_rating,a_attack_rating,a_midfield_rating,a_defence_rating,a_avg_age,soh,soa,result_A,result_D,result_H
0,CRY,SOU,2020-09-12,2020-2021,1,1.0,0.0,0.111111,0.000000,0.157143,...,0.250000,0.333333,0.3750,0.333333,0.562586,0.353518,0.349016,0,0,1
1,FUL,ARS,2020-09-12,2020-2021,1,0.0,3.0,0.000000,0.333333,0.014286,...,0.583333,0.666667,0.5000,0.533333,0.375516,0.353518,0.491616,1,0,0
2,LIV,LEE,2020-09-12,2020-2021,1,4.0,3.0,0.444444,0.333333,0.385714,...,0.250000,0.388889,0.3125,0.266667,0.425034,0.514222,0.415563,0,0,1
3,WHU,NEW,2020-09-12,2020-2021,1,0.0,2.0,0.000000,0.222222,0.142857,...,0.250000,0.333333,0.2500,0.200000,0.675378,0.494134,0.320495,1,0,0
4,TOT,EVE,2020-09-13,2020-2021,1,0.0,1.0,0.000000,0.111111,0.157143,...,0.500000,0.555556,0.5000,0.600000,0.662999,0.453958,0.377536,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,SOU,EVE,2024-11-02,2024-2025,10,1.0,0.0,0.100000,0.244444,0.127143,...,0.250000,0.277778,0.3125,0.333333,1.000000,0.270891,0.277075,0,0,1
1616,WOL,CRY,2024-11-02,2024-2025,10,2.0,2.0,0.200000,0.288889,0.145714,...,0.416667,0.388889,0.4375,0.400000,0.449794,0.817935,0.571101,0,1,0
1617,MUN,CHE,2024-11-03,2024-2025,10,1.0,1.0,0.111111,0.155556,0.218571,...,0.666667,0.555556,0.6250,0.533333,0.088033,0.515520,0.722152,0,1,0
1618,TOT,AVL,2024-11-03,2024-2025,10,4.0,1.0,0.211111,0.144444,0.221429,...,0.583333,0.777778,0.5625,0.600000,0.562586,0.559673,0.480735,0,0,1


In [106]:
# List the column labels of the table.
new_dfl1.columns

Index(['hometeam', 'awayteam', 'date', 'season', 'round', 'gf', 'ga',
       'h_form_gf', 'h_form_ga', 'h_form_xg', 'h_form_xga',
       'h_form_standard sot', 'h_form_kp', 'h_form_xa', 'h_form_poss_x',
       'h_form_touches att pen', 'h_form_carries prgdist',
       'h_form_progressive passing dist', 'h_form_tackles tklw',
       'h_form_challenges tkl%', 'h_form_saves', 'a_form_gf', 'a_form_ga',
       'a_form_xg', 'a_form_xga', 'a_form_standard sot', 'a_form_kp',
       'a_form_xa', 'a_form_poss_x', 'a_form_touches att pen',
       'a_form_carries prgdist', 'a_form_progressive passing dist',
       'a_form_tackles tklw', 'a_form_challenges tkl%', 'a_form_saves',
       'b365h', 'b365d', 'b365a', 'h_strength_overall_home',
       'h_overall_rating', 'h_attack_rating', 'h_midfield_rating',
       'h_defence_rating', 'h_avg_age', 'a_strength_overall_away',
       'a_overall_rating', 'a_attack_rating', 'a_midfield_rating',
       'a_defence_rating', 'a_avg_age', 'soh', 'soa', 'result_A

In [107]:
# Column names describing the home side of a match.
home_columns = [
    'h_form_gf',
    'h_form_ga',
    'h_form_xg',
    'h_form_xga',
    'h_form_standard sot',
    'h_form_kp',
    'h_form_xa',
    'h_form_poss_x',
    'h_form_touches att pen',
    'h_form_carries prgdist',
    'h_form_progressive passing dist',
    'h_form_tackles tklw',
    'h_form_challenges tkl%',
    'h_form_saves',
    'soh',
    'h_strength_overall_home',
    'h_overall_rating',
    'h_attack_rating',
    'h_midfield_rating',
    'h_defence_rating',
    'h_avg_age',
]

In [108]:
# Column names describing the away side of a match.
away_columns = [
    'a_form_gf',
    'a_form_ga',
    'a_form_xg',
    'a_form_xga',
    'a_form_standard sot',
    'a_form_kp',
    'a_form_xa',
    'a_form_poss_x',
    'a_form_touches att pen',
    'a_form_carries prgdist',
    'a_form_progressive passing dist',
    'a_form_tackles tklw',
    'a_form_challenges tkl%',
    'a_form_saves',
    'soa',
    'a_strength_overall_away',
    'a_overall_rating',
    'a_attack_rating',
    'a_midfield_rating',
    'a_defence_rating',
    'a_avg_age',
]