In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import re
import datetime

In [None]:
def clean_few_teams(X, min_matches=10):
    """Keep only the rows whose ``team_1`` occurs often enough in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``team_1`` column.
    min_matches : int, default 10
        Minimum number of rows a ``team_1`` value must have to be kept.
        The default reproduces the previous hard-coded ``count > 9`` rule.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh ``0..n-1`` index; ``X`` itself is
        left untouched.
    """
    appearances = X["team_1"].value_counts()
    frequent_teams = appearances[appearances >= min_matches].index
    keep = X["team_1"].isin(frequent_teams)
    return X.loc[keep, :].reset_index(drop=True)

def encode_teams(X):
    """Fit a one-hot encoder on the ``team_1`` / ``team_2`` columns of ``X``.

    The encoder is created with ``handle_unknown='ignore'`` and the fitted
    encoder object is returned.
    """
    team_pairs = X[['team_1', 'team_2']].to_numpy()
    return OneHotEncoder(handle_unknown='ignore').fit(team_pairs)


def encode_players(X):
    """Fit a one-hot encoder on every player name found in ``X.players1``.

    Each ``players1`` entry is a whitespace-separated roster string; the
    distinct names across all rows form the encoder's single-column
    vocabulary. The encoder is created with ``handle_unknown='ignore'``.

    NOTE(review): only ``players1`` contributes names; a player who appears
    solely in ``players2`` is not part of the vocabulary. Confirm this is
    intended.
    """
    names = []
    for roster in X['players1']:
        names.extend(roster.split())
    vocabulary = np.unique(np.array(names)).reshape(-1, 1)
    return OneHotEncoder(handle_unknown='ignore').fit(vocabulary)


def encode_maps(X):
    """Fit a one-hot encoder on the ``Map`` column of ``X``.

    The column is passed to the encoder as a single-feature 2-D array and
    the encoder is created with ``handle_unknown='ignore'``. The fitted
    encoder is returned.
    """
    # Selecting with a one-element list yields the (n_rows, 1) layout directly.
    map_column = X[['Map']].to_numpy()
    return OneHotEncoder(handle_unknown='ignore').fit(map_column)

In [None]:
# Load the match history that the encoders below are fitted on.
df = pd.read_csv(r'mathes_for_a_5_years.csv')

# Rosters are stored as stringified lists; strip the brackets, quotes and
# commas so each roster becomes a whitespace-separated string of names.
df = df.assign(**{
    roster_col: df[roster_col].apply(
        lambda raw: raw.strip("[]").replace("'", "").replace(",", "")
    )
    for roster_col in ('players1', 'players2')
})

# Drop rarely seen teams, then fit one encoder per categorical feature.
df = clean_few_teams(df)
enc_teams, enc_players, enc_maps = encode_teams(df), encode_players(df), encode_maps(df)

In [None]:
import io
import os
import torchtext
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from functools import reduce
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
import torch

In [None]:
class CSGOMatchesDataset(Dataset):
    """Torch ``Dataset`` serving one encoded match per item.

    Each item is a dict ``{'text': features, 'label': score}`` where
    ``features`` is a dense ``(1, n_features)`` array laid out as
    ``[scaled date | teams one-hot | team-1 players | team-2 players |
    map one-hot | team ranks]``.

    Parameters
    ----------
    path : str
        CSV file with the matches. The code reads the columns ``team_1``,
        ``team_2``, ``team_rank_1``, ``team_rank_2``, ``players1``,
        ``players2``, ``Map``, ``date`` and ``score``.
    enc_teams, enc_players, enc_maps
        Already fitted encoders exposing ``transform``.
    scaler : optional
        Already fitted scaler exposing ``transform``, used for the date
        feature. When omitted (the previous behaviour) a ``StandardScaler``
        is fitted on this dataset's own dates. Pass the training dataset's
        ``sc`` when building an evaluation dataset so both splits share the
        same scaling.
    """

    def __init__(self, path, enc_teams, enc_players, enc_maps, scaler=None):
        df = pd.read_csv(path)

        # Keep matches in which at least one side is ranked 50 or better.
        # .copy() makes the slice an independent frame before it is edited.
        df = df[(df['team_rank_1'] <= 50) | (df['team_rank_2'] <= 50)].copy()

        for roster_col in ('players1', 'players2'):
            df[roster_col] = df[roster_col].apply(self._normalize_roster)
        df = self.clean_few_teams(df)

        self.enc_teams = enc_teams
        self.enc_players = enc_players
        self.enc_maps = enc_maps

        # Kept as a public attribute for backward compatibility.
        self.players = ['player1_team1', 'player2_team1', 'player3_team1',
                        'player4_team1', 'player5_team1', 'player1_team2',
                        'player2_team2', 'player3_team2', 'player4_team2',
                        'player5_team2']

        df = self.columns_of_players(df)
        df['date'] = pd.to_datetime(df['date'])

        self.labels = df.loc[:, 'score']
        self.texts = df.drop('score', axis=1)
        self.n_examples = len(self.labels)

        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(self._days_since_epoch(df['date']).to_numpy().reshape(-1, 1))
        self.sc = scaler

    @staticmethod
    def _normalize_roster(raw):
        """Turn a stringified list of names into a space-separated string."""
        return raw.strip("[]").replace("'", "").replace(",", "")

    @staticmethod
    def _days_since_epoch(dates):
        """Whole days between ``dates`` (Timestamp or Series) and 2016-01-01."""
        return (dates - pd.Timestamp("2016-01-01")) // pd.Timedelta('1d')

    @staticmethod
    def _to_dense(block):
        """Return ``block`` as a dense array (sparse inputs are densified)."""
        return block.toarray() if hasattr(block, 'toarray') else np.asarray(block)

    def columns_of_players(self, X):
        """Replace the roster strings by one multi-hot vector per team.

        The first five names of ``players1`` / ``players2`` are each encoded
        with ``enc_players`` and summed, giving the list-valued columns
        ``players_team1`` / ``players_team2``. A new frame is returned; the
        roster columns are not part of it.

        Raises
        ------
        IndexError
            If a roster holds fewer than five names.
        """
        encoded = {}
        for side in (1, 2):
            # Split every roster once instead of once per player slot.
            rosters = X[f'players{side}'].apply(lambda roster: roster.split())
            slot_vectors = [
                self.enc_players.transform(
                    rosters.apply(lambda names: names[slot]).to_numpy().reshape(-1, 1)
                ).astype(int)
                for slot in range(5)
            ]
            team_vector = reduce(lambda acc, nxt: acc + nxt, slot_vectors)
            encoded[f'players_team{side}'] = team_vector.toarray().tolist()

        out = X.drop(['players1', 'players2'], axis=1)
        for column, vectors in encoded.items():
            out[column] = vectors
        return out

    def clean_few_teams(self, X, min_matches=10):
        """Keep rows whose ``team_1`` occurs at least ``min_matches`` times.

        The default matches the previous hard-coded ``count > 9`` rule. The
        result has a fresh ``0..n-1`` index.
        """
        appearances = X["team_1"].value_counts()
        frequent_teams = appearances[appearances >= min_matches].index
        return X.loc[X["team_1"].isin(frequent_teams), :].reset_index(drop=True)

    def __len__(self):
        """Number of matches in the dataset."""
        return self.n_examples

    def encode_teams(self, X):
        """Refit ``self.enc_teams`` on the ``team_1`` / ``team_2`` columns of ``X``."""
        self.enc_teams = OneHotEncoder(handle_unknown='ignore')
        self.enc_teams = self.enc_teams.fit(X[['team_1', 'team_2']].to_numpy())

    def encode_players(self, X):
        """Refit ``self.enc_players`` on the names found in ``X.players1``."""
        players = np.unique(np.array([player for players in X.players1 for player in players.split()]))
        self.enc_players = OneHotEncoder(handle_unknown='ignore')
        self.enc_players = self.enc_players.fit(players.reshape(-1, 1))

    def encode_maps(self, X):
        """Refit ``self.enc_maps`` on the ``Map`` column of ``X``."""
        self.enc_maps = OneHotEncoder(handle_unknown='ignore')
        self.enc_maps = self.enc_maps.fit(X['Map'].to_numpy().reshape(-1, 1))

    def OneHotEncode(self, X, item):
        """Build the dense ``(1, n_features)`` feature row for position ``item`` of ``X``."""
        row = X.iloc[item]  # looked up once and reused for every feature

        teams = self.enc_teams.transform([row[['team_1', 'team_2']].to_numpy()]).astype(int)
        # Explicit 2-D rows so the stacking never depends on how a flat
        # Python list would be interpreted.
        players_first_team = np.asarray(row['players_team1']).reshape(1, -1)
        players_second_team = np.asarray(row['players_team2']).reshape(1, -1)
        Map = self.enc_maps.transform([[row['Map']]]).astype(int)
        ranks = np.array([row[['team_rank_1', 'team_rank_2']].to_numpy()]).astype(int)
        times = self.sc.transform(np.array([[self._days_since_epoch(row['date'])]]).reshape(-1, 1))

        blocks = (times, teams, players_first_team, players_second_team, Map, ranks)
        return np.hstack([self._to_dense(block) for block in blocks])

    def __getitem__(self, item):
        """Return ``{'text': features, 'label': score}`` for position ``item``."""
        vecs = self.OneHotEncode(self.texts, item)
        # Positional lookup, consistent with the ``iloc`` used for the features.
        return {'text': vecs, 'label': self.labels.iloc[item]}

In [None]:
# Number of matches per mini-batch, shared by both loaders.
batch_size = 1000

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Both splits reuse the encoders fitted earlier in the notebook.
test_dataset = CSGOMatchesDataset(
    r'test_mathes.csv',
    enc_teams=enc_teams,
    enc_players=enc_players,
    enc_maps=enc_maps,
)
train_dataset = CSGOMatchesDataset(
    r'train_matches.csv',
    enc_teams=enc_teams,
    enc_players=enc_players,
    enc_maps=enc_maps,
)

In [None]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation keeps a fixed order: the test metrics are averaged per batch, so
# shuffling would change the batch composition (and the reported numbers)
# from one evaluation to the next.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Report how many mini-batches each loader yields per pass.
print(
    f'Created `train_dataloader` with {len(train_dataloader)} batches!',
    f'Created `test_dataloader` with {len(test_dataloader)} batches!',
    sep='\n',
)

Created `train_dataloader` with 63 batches!
Created `test_dataloader` with 3 batches!


In [None]:
import torch.nn as nn
class LinearModel(nn.Module):
    """Fully connected regressor with skip connections.

    The input is narrowed 1000 -> 500 -> 100 -> 50 features. At the 100 and
    50 widths five blocks are chained, and from the third block on each
    block's output is added to the output of the block two steps earlier.
    A final linear layer maps the 50 features to a single value.

    Parameters
    ----------
    initial_size : int
        Number of input features.
    teams_size, players_size, maps_size : int
        Input widths of the ``emb_teams`` / ``emb_players`` / ``emb_map``
        layers. Those layers are created (so the parameter set and
        ``state_dict`` keys stay the same) but ``forward`` does not use them.

    Notes
    -----
    ``forward`` previously applied ``layer_100_2``, ``layer_50_2`` and
    ``layer_50_3`` twice and never used ``layer_100_4``, ``layer_50_4`` and
    ``layer_50_5``. Every block is now applied exactly once; checkpoints
    written before this fix contain untrained weights for the three blocks
    that used to be skipped.
    """

    def __init__(self, initial_size, teams_size, players_size, maps_size):
        super().__init__()

        self.teams_size, self.players_size, self.maps_size = teams_size, players_size, maps_size
        self.initial_size = initial_size

        # Creation order is unchanged so seeded initialisation is unaffected.
        self.layer_1000_1 = self._dense_block(self.initial_size, 1000)
        self.layer_500_1 = self._dense_block(1000, 500)

        self.layer_100_1 = self._dense_block(500, 100)
        self.layer_100_2 = self._dense_block(100, 100)
        self.layer_100_3 = self._dense_block(100, 100)
        self.layer_100_4 = self._dense_block(100, 100)
        self.layer_100_5 = self._dense_block(100, 100)

        self.layer_50_1 = self._dense_block(100, 50)
        self.layer_50_2 = self._dense_block(50, 50)
        self.layer_50_3 = self._dense_block(50, 50)
        self.layer_50_4 = self._dense_block(50, 50)
        # The last block carries one extra BatchNorm/Linear/ReLU stage.
        self.layer_50_5 = nn.Sequential(
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.BatchNorm1d(50, affine=False),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.BatchNorm1d(50, affine=False),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.Dropout(p=0.3),
        )

        self.emb_map = nn.Linear(self.maps_size, 10)
        self.emb_teams = nn.Linear(self.teams_size, 300)
        self.emb_players = nn.Linear(self.players_size, 512)
        self.lin = nn.Linear(50, 1)

    @staticmethod
    def _dense_block(in_features, out_features):
        """Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> Dropout(0.3)."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.ReLU(),
            nn.BatchNorm1d(out_features, affine=False),
            nn.Linear(out_features, out_features),
            nn.ReLU(),
            nn.Dropout(p=0.3),
        )

    def forward(self, x0):
        """Map a ``(batch, initial_size)`` tensor to ``(batch, 1)`` predictions."""
        x_1000_1 = self.layer_1000_1(x0)

        x_500_1 = self.layer_500_1(x_1000_1)

        x_100_1 = self.layer_100_1(x_500_1)
        x_100_2 = self.layer_100_2(x_100_1)
        x_100_3 = self.layer_100_3(x_100_2) + x_100_1
        x_100_4 = self.layer_100_4(x_100_3) + x_100_2
        x_100_5 = self.layer_100_5(x_100_4) + x_100_3

        x_50_1 = self.layer_50_1(x_100_5)
        x_50_2 = self.layer_50_2(x_50_1)
        x_50_3 = self.layer_50_3(x_50_2) + x_50_1
        x_50_4 = self.layer_50_4(x_50_3) + x_50_2
        x_50_5 = self.layer_50_5(x_50_4) + x_50_3

        return self.lin(x_50_5)

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
# Feature widths, probed from the first training example.
vector_size = train_dataset[0]['text'].shape[1]
teams_size = enc_teams.transform(
    [train_dataset.texts[['team_1', 'team_2']].iloc[0].to_numpy()]
).shape[1]
players_size = len(train_dataset.texts['players_team1'].iloc[0])
# NOTE(review): counts the maps present in the training split, which may
# differ from the width produced by `enc_maps` - confirm which one is meant.
maps_size = train_dataset.texts['Map'].nunique(dropna=False)

# Optimisation hyper-parameters.
lr = 1e-3
num_epochs = 10

model = LinearModel(vector_size, teams_size, players_size, maps_size).to(device)

# Huber loss is the active criterion; CrossEntropyLoss, MSELoss and
# BCEWithLogitsLoss were the alternatives previously left commented out here.
criterion = nn.HuberLoss()
optimizer = AdamW(model.parameters(), lr=lr)
# The learning rate is multiplied by 0.9 on every scheduler step.
scheduler = ExponentialLR(optimizer, gamma=0.9)



In [None]:
from tqdm.notebook import tqdm
from sklearn.metrics import precision_score, recall_score

def training(model, optimizer, criterion, train_loader, epoch, device, scheduler):
    """Run one optimisation pass over ``train_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimise; it is switched to training mode.
    optimizer, criterion, scheduler
        Optimiser, loss function and learning-rate scheduler. The scheduler
        is stepped once, after the last batch.
    train_loader : iterable
        Yields dict batches with a ``"text"`` and a ``"label"`` tensor.
    epoch : int
        Zero-based epoch number; only used for the progress-bar label.
    device : str or torch.device
        Device the batches are moved to.
    """
    # Use the `epoch` argument rather than a notebook-level loop variable, so
    # the function does not depend on hidden global state.
    epoch_label = f"Epoch {epoch + 1}."
    pbar = tqdm(train_loader, desc=f"{epoch_label} Train Loss: {0}")
    model.train()

    for batch in pbar:
        # Drop only axis 1 (the per-item row axis of the collated batch); a
        # bare squeeze() would also remove the batch axis of a one-row batch.
        features = batch["text"].to(device).type(torch.float).squeeze(1)
        targets = batch["label"].to(device).type(torch.float).unsqueeze(1)

        optimizer.zero_grad()

        predict = model(features)
        loss = criterion(predict, targets)
        loss.backward()
        optimizer.step()

        pbar.set_description(f"{epoch_label} Train Loss: {loss.item():.4}")
    scheduler.step()
    

def testing(model, criterion, test_loader, device="cpu"):
    pbar = tqdm(test_loader, desc=f"Test Loss: {0}, Test Acc: {0}")
    
    mean_loss = 0
    mean_acc = 0
    mean_pr = 0
    mean_re = 0
    mean_acc_60 = 0
    mean_acc_70 = 0
    mean_acc_65 = 0
    lenth_60 = 0
    lenth_70 = 0
    lenth_65 = 0
    all_lenth = 0

    model.eval()
    len_batches = len(test_loader)
    with torch.no_grad():

        for i, batch in enumerate(pbar):
            features = batch["text"].to(device).type(torch.float).squeeze()
            targets = batch["label"].to(device).type(torch.float).unsqueeze(1)

            predict = model(features)

            loss = criterion(predict, targets)
            predict_acc_60 = predict[(predict > 0.6)|(predict < 0.4)]
            predict_acc_60[predict_acc_60 > 0.6] = 1
            predict_acc_60[predict_acc_60 <= 0.6] = 0

            targets_acc_60 = targets[(predict > 0.6)|(predict < 0.4)]
            targets_acc_60[targets_acc_60 > 0.5] = 1
            targets_acc_60[targets_acc_60 <= 0.5] = 0

            predict_acc_70 = predict[(predict > 0.7)|(predict < 0.3)]
            predict_acc_70[predict_acc_70 > 0.7] = 1
            predict_acc_70[predict_acc_70 <= 0.7] = 0

            targets_acc_70 = targets[(predict > 0.7)|(predict < 0.3)]
            targets_acc_70[targets_acc_70 > 0.5] = 1
            targets_acc_70[targets_acc_70 <= 0.5] = 0

            predict_acc_65 = predict[(predict > 0.65)|(predict < 0.35)]
            predict_acc_65[predict_acc_65 > 0.65] = 1
            predict_acc_65[predict_acc_65 <= 0.65] = 0

            targets_acc_65 = targets[(predict > 0.65)|(predict < 0.35)]
            targets_acc_65[targets_acc_65 > 0.5] = 1
            targets_acc_65[targets_acc_65 <= 0.5] = 0

            acc_60 = (predict_acc_60 == targets_acc_60).type(torch.float).mean()
            acc_70 = (predict_acc_70 == targets_acc_70).type(torch.float).mean() if (predict_acc_70 == targets_acc_70).type(torch.float).mean() is not np.nan else 0
            acc_65 = (predict_acc_65 == targets_acc_65).type(torch.float).mean() if (predict_acc_65 == targets_acc_65).type(torch.float).mean() is not np.nan else 0
            
            predict[predict > 0.5] = 1
            predict[predict <= 0.5] = 0
            
            targets[targets > 0.5] = 1
            targets[targets <= 0.5] = 0
            
            acc = (predict == targets).type(torch.float).mean()
            pr = precision_score(targets.cpu(), predict.cpu())
            re = recall_score(targets.cpu(), predict.cpu())
            

            l_60 = len(predict_acc_60)
            l_70 = len(predict_acc_70)
            l_65 = len(predict_acc_65) 

            lenth_60 += l_60
            lenth_70 += l_70
            lenth_65 += l_65

            all_lenth += len(predict)

            mean_loss += loss.item()
            mean_acc += acc.item()
            mean_pr += pr
            mean_re += re
            mean_acc_60 += acc_60
            mean_acc_70 += acc_70
            mean_acc_65 += acc_65

            if i != len_batches - 1:
              pbar.set_description(f"Test Loss: {loss:.4}, Test Acc: {acc:.4}, Test precision: {pr:.4}, Test recall: {re:.4}, "
                                   f'Accuracy 60%: {acc_60:.4}, ratio 60%: {l_60 / len(predict):.4}, Accuracy 65%: {acc_65:.4}, '
                                   f' ratio 65%: {l_65 / len(predict):.4}, Accuracy 70%: {acc_70:.4}, ratio 70%: {l_70 / len(predict):.4}')
            else:
              pbar.set_description(f"Test Loss: {mean_loss / len(test_loader):.4}, Test Acc: {mean_acc / len(test_loader):.4},"
                                   f' Test precision {mean_pr / len(test_loader):.4}, Test recall {mean_re / len(test_loader):.4}, '
                                   f'Accuracy 60%: {mean_acc_60 / len(test_loader):.4}, ratio 60%: {lenth_60 / all_lenth:.4}, '
                                   f'Accuracy 65%: {mean_acc_65 / len(test_loader):.4}, ratio 65%: {lenth_65 / all_lenth:.4} '
                                   f'Accuracy 70%: {mean_acc_70 / len(test_loader):.4}, ratio 70%: {lenth_70 / all_lenth:.4},')

    return {"Test Loss": mean_loss / len(test_loader), "Test Acc": mean_acc / len(test_loader),
            "Test precision": mean_pr / len(test_loader), "Test recall": mean_re / len(test_loader),
            'Accuracy 60%': mean_acc_60/ len(test_loader), 'Accuracy 70%': mean_acc_70/ len(test_loader), 'Accuracy 65%': mean_acc_60/ len(test_loader)}

In [None]:
# Lowest test loss seen so far; the matching weights are kept in "model.pt".
best_metric = float("inf")
print(num_epochs)

# NOTE: the loop variable is deliberately named `e`; keep the name stable
# unless training() is confirmed not to read it from the notebook's globals.
for e in range(num_epochs):
    training(model, optimizer, criterion, train_dataloader, e, device, scheduler)
    log = testing(model, criterion, test_dataloader, device)
    if not (log["Test Loss"] < best_metric):
        continue
    # Strict improvement: checkpoint first, then record the new best value.
    torch.save(model.state_dict(), "model.pt")
    best_metric = log["Test Loss"]

10


Epoch 1. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 2. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 3. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 4. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 5. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 6. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 7. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 8. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 9. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 10. Train Loss: 0:   0%|          | 0/63 [00:00<?, ?it/s]

Test Loss: 0, Test Acc: 0:   0%|          | 0/3 [00:00<?, ?it/s]