In [1]:
# !pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 94 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [2]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv(r'mathes_2022.csv', delimiter=',')

In [4]:
df['right_pick'] = df['right_pick'].astype(int) 

In [5]:
X, y = df.drop(['score_1', 'score_2', 'score', 'date'], axis=1), df['score']

In [6]:
players_cols = ['player1_team1', 'player2_team1', 'player3_team1', 'player4_team1',
                'player5_team1', 'player1_team2', 'player2_team2', 'player3_team2',
                'player4_team2', 'player5_team2']

un = []
for col in players_cols:
      un += X[col].unique().tolist()

un = np.unique(un)

In [7]:
enc = OneHotEncoder()
enc.fit(un.reshape(-1, 1))

OneHotEncoder()

In [8]:
players1_cols = ['player1_team1', 'player2_team1', 'player3_team1', 'player4_team1',
                 'player5_team1']

playersteam1 = np.zeros((X.shape[0], len(un)))
for player in players1_cols:
      playersteam1 += enc.transform(X[player].to_numpy().reshape(-1, 1)).toarray()


players2_cols = ['player1_team2', 'player2_team2', 'player3_team2',
                  'player4_team2', 'player5_team2']

playersteam2 = np.zeros((X.shape[0], len(un)))
for player in players2_cols:
      playersteam2 += enc.transform(X[player].to_numpy().reshape(-1, 1)).toarray()

In [9]:
pca = PCA(n_components=50)
playersteam1_tranformed = pca.fit_transform(playersteam1)
playersteam2_tranformed = pca.transform(playersteam2)

In [10]:
X_new = np.concatenate((X[['DeltaTime', 'team_1', 'team_2']].to_numpy(),
                playersteam1, playersteam2, X[['team_rank_1', 'team_rank_2', 'Map']].to_numpy()), axis=1)

In [11]:
X_new = np.concatenate((X[['DeltaTime', 'team_1', 'team_2', 'team_rank_1', 'team_rank_2', 'Map']].to_numpy(), X[[]].to_numpy(),
                        playersteam1_tranformed, playersteam2_tranformed), axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.07, random_state=42, shuffle=False)

X_train_train, X_val, y_train_train, y_val = train_test_split(
    X_train, y_train, test_size=0.07, random_state=42, shuffle=False)

In [16]:
cat_features = [1, 2, 3, 4, 5]

In [None]:
train_data = Pool(data=X_train,
                  label=y_train,
                  cat_features=cat_features)

train_train_data = Pool(data=X_train,
                  label=y_train,
                  cat_features=cat_features)

eval_data = Pool(data=X_val,
                label=y_val,
                cat_features=cat_features)

test_data = Pool(data=X_test,
                label=y_test,
                cat_features=cat_features)

In [None]:
def make_datas(X_train, y_train, start, end, val):
    train_data = Pool(data=X_train[end:start:-1, :],
                label=y_train[end:start:-1],
                cat_features=cat_features)

    eval_data = Pool(data=X_train[val:end:-1, :],
               label=y_train[val:end:-1],
               cat_features=cat_features)

    return train_data, eval_data, y_train[val:end:-1]


grid = {'learning_rate': [0.3, 0.2, 0.1],
        'depth': [4, 5, 6, 8, 10],
        'iterations': [100, 200, 500, 1000]}
  
best_score = 0
for g in tqdm(ParameterGrid(grid)):
    scores = []
    for start, end, val in [(-10000, -4000, -3500),
                            (-9000, -3000, -2500),
                            (-8000, -2000, -1500),
                            (-7000, -1000, -500)]:
        train_data, eval_data, y_val = make_datas(X_train, y_train, start, end, val)
        model = CatBoostRegressor(iterations=100)
        model.set_params(**g)
        model.fit(train_data, eval_set=eval_data, silent=True)
        preds_class = model.predict(eval_data)
        scores += [accuracy_score(y_val > 0.5, preds_class > 0.5)]

    scores = np.array(scores)
    score = scores.mean()
        
    if score > best_score:
        best_score = score
        best_grid = g

  0%|          | 0/60 [00:00<?, ?it/s]

In [None]:
best_grid

{'depth': 5, 'iterations': 100, 'learning_rate': 0.3}

In [None]:
best_score

0.609

In [None]:
grid = {'learning_rate': [0.3, 0.2, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 5, 9]}
  
best_score = 0

for g in tqdm(ParameterGrid(grid)):
    model = CatBoostRegressor(**g)
    model.fit(train_train_data, eval_set=eval_data, silent=True)
    preds_class = model.predict(eval_data)

    score = accuracy_score(y_val > 0.5, preds_class > 0.5)
    if score > best_score:
        best_score = score
        best_grid = g

  0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
best_score

0.7898230088495575

In [None]:
best_grid

{'depth': 6, 'l2_leaf_reg': 9, 'learning_rate': 0.3}

In [17]:
train_data = Pool(data=X_train[-1:-10000:-1, :],
                  label=y_train[-1:-10000:-1],
                  cat_features=cat_features)

test_data = Pool(data=X_test,
                label=y_test,
                cat_features=cat_features)

In [18]:
model = CatBoostRegressor()

model.fit(train_data)
preds_class = model.predict(test_data)

Learning rate set to 0.058908
0:	learn: 0.2542022	total: 95.3ms	remaining: 1m 35s
1:	learn: 0.2533745	total: 134ms	remaining: 1m 6s
2:	learn: 0.2525271	total: 172ms	remaining: 57.3s
3:	learn: 0.2517810	total: 222ms	remaining: 55.4s
4:	learn: 0.2510666	total: 260ms	remaining: 51.7s
5:	learn: 0.2504671	total: 300ms	remaining: 49.7s
6:	learn: 0.2498003	total: 343ms	remaining: 48.7s
7:	learn: 0.2491649	total: 385ms	remaining: 47.8s
8:	learn: 0.2486661	total: 433ms	remaining: 47.6s
9:	learn: 0.2481444	total: 479ms	remaining: 47.5s
10:	learn: 0.2476626	total: 522ms	remaining: 46.9s
11:	learn: 0.2472481	total: 560ms	remaining: 46.1s
12:	learn: 0.2468247	total: 607ms	remaining: 46.1s
13:	learn: 0.2464910	total: 649ms	remaining: 45.7s
14:	learn: 0.2461103	total: 688ms	remaining: 45.2s
15:	learn: 0.2457523	total: 734ms	remaining: 45.1s
16:	learn: 0.2453846	total: 778ms	remaining: 45s
17:	learn: 0.2450211	total: 816ms	remaining: 44.5s
18:	learn: 0.2446217	total: 860ms	remaining: 44.4s
19:	learn: 