In [1]:
from catboost import CatBoostClassifier, Pool
import timeit
import random
import pandas as pd
from pandas import Series
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_validate, learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, auc, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import cv

### Getting data from two csv files and merging into one dataframe

In [2]:
df_match = pd.read_csv('match.csv')
df_players = pd.read_csv('players.csv')

In [3]:
display(df_match.head(5))
display(df_players.head(5))

Unnamed: 0,match_id,start_time,duration,tower_status_radiant,tower_status_dire,barracks_status_dire,barracks_status_radiant,first_blood_time,game_mode,radiant_win,negative_votes,positive_votes,cluster
0,0,1446750112,2375,1982,4,3,63,1,22,True,0,1,155
1,1,1446753078,2582,0,1846,63,0,221,22,False,0,2,154
2,2,1446764586,2716,256,1972,63,48,190,22,False,0,0,132
3,3,1446765723,3085,4,1924,51,3,40,22,False,0,0,191
4,4,1446796385,1887,2047,0,0,63,58,22,True,0,0,156


Unnamed: 0,match_id,account_id,hero_id,player_slot,gold,gold_spent,gold_per_min,xp_per_min,kills,deaths,...,unit_order_glyph,unit_order_eject_item_from_stash,unit_order_cast_rune,unit_order_ping_ability,unit_order_move_to_direction,unit_order_patrol,unit_order_vector_target_position,unit_order_radar,unit_order_set_item_combine_lock,unit_order_continue
0,0,0,86,0,3261,10960,347,362,9,3,...,,,,6.0,,,,,,
1,0,1,51,1,2954,17760,494,659,13,3,...,,,,14.0,,,,,,
2,0,0,83,2,110,12195,350,385,0,4,...,,,,17.0,,,,,,
3,0,2,11,3,1179,22505,599,605,8,4,...,1.0,,,13.0,,,,,,
4,0,3,67,4,3307,23825,613,762,20,3,...,3.0,,,23.0,,,,,,


In [4]:
df_stat = df_match[['match_id', 'radiant_win']]

In [5]:
pivot = pd.pivot_table(df_players,
               index=['match_id'],
              columns=['player_slot'],
              values=['hero_id'])

In [6]:
df_stat['0'] = pivot['hero_id'][0]
df_stat['1'] = pivot['hero_id'][1]
df_stat['2'] = pivot['hero_id'][2]
df_stat['3'] = pivot['hero_id'][3]
df_stat['4'] = pivot['hero_id'][4]
df_stat['128'] = pivot['hero_id'][128]
df_stat['129'] = pivot['hero_id'][129]
df_stat['130'] = pivot['hero_id'][130]
df_stat['131'] = pivot['hero_id'][131]
df_stat['132'] = pivot['hero_id'][132]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stat['0'] = pivot['hero_id'][0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stat['1'] = pivot['hero_id'][1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stat['2'] = pivot['hero_id'][2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [7]:
display(df_stat)

Unnamed: 0,match_id,radiant_win,0,1,2,3,4,128,129,130,131,132
0,0,True,86,51,83,11,67,106,102,46,7,73
1,1,False,7,82,71,39,21,73,22,5,67,106
2,2,False,51,109,9,41,27,38,7,10,12,85
3,3,False,50,44,32,26,39,78,19,31,40,47
4,4,True,8,39,55,87,69,101,100,22,67,21
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,True,73,86,21,20,14,32,7,109,35,112
49996,49996,True,93,74,100,32,85,36,1,112,60,71
49997,49997,True,100,68,75,39,44,28,102,21,9,23
49998,49998,True,56,50,2,72,30,46,7,29,44,3


### For forecasting I use catboost open source gradient boosting library

In [8]:
df_train = df_stat.drop(columns = ['match_id'])

In [9]:
X_data_train = df_train.drop(columns = ['radiant_win'])
y_data_train = df_train['radiant_win']

### First draft

In [10]:
RANDOM_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(X_data_train, y_data_train,
                                               random_state=RANDOM_SEED,
                                               test_size=0.2)

In [11]:
def train_on_cpu():  
  model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    task_type='CPU',
      custom_loss=['AUC', 'Accuracy']
  )
  
  model.fit(
      X_train, y_train,
      eval_set=(X_valid, y_valid),
      verbose=False,
      plot=True
  );     
      
gpu_time = timeit.timeit('train_on_cpu()', 
                         setup="from __main__ import train_on_cpu", 
                         number=1)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

### Cross-validation

In [12]:
params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': ['AUC', 'Accuracy'],
    'learning_rate': 0.05,
    'task_type': 'CPU',
}

cv_data = cv(
    params = params,
    pool = Pool(X_data_train, label=y_data_train),
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    plot=True,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.688500444
bestIteration = 551

Training on fold [1/5]

bestTest = 0.6886132968
bestIteration = 518

Training on fold [2/5]

bestTest = 0.690172876
bestIteration = 733

Training on fold [3/5]

bestTest = 0.6878570543
bestIteration = 852

Training on fold [4/5]

bestTest = 0.6895929935
bestIteration = 572



### Since players can choose heroes in different sequences, I decided to add data by shuffling the heroes into teams for each game. But if there are two teams of five players, the total number of combinations is 14400. Generation of new data and training on them will take a very long time, so I did a parallel mixing of teams and took 10% of random combinations of heroes. And so I increased the original data by 12 times.

In [13]:
df_team1 = df_stat[['0','1','2','3','4']]
df_team2 = df_stat[['128','129','130','131','132']]

In [14]:
from itertools import permutations

df_team1_pmt = pd.DataFrame()
df_team2_pmt = pd.DataFrame()
df_train_pmt = pd.DataFrame(columns=('radiant_win','0','1','2','3','4','128','129','130','131','132'))
df_test = pd.DataFrame()

#df_test = df_train.head(10)

for i in range(df_train.shape[0]):
    df_team1_tmp = pd.DataFrame()
    df_team2_tmp = pd.DataFrame()
    df_radiant_stat = pd.DataFrame()
    
    for j in permutations(df_team1.iloc[i]):
        df_t = pd.DataFrame(j).T
        df_team1_tmp = df_team1_tmp.append(df_t)

    for j in permutations(df_team2.iloc[i]):
        df_t = pd.DataFrame(j).T
        df_team2_tmp = df_team2_tmp.append(df_t)

    df_team1_tmp = df_team1_tmp.sample(frac=0.1, replace=True, random_state=42)
    df_team1_tmp.columns = ['0','1','2','3','4']
    
    df_team2_tmp = df_team2_tmp.sample(frac=0.1, replace=True, random_state=42)
    df_team2_tmp.columns = ['128','129','130','131','132']
    
    df_pmt_tmp = pd.concat([df_team1_tmp, df_team2_tmp], axis=1)
    
    df_train_pmt = df_train_pmt.append(df_pmt_tmp)
    df_train_pmt = df_train_pmt.fillna(df_train['radiant_win'][i])
    
display(df_train_pmt)

Unnamed: 0,radiant_win,0,1,2,3,4,128,129,130,131,132
0,True,67,51,86,83,11,73,102,106,46,7
0,True,83,86,11,67,51,46,106,7,73,102
0,True,11,67,51,86,83,7,73,102,106,46
0,True,86,11,83,51,67,106,7,46,102,73
0,True,67,51,11,86,83,73,102,7,106,46
...,...,...,...,...,...,...,...,...,...,...,...
0,False,94,21,68,19,35,100,53,90,9,73
0,False,21,19,94,68,35,53,9,100,90,73
0,False,35,19,21,94,68,73,9,53,100,90
0,False,35,68,19,94,21,73,90,9,100,53


In [15]:
X_data_train_pmt = df_train_pmt.drop(columns = ['radiant_win'])
y_data_train_pmt = df_train_pmt['radiant_win']

In [16]:
params = {
    'loss_function': 'Logloss',
    'iterations': 10000,
    'custom_loss': ['AUC', 'Accuracy'],
    'learning_rate': 0.5,
    'task_type': 'CPU',
}

cv_data = cv(
    params = params,
    pool = Pool(X_data_train_pmt, label=y_data_train_pmt),
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    plot=True,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.6510199776
bestIteration = 7945

Training on fold [1/5]

bestTest = 0.6502911677
bestIteration = 6044

Training on fold [2/5]

bestTest = 0.6503841482
bestIteration = 7268

Training on fold [3/5]

bestTest = 0.6498860348
bestIteration = 4340

Training on fold [4/5]

bestTest = 0.6464247856
bestIteration = 8229



### It turned out to achieve a result of 66% accuracy. Because victory depends on many parameters, and not just on the selected heroes, I consider this a good result. If we develop this topic further, we can add dependence on the use of heroes by specific players, as well as statistics on team victories.