In [353]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as catb
import lightgbm as lgbm

In [354]:
df_match_train = pd.read_parquet('match_train.par')
df_match_events = pd.read_parquet('match_events.par')
df_team_features = pd.read_parquet('team_features.par')
df_match_test = pd.read_parquet('match_test.par')

In [355]:
df_match_train.head(3)

Unnamed: 0,mid,team_won
0,20348,1
1,1305,0
2,11021,0


In [356]:
df_match_events.head(3)

Unnamed: 0,mid,event_type,from_team,time
0,0,3,0,1
1,1,3,0,222
2,2,3,1,143


In [357]:
df_match_events['time'].sort_values().unique()

array([-55, -54, -49, -48, -45, -41, -40, -39, -38, -35, -34, -33, -32,
       -31, -30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19,
       -18, -17, -16, -15, -14, -13, -12, -11, -10,  -9,  -8,  -7,  -6,
        -5,  -4,  -3,  -2,  -1,   0,   1,   2,   3,   4,   5,   6,   7,
         8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,
        21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
        47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
        60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
        73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,
        86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
       125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 13

В таблице df_match_events в столбце time имеются отрицательные значения, скорее всего это ошибка, знак минуса нужно убрать

In [358]:
df_match_events['time'] = df_match_events['time'].apply(lambda x: -x if x < 0 else x)

Строим сводную таблицу pivot_table,  где пропуски ставим “-1” (не было никакого события)

In [359]:
df_match_events['event_type_count'] = df_match_events['event_type']
pivot_match_events = pd.pivot_table(df_match_events, values=['event_type', 'time', 'event_type_count'], index='mid', columns='from_team',
              aggfunc={'event_type': np.mean, 
                       'time': np.mean, 
                       'event_type_count': np.size})
pivot_match_events.fillna(-1, inplace=True)

In [360]:
pivot_match_events.head(3)

Unnamed: 0_level_0,event_type,event_type,event_type_count,event_type_count,time,time
from_team,0,1,0,1,0,1
mid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,3.0,-1.0,1.0,-1.0,1.0,-1.0
1,3.0,-1.0,1.0,-1.0,222.0,-1.0
2,-1.0,3.0,-1.0,1.0,-1.0,143.0


In [361]:
df_team_features.head(3)

Unnamed: 0,mid,times,team0_f0_sum,team0_f0_mean,team0_f0_min,team0_f0_max,team0_f1_sum,team0_f1_mean,team0_f1_min,team0_f1_max,...,team1_f0_min,team1_f0_max,team1_f1_sum,team1_f1_mean,team1_f1_min,team1_f1_max,team1_f2_sum,team1_f2_mean,team1_f2_min,team1_f2_max
0,0,60,2354,470.8,350,750,7,1.4,1,2,...,287,1056,15,3.0,1,7,809,161.8,78,396
1,0,120,4396,879.2,633,1080,15,3.0,1,6,...,438,1360,32,6.4,1,14,2598,519.6,241,895
2,0,180,5919,1183.8,782,1527,28,5.6,1,10,...,587,2072,46,9.2,1,18,3950,790.0,242,1259


In [362]:
df_team_features['times'].unique()

array([ 60, 120, 180, 240, 300, 360, 420, 480, 540, 600], dtype=int64)

In [363]:
pivot_team_features = pd.pivot_table(df_team_features, values=df_team_features.iloc[:, 2:].columns, index='mid' , columns='times')

In [364]:
df_match_train.index = df_match_train['mid']

In [365]:
total_table = pd.concat([df_match_train, pivot_match_events, pivot_team_features], 
          join='outer', ignore_index=True, axis=1, keys='mid').dropna()
total_table.drop(columns=0, inplace=True)
total_table.rename(columns={1: 'won'}, inplace=True)
total_table.head()

Unnamed: 0_level_0,won,2,3,4,5,6,7,8,9,10,...,238,239,240,241,242,243,244,245,246,247
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,3.0,-1.0,1.0,-1.0,1.0,-1.0,750,1080,1527,...,809,2598,3950,5560,7285,8936,9888,11613,13289,14930
2,0.0,-1.0,3.0,-1.0,1.0,-1.0,143.0,564,1260,1780,...,731,2210,3326,4519,5721,7361,8215,9564,11467,12445
4,0.0,-1.0,3.0,-1.0,1.0,-1.0,53.0,561,893,1276,...,849,2406,4200,5509,7089,8771,9628,11435,12634,13076
6,0.0,4.5,-1.0,2.0,-1.0,302.5,-1.0,555,1425,1943,...,788,2383,3622,5538,7102,8680,9765,12002,14992,16051
8,1.0,-1.0,4.5,-1.0,2.0,-1.0,293.0,627,1095,1621,...,681,2519,3738,5653,7094,8592,9910,12356,13685,15343


In [366]:
y = total_table['won']
X = total_table.drop(columns='won')

Баланс классов, процент класса "0"

In [367]:
print(y[y == 0].size / y.size)

0.5155813953488372


In [368]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.33, random_state=13)

In [369]:
print('SGDClassifier')
clf = SGDClassifier(max_iter=1000, learning_rate='constant', eta0=0.1, random_state=13) #default
# clf = SGDClassifier(max_iter=281, learning_rate='invscaling', eta0=0.3, random_state=13) #gridresearch
clf.fit(X_train, y_train)
print('MSE:', mean_squared_error(y_validation, clf.predict(X_validation)))
print('ROC AUC:', roc_auc_score(y_validation, clf.predict(X_validation)))

SGDClassifier
MSE: 0.4427413671599718
ROC AUC: 0.5392968852596015


In [370]:
print('RandomForest')
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('MSE:', mean_squared_error(y_validation, rfc.predict(X_validation)))
print('ROC AUC:', roc_auc_score(y_validation, rfc.predict(X_validation)))

RandomForest
MSE: 0.34954193093727975
ROC AUC: 0.6488298831226198


In [371]:
print('XGboost')
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)
print('MSE:', mean_squared_error(y_validation, xgb_cl.predict(X_validation)))
print('ROC AUC:', roc_auc_score(y_validation, xgb_cl.predict(X_validation)))

XGboost
MSE: 0.37068357998590556
ROC AUC: 0.6288591024676767


In [372]:
print('CatBoost')
catb_cl = catb.CatBoostClassifier()
catb_cl.fit(X_train, y_train, verbose=False)
print('MSE:', mean_squared_error(y_validation, catb_cl.predict(X_validation)))
print('ROC AUC:', roc_auc_score(y_validation, catb_cl.predict(X_validation)))

CatBoost
MSE: 0.3449612403100775
ROC AUC: 0.6545095985100821


In [373]:
print('LightGBM')
lgbm_cl = lgbm.LGBMClassifier()
lgbm_cl.fit(X_train, y_train)
print('MSE:', mean_squared_error(y_validation, lgbm_cl.predict(X_validation)))
print('ROC AUC:', roc_auc_score(y_validation, lgbm_cl.predict(X_validation)))

LightGBM
MSE: 0.35658914728682173
ROC AUC: 0.6426218915272853


In [374]:
total_test = pd.concat([df_match_train, pivot_match_events, pivot_team_features], 
          join='outer', ignore_index=True, axis=1, keys='mid')
total_test = total_test[total_test[0].isna()]
total_test.drop(columns=0, inplace=True)
total_test.rename(columns={1: 'won'}, inplace=True)
total_test.head()

Unnamed: 0_level_0,won,2,3,4,5,6,7,8,9,10,...,238,239,240,241,242,243,244,245,246,247
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,-1.0,1.0,-1.0,222.0,-1.0,557,950,1473,...,843,1992,3744,5218,7427,9317,11418,12825,14906,17337
3,,3.0,-1.0,1.0,-1.0,143.0,-1.0,556,928,1382,...,803,2007,3512,4622,6662,8194,9723,11583,13211,13933
7,,6.0,3.0,2.0,1.0,439.0,77.0,537,1048,1594,...,668,2315,3544,5217,7624,8859,10422,12341,15495,17352
9,,-1.0,3.0,-1.0,1.0,-1.0,24.0,555,1080,1746,...,867,2502,3839,5021,7271,8747,10683,12724,14965,16239
10,,3.0,-1.0,1.0,-1.0,16.0,-1.0,948,1325,1779,...,849,2353,4260,5682,7409,8736,10480,12598,15248,16234


In [375]:
y_test = total_test['won']
X_test = total_test.drop(columns='won')

In [376]:
print('CatBoost')
catb_cl = catb.CatBoostClassifier()
catb_cl.fit(X_train, y_train, verbose=False)
catb_cl.predict(X_test)

CatBoost


array([1., 0., 1., ..., 1., 1., 0.])