# Обучение модели


In [1]:
import pandas as pd
import numpy as np
import os.path

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

RANDOM_STATE = 1337

In [51]:
dataset_path = os.path.join('datasets/dataset.csv')
df = pd.read_csv(dataset_path)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3295 entries, 0 to 3294
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   filename           3295 non-null   object 
 1   kills              3295 non-null   int64  
 2   deaths             3295 non-null   int64  
 3   kd                 3295 non-null   float64
 4   counted_kd         3295 non-null   float64
 5   avg                3295 non-null   int64  
 6   is_ranked          3295 non-null   int64  
 7   season             3295 non-null   int64  
 8   rank_games         3295 non-null   int64  
 9   rank_kills         3295 non-null   int64  
 10  rank_deaths        3295 non-null   int64  
 11  rank_kd            3295 non-null   float64
 12  rank_counted_kd    3295 non-null   float64
 13  rank_avg           3295 non-null   int64  
 14  rank_s1            3295 non-null   int64  
 15  rank_s2            3295 non-null   int64  
 16  season_kills       3295 

In [52]:
df.head()

Unnamed: 0,filename,kills,deaths,kd,counted_kd,avg,is_ranked,season,rank_games,rank_kills,rank_deaths,rank_kd,rank_counted_kd,rank_avg,rank_s1,rank_s2,season_kills,season_deaths,season_kd,season_counted_kd,season_avg,skill
0,-ANBU-1477.png,9044,4363,2.0700,2.0700,622,1,10,284,352,285,1.2300,1.2400,593,6,6,0,0,0.0000,0.0000,0,3
1,.7.7.7._.7.7.7._941607883328598016.png,3624,2555,1.4100,1.4200,576,1,11,859,842,863,0.9700,0.9800,528,5,6,0,0,0.0000,0.0000,0,3
2,.abua._1305861171118866483.png,80,125,0.6400,0.6400,201,0,0,0,0,0,0.0000,0.0000,0,0,0,78,121,0.6400,0.6400,202,2
3,.akashka_1251553790545494141.png,2469,2369,1.0400,1.0400,352,1,21,65,135,63,2.1400,2.1400,780,3,3,0,0,0.0000,0.0000,0,2
4,.alvarezzztm_1084379926460911667.png,4346,4637,0.9400,0.9400,378,1,16,109,172,108,1.5900,1.5900,611,3,3,0,0,0.0000,0.0000,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3290,мой твинк сезон 8 конец.png,3925,2201,1.7800,1.7800,534,0,0,0,0,0,0.0000,0.0000,0,0,0,588,313,1.8700,1.8800,587,3
3291,мой твинк сезон 8 начало.png,3460,1950,1.7700,1.7700,525,1,8,21,57,19,3.0000,3.0000,806,3,3,0,0,0.0000,0.0000,0,3
3292,scriptrix.png,13901,5498,2.5200,2.5300,576,0,0,0,0,0,0.0000,0.0000,0,0,0,3702,1217,3.0400,3.0400,642,5
3293,hARTLY.jpg,78826,24423,3.2300,3.2300,833,1,23,752,2904,666,4.3600,4.3600,1394,7,7,0,0,0.0000,0.0000,0,5


In [56]:
kd_dif = (df['kd'] - df['counted_kd'] > 0.02) | (df['rank_kd'] - df['rank_counted_kd'] > 0.02) | (df['season_kd'] - df['season_counted_kd'] > 0.02)
df[kd_dif]

Unnamed: 0,filename,kills,deaths,kd,counted_kd,avg,is_ranked,season,rank_games,rank_kills,rank_deaths,rank_kd,rank_counted_kd,rank_avg,rank_s1,rank_s2,season_kills,season_deaths,season_kd,season_counted_kd,season_avg,skill


In [57]:
df = df.drop(['counted_kd', 'rank_counted_kd', 'season_counted_kd'], axis=1)

In [58]:
def season(s):
    if s == 12 or s == 17:
        return -1
    elif s == 13:
        return 1
    else:
        return 0
df['season'] = df['season'].apply(season)

In [61]:
X = df.drop(['filename', 'skill'], axis=1)
y = df['skill']

In [64]:
X.head(10)

Unnamed: 0,kills,deaths,kd,avg,is_ranked,season,rank_games,rank_kills,rank_deaths,rank_kd,rank_avg,rank_s1,rank_s2,season_kills,season_deaths,season_kd,season_avg
0,9044,4363,2.07,622,1,0,284,352,285,1.23,593,6,6,0,0,0.0,0
1,3624,2555,1.41,576,1,0,859,842,863,0.97,528,5,6,0,0,0.0,0
2,80,125,0.64,201,0,0,0,0,0,0.0,0,0,0,78,121,0.64,202
3,2469,2369,1.04,352,1,0,65,135,63,2.14,780,3,3,0,0,0.0,0
4,4346,4637,0.94,378,1,0,109,172,108,1.59,611,3,3,0,0,0.0,0
5,1180,1056,1.11,422,1,-1,348,428,348,1.22,495,5,4,0,0,0.0,0
6,156,160,0.97,263,0,0,0,0,0,0.0,0,0,0,9,14,0.64,195
7,34879,10285,3.39,839,1,-1,3410,8430,3132,2.69,812,7,7,0,0,0.0,0
8,29822,8419,3.54,847,1,-1,1360,3373,1267,2.66,830,6,6,0,0,0.0,0
9,1247,1437,0.87,360,0,0,0,0,0,0.0,0,0,0,359,450,0.8,373


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25, random_state=RANDOM_STATE)

print('Train size:')
print(f'features {X_train.shape};')
print(f'target {y_train.shape}')
print(f"it's {round((y_train.count() / y.count()) * 100)}% of dataset")
print()
print('Test size:')
print(f'features {X_test.shape};')
print(f'target {y_test.shape}')
print(f"it's {round((y_test.count() / y.count()) * 100)}% of dataset")

Размер тренировочной выборки:
признаки (2471, 17);
целевой признак (2471,)
это 75% от датасета

Размер тестовой выборки:
признаки (824, 17);
целевой признак (824,)
это 25% от датасета


In [67]:
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(random_state=42))
])

X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

In [68]:
cv = KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)

lgbm_parameters = {'n_estimators': range(100, 1001, 100),
                   'max_depth': [1, 2, 4, 8, 12, 16],
                   'num_leaves': range(20, 51),
                   'learning_rate': np.linspace(0.01, 0.2, 20)
                   }

# создаем пайплайн - объединяем преобразованные данные и модель
model = LGBMClassifier(random_state=RANDOM_STATE, objective='multiclass', n_jobs=8, verbose=-1)

lgbm_sch = RandomizedSearchCV(model,
                              param_distributions=lgbm_parameters,
                              n_iter=100,
                              cv=cv,
                              scoring='accuracy',
                              random_state=RANDOM_STATE
                              )

lgbm_sch.fit(X_resampled, y_resampled)

print('Best accuracy score on cross-validation: ', lgbm_sch.best_score_)
print('with parameters: ', lgbm_sch.best_params_)

Наилучший показатель accuracy на кросс-валидации:  0.9461597742887745
при параметрах:  {'num_leaves': 49, 'n_estimators': 900, 'max_depth': 12, 'learning_rate': 0.12}


In [71]:
model = LGBMClassifier(random_state=RANDOM_STATE, 
                       objective='multiclass',
                       n_estimators = 900,
                       max_depth = 12,
                       num_leaves = 49,
                       learning_rate = 0.12,
                       n_jobs=8, 
                       verbose=-1)

model.fit(X_resampled, y_resampled)
predict = model.predict(X_test)

accuracy_score(predict, y_test)

0.8203883495145631

In [72]:
import joblib

joblib.dump(model, 'skill_pred_model\\lgbm_3_model.pkl')

['skill_pred_model\\lgbm_3_model.pkl']