# Обучение моделей

### Подготовка

In [1]:
import pandas
import pickle
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [20]:
LGBMRegressor.__version__

AttributeError: type object 'LGBMRegressor' has no attribute '__version__'

In [5]:
X_train = pandas.read_csv("trainDS_X.csv")
Y_train = pandas.read_csv("trainDS_Y.csv")
X_train = X_train.drop(X_train.columns[[0]], axis='columns')
Y_train = Y_train.drop(Y_train.columns[[0]], axis='columns')

### XGBRegressor

In [26]:
model = XGBRegressor()

In [27]:
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

In [None]:
kfold = StratifiedKFold(n_splits=10, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_root_mean_squared_error", n_jobs=-1, cv=kfold, verbose=1)
result = grid_search.fit(X_train, Y_train)

In [None]:
# grid search
model = XGBRegressor()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, Y_train)
# summarize results
best= [grid_result.best_score_, grid_result.best_params_]
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
thebest = []
for mean, stdev, param in zip(means, stds, params):
    thebest.append([mean, stdev, param])

In [17]:
xgbr = xgb.XGBRegressor(booster='gbtree', verbosity=0, learning_rate=0.1, n_estimators=200, max_depth=50 )

In [18]:
xgbr.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=50,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=8, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=0)

In [22]:
filename = 'finalized_model_xgbr.sav'
pickle.dump(xgbr, open(filename, 'wb'))

### LGBMRegressor

In [None]:
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

In [None]:
kfold = StratifiedKFold(n_splits=10, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
result = grid_search.fit(X_train, Y_train)

In [None]:
# grid search
model = LGBMRegressor()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10,  random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, Y_train)
# summarize results
best= [grid_result.best_score_, grid_result.best_params_]
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
thebest = []
for mean, stdev, param in zip(means, stds, params):
    thebest.append([mean, stdev, param])

In [33]:
lgbm = LGBMRegressor(max_depth = 50, learning_rate = 0.1, n_estimators = 200)

In [34]:
lgbm.fit(X_train, Y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=50,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [35]:
filename = 'finalized_model_lgbm.sav'
pickle.dump(lgbm, open(filename, 'wb'))