In [118]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from hyperopt import hp, tpe, space_eval
from hyperopt.fmin import fmin
from sklearn.model_selection import cross_val_score
import scipy.stats as st
from sklearn.metrics import mean_squared_log_error, make_scorer

# Формирование данных и пространства гиперпараметров

In [120]:
# пространство гиперпараметров
space4rf = {
    'max_depth': hp.choice('max_depth', range(5,15)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'n_estimators': hp.choice('n_estimators', range(100,1000, 100))
}
xgb_space = {
            'max_depth': hp.choice('x_max_depth',[2,3,4,5,6]),
            'min_child_weight':hp.choice('x_min_child_weight',np.round(np.arange(0.0,0.2,0.01),5)),
            'learning_rate':hp.choice('x_learning_rate',np.round(np.arange(0.005,0.3,0.01),5)),
            'subsample':hp.choice('x_subsample',np.round(np.arange(0.1,1.0,0.05),5)),
            'colsample_bylevel':hp.choice('x_colsample_bylevel',np.round(np.arange(0.1,1.0,0.05),5)),
            'colsample_bytree':hp.choice('x_colsample_bytree',np.round(np.arange(0.1,1.0,0.05),5)),
            'n_estimators':hp.choice('x_n_estimators',np.arange(100, 300, 50))
            }
space_lgbm = {
            'n_estimators': hp.choice('n_estimators', np.arange(100, 300, 50)),
            'learning_rate': hp.quniform('eta', 0.025, 0.5, 0.025),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
            'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
            'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
            'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        }

In [122]:
def clf_high_boundary(clf, X_train, y_train, scoring, cv=5):
    ''' вычисление верхней границы дов. ин-ла по n фолдам кросс-валидации'''
    # вычисляем score на кросс валидации
    val_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=scoring)
    # доверительный интервал
    conf_interval = st.t.interval(0.95, len(val_scores)-1, loc=val_scores.mean(), scale=val_scores.std())
    return conf_interval[1]

In [123]:
def hyperopt_model(model, params, X_train, y_train, scoring='neg_mean_squared_log_error', evals=30):
    def hyperopt_score(params):
        clf = model(**params)
        return clf_high_boundary(clf, X_train, y_train, scoring=scoring)

    best = fmin(fn=hyperopt_score, space=params, algo=tpe.suggest, max_evals=evals)
    return model(**space_eval(params, best)), space_eval(params, best)

In [19]:
test_df = pd.read_csv('test.csv')
test_length = len(test_df)

In [22]:
full_train_length = 74018464

In [37]:
columns = pd.read_csv('train.csv', nrows=2).columns
test = pd.read_csv('train.csv', skiprows=full_train_length-test_length, nrows=test_length)
test.columns = columns

In [83]:
train_baseline = pd.read_csv('train.csv', nrows=10**5)

y = train_baseline['Demanda_uni_equil']
train_baseline.drop('Demanda_uni_equil', axis=1, inplace=True)

## Data fields

Semana — Week number (From Thursday to Wednesday)

Agencia_ID — Sales Depot ID

Canal_ID — Sales Channel ID

Ruta_SAK — Route ID (Several routes = Sales Depot)

Cliente_ID — Client ID

NombreCliente — Client name

Producto_ID — Product ID

NombreProducto — Product Name

Venta_uni_hoy — Sales unit this week (integer)

Venta_hoy — Sales this week (unit: pesos)

Dev_uni_proxima — Returns unit next week (integer)

Dev_proxima — Returns next week (unit: pesos)

Demanda_uni_equil — Adjusted Demand (integer) (This is the target you will predict)

In [38]:
test_df['Semana'].value_counts()

10    3538385
11    3460866
Name: Semana, dtype: int64

In [41]:
test['Semana'].value_counts()

9    6999251
Name: Semana, dtype: int64

## Подбор параметров на части данных

In [72]:
rf, _ = hyperopt_model(RandomForestRegressor, space4rf, train_baseline, y)

100%|█████████████████████████████████████████████| 30/30 [48:07<00:00, 96.26s/trial, best loss: -0.042993944663950345]


(RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=5, max_features='log2', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=300, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False),
 {'max_depth': 5, 'max_features': 'log2', 'n_estimators': 300})

In [74]:
rf.fit(train_baseline, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [107]:
xgb, _ = hyperopt_model(XGBRegressor, xgb_space, train_baseline, y, scoring='neg_root_mean_squared_error')

100%|████████████████████████████████████████████████| 30/30 [12:20<00:00, 24.70s/trial, best loss: 21.789172519451633]


(XGBRegressor(base_score=None, booster=None, colsample_bylevel=0.15,
              colsample_bynode=None, colsample_bytree=0.75, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.235, max_delta_step=None, max_depth=6,
              min_child_weight=0.1, missing=nan, monotone_constraints=None,
              n_estimators=150, n_jobs=None, num_parallel_tree=None,
              objective='reg:squarederror', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.15,
              tree_method=None, validate_parameters=False, verbosity=None),
 {'colsample_bylevel': 0.15,
  'colsample_bytree': 0.75,
  'learning_rate': 0.235,
  'max_depth': 6,
  'min_child_weight': 0.1,
  'n_estimators': 150,
  'subsample': 0.15})

In [112]:
xgb.fit(train_baseline, y)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=0.15,
             colsample_bynode=1, colsample_bytree=0.75, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.235, max_delta_step=0, max_depth=6,
             min_child_weight=0.1, missing=nan, monotone_constraints=None,
             n_estimators=150, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.15, tree_method=None,
             validate_parameters=False, verbosity=None)

In [125]:
lgbm , _ = hyperopt_model(LGBMRegressor, space_lgbm, train_baseline, y, scoring='neg_root_mean_squared_error')

100%|█████████████████████████████████████████████████| 30/30 [01:21<00:00,  2.71s/trial, best loss: 25.19863906155475]


In [126]:
lgbm.fit(train_baseline, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
              gamma=0.65, importance_type='split',
              learning_rate=0.47500000000000003, max_depth=2,
              min_child_samples=20, min_child_weight=2.0, min_split_gain=0.0,
              n_estimators=150, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=0.65, subsample_for_bin=200000, subsample_freq=0)

## Подсчет финального score

In [76]:
y_test = test['Demanda_uni_equil']

In [80]:
mean_squared_log_error(y_true=y_test, y_pred=rf.predict(test.iloc[:, :-1]))**0.5

0.43111007335434876

In [117]:
mean_squared_log_error(y_true=y_test, y_pred=np.abs(xgb.predict(test.iloc[:, :-1])))**0.5

0.3015254197112287

In [127]:
mean_squared_log_error(y_true=y_test, y_pred=np.abs(lgbm.predict(test.iloc[:, :-1])))**0.5

0.16828117671639434

## Измененный датасет

In [129]:
train_baseline

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0
...,...,...,...,...,...,...,...,...,...,...
99995,3,1112,1,1417,331275,1284,8,24.16,0,0.0
99996,3,1112,1,1417,331275,3270,3,31.41,0,0.0
99997,3,1112,1,1417,331275,31423,4,42.76,0,0.0
99998,3,1112,1,1417,331275,35651,20,150.00,0,0.0


In [131]:
train_baseline['Dev_uni_proxima'].value_counts()

0       97559
1        1179
2         563
3         220
4         126
5         109
6          54
7          31
8          24
10         17
9          16
11         14
12         13
15         10
14          7
20          7
24          5
13          4
19          4
21          4
25          3
40          3
18          3
30          3
35          2
16          2
28          2
17          2
26          1
27          1
152         1
150         1
672         1
76          1
38          1
43          1
110         1
48          1
1008        1
50          1
55          1
60          1
Name: Dev_uni_proxima, dtype: int64