In [393]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import mean_absolute_error
from copy import deepcopy
sns.set()

In [396]:
X_train = pd.read_csv('lt_train.csv')
X_test = pd.read_csv('lt_test.csv')

In [397]:
X_train.describe()

Unnamed: 0,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id
count,402902.0,402902.0,249593.0,3402342.0,3402342.0,2182591.0,2147736.0,3402342.0,3402342.0,3404846.0
mean,866.363049,68773.279194,0.381047,0.408595,0.001626821,2256.149,1488.29,0.1222108,0.7714183,1707302.0
std,339.571497,27970.451505,0.690858,0.5081324,0.04030105,551.4987,440.8112,0.7633092,0.4199192,983338.7
min,309.0,23252.0,0.0,0.0,0.0,495.0,0.99,0.0,0.0,1.0
25%,557.0,37466.0,0.033175,0.0,0.0,1999.0,1181.25,0.0,1.0,856364.2
50%,798.0,76559.0,0.090909,0.0,0.0,2399.0,1434.414,0.0,1.0,1707576.0
75%,1191.0,88915.0,0.466667,1.0,0.0,2600.0,1762.5,0.0,1.0,2558787.0
max,1380.0,149517.0,27.0,3.0,1.0,5699.0,5449.0,11.0,1.0,3409998.0


In [398]:
X_train['Date'] = pd.to_datetime(X_train['period_dt'], dayfirst = True)
X_train.set_index(['Date'], inplace=True)
X_train.drop('period_dt', axis=1, inplace=True)
X_train.head()

Unnamed: 0_level_0,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-12-16,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,1.0,1
2019-12-30,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,1.0,3
2019-12-09,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,0.0,4
2019-12-02,453.0,23285.0,,1.0,0.0,2199.0,1319.4,0.0,1.0,5
2019-12-02,764.0,23285.0,,1.0,0.0,2199.0,1099.5,0.0,0.0,6


In [399]:
data = X_train.reset_index().copy(deep = True)

data["monthday"] = data.Date.dt.day
data["month"] = data.Date.dt.month
data["year"] = data.Date.dt.year

In [400]:
data = data.assign(ln_demand = np.log(data['demand'] + 1))
data

Unnamed: 0,Date,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id,monthday,month,year,ln_demand
0,2019-12-16,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,1.0,1,16,12,2019,
1,2019-12-30,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,1.0,3,30,12,2019,
2,2019-12-09,764.0,23285.0,,0.0,0.0,2199.0,1099.5,0.0,0.0,4,9,12,2019,
3,2019-12-02,453.0,23285.0,,1.0,0.0,2199.0,1319.4,0.0,1.0,5,2,12,2019,
4,2019-12-02,764.0,23285.0,,1.0,0.0,2199.0,1099.5,0.0,0.0,6,2,12,2019,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3404841,2019-12-30,,,,1.0,0.0,,,0.0,0.0,3409994,30,12,2019,
3404842,2019-12-30,,,,1.0,0.0,,,0.0,0.0,3409995,30,12,2019,
3404843,2019-12-30,,,,1.0,0.0,,,0.0,0.0,3409996,30,12,2019,
3404844,2019-12-30,,,,1.0,0.0,,,0.0,0.0,3409997,30,12,2019,


In [405]:
def fill_missing_dates(x, date_col):
    min_date, max_date = x[date_col].min(), x[date_col].max()
    groupby_day = x.groupby(pd.PeriodIndex(x[date_col], freq='D'))
    results = groupby_day.sum(min_count=1)

    idx = pd.period_range(min_date, max_date)
    results = results.reindex(idx, fill_value=np.nan)

    results.index.rename(date_col, inplace=True)

    return results


def calc_preag_fill(data, group_col, date_col, target_cols, preagg_method):
    ## calc preaggregation
    data_preag = data.groupby(group_col).agg(
        preagg_method)[target_cols].reset_index()

    ## fill missing dates
    data_preag_filled = data_preag.groupby(group_col[:-1]).apply(
        fill_missing_dates, date_col=date_col).drop(group_col[:-1],
                                                    axis=1).reset_index()

    ## return DataFrame with calculated preaggregation and filled missing dates
    return data_preag_filled


def calc_rolling(data_preag_filled, group_col, date_col, method, w):

    ## calc rolling stats
    lf_df_filled = data_preag_filled.groupby(group_col[:-1]).\
        apply(lambda x: x.set_index(date_col).rolling(window=w, min_periods=1).agg(method)).drop(group_col[:-1], axis=1)

    ## return DataFrame with rolled columns from target_vars
    return lf_df_filled


def calc_ewm(data_preag_filled, group_col, date_col, span):
    ## calc ewm stats
    lf_df_filled = data_preag_filled.groupby(group_col[:-1]).\
        apply(lambda x: x.set_index(date_col).ewm(span=span).mean()).drop(group_col[:-1], axis=1)

    ## return DataFrame with rolled columns from target_vars
    return lf_df_filled


def shift(lf_df_filled, group_col, date_col, lag):

    lf_df = lf_df_filled.groupby(
        level=group_col[:-1]).apply(lambda x: x.shift(lag)).reset_index()
    lf_df[date_col] = pd.to_datetime(lf_df[date_col].astype(str))

    ## return DataFrame with following columns: filter_col, id_cols, date_col and shifted stats
    return lf_df

In [415]:
from ipywidgets import IntProgress

def generate_lagged_features(
        data: pd.DataFrame,
        target_cols: list = ['ln_demand'],
        id_cols: list = ['location_id', 'product_id'],
        date_col: str = 'Date',
        lags: list = [14, 28],
        windows: list = ['28D', '56D'],
        preagg_methods: list = ['mean'],
        agg_methods: list = ['mean', 'median'],
        dynamic_filters: list = ['PROMO1_FLAG'],
        ewm_params: dict = {'PROMO1_FLAG': [14, 42]}) -> pd.DataFrame:
    

    data = data.sort_values(date_col)
    out_df = deepcopy(data)
    dates = [min(data[date_col]), max(data[date_col])]

    total = len(target_cols) * len(lags) * len(windows) * len(preagg_methods) * len(agg_methods) * len(dynamic_filters)
    progress = IntProgress(min=0, max=total)
    display(progress)

    for filter_col in dynamic_filters:
        group_col = group_col = [filter_col] + id_cols + [date_col]
        for lag in lags:
            for preagg in preagg_methods:
                data_preag_filled = calc_preag_fill(data, group_col, date_col,
                                                    target_cols, preagg)

                ## add ewm features
                for alpha in ewm_params.get(filter_col, []):
                    ewm_filled = calc_ewm(data_preag_filled, group_col,
                                          date_col, alpha)
                    ewm = shift(ewm_filled, group_col, date_col, lag)
                    new_names = {x: "{0}_lag{1}d_alpha{2}_key{3}_preag{4}_{5}_dynamic_ewm".\
                        format(x, lag, alpha, '_'.join(id_cols), preagg, filter_col) for x in target_cols}

                    out_df = pd.merge(out_df,
                                      ewm.rename(columns=new_names),
                                      how='left',
                                      on=group_col)

                for w in windows:
                    for method in agg_methods:
                        rolling_filled = calc_rolling(data_preag_filled,
                                                      group_col, date_col,
                                                      method, w)

                        ## lf_df - DataFrame with following columns: filter_col, id_cols, date_col, shifted rolling stats
                        rolling = shift(rolling_filled, group_col, date_col,
                                        lag)

                        method_name = method.__name__ if type(
                            method) != str else method

                        new_names = {x: "{0}_lag{1}d_w{2}_key{3}_preag{4}_ag{5}_{6}_dynamic_rolling".\
                                     format(x, lag, w, '_'.join(id_cols), preagg, method_name, filter_col) for x in target_cols}

                        out_df = pd.merge(out_df,
                                          rolling.rename(columns=new_names),
                                          how='left',
                                          on=group_col)
                        progress.value += 1

    return out_df

In [416]:
target_cols = ['ln_demand']
id_cols = ['location_id', 'product_id']
date_col = 'Date'
built_in_funcs = [pd.Series.kurtosis, pd.Series.skew]


data_lagged_features = generate_lagged_features(data 
                    , target_cols = target_cols
                    , id_cols = id_cols
                    , date_col = date_col
                    , lags = [14, 28]
                    , windows = ['28D', '56D']
                    , preagg_methods = ['mean'] # ['mean', 'count']
                    , agg_methods = ['mean', 'median']
                    , dynamic_filters = ['PROMO1_FLAG']
                    , ewm_params={'PROMO1_FLAG': [14, 28]}
                    )

IntProgress(value=0, max=8)

In [444]:
data_lagged_features.to_csv('lags.csv', index=False)

In [418]:
data = data_lagged_features

In [419]:
data.describe()

Unnamed: 0,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id,...,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_alpha14_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_alpha28_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling
count,402902.0,402902.0,249593.0,3402342.0,3402342.0,2182591.0,2147736.0,3402342.0,3402342.0,3404846.0,...,295810.0,295810.0,295810.0,295810.0,280997.0,280997.0,280997.0,280997.0,280997.0,280997.0
mean,866.363049,68773.279194,0.381047,0.408595,0.001626821,2256.149,1488.29,0.1222108,0.7714183,1707302.0,...,0.324765,0.285259,0.324765,0.285259,0.287164,0.295847,0.326991,0.288034,0.326991,0.288034
std,339.571497,27970.451505,0.690858,0.5081324,0.04030105,551.4987,440.8112,0.7633092,0.4199192,983338.7,...,0.304813,0.325967,0.304813,0.325967,0.33605,0.331721,0.305411,0.32678,0.305411,0.32678
min,309.0,23252.0,0.0,0.0,0.0,495.0,0.99,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,557.0,37466.0,0.033175,0.0,0.0,1999.0,1181.25,0.0,1.0,856364.2,...,0.082926,0.036558,0.082926,0.036558,0.035612,0.039985,0.081741,0.035409,0.081741,0.035409
50%,798.0,76559.0,0.090909,0.0,0.0,2399.0,1434.414,0.0,1.0,1707576.0,...,0.217289,0.107102,0.217289,0.107102,0.103797,0.119519,0.220189,0.107102,0.220189,0.107102
75%,1191.0,88915.0,0.466667,1.0,0.0,2600.0,1762.5,0.0,1.0,2558787.0,...,0.531771,0.600774,0.531771,0.600774,0.693143,0.691032,0.552473,0.670921,0.552473,0.670921
max,1380.0,149517.0,27.0,3.0,1.0,5699.0,5449.0,11.0,1.0,3409998.0,...,2.859743,2.944439,2.859743,2.944439,2.889712,2.853304,2.859743,2.944439,2.859743,2.944439


In [420]:
data.drop('Date', axis=1, inplace=True)

In [421]:
data['demand'] = data['demand'].fillna(-10)
data = data.fillna(0)
sc = MinMaxScaler()
sc.fit(data.iloc[:, 3:9])
data.iloc[:, 3:9] = sc.transform(data.iloc[:, 3:9])
le = LabelEncoder()
le.fit(data['location_id'])
data['location_id'] = le.transform(data['location_id'])
le.fit(data['product_id'])
data['product_id'] = le.transform(data['product_id'])
data_test = data
data = data.loc[data['demand'] > -1]

In [422]:
data.describe()

Unnamed: 0,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id,...,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_alpha14_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_alpha28_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling
count,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,...,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0,249593.0
mean,20.458623,879.263136,0.381047,0.130432,0.002921,0.378714,0.265231,0.013187,0.816886,210412.858662,...,0.219742,0.190324,0.219742,0.190324,0.170426,0.177056,0.203737,0.177275,0.203737,0.177275
std,10.811404,574.855122,0.690858,0.170588,0.053965,0.104598,0.08084,0.074804,0.386761,124076.601177,...,0.274107,0.282472,0.274107,0.282472,0.277595,0.277262,0.269857,0.276525,0.269857,0.276525
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,384.0,0.033175,0.0,0.0,0.333216,0.22004,0.0,1.0,95106.0,...,0.017428,0.018996,0.017428,0.018996,0.0,0.0,0.0,0.0,0.0,0.0
50%,20.0,894.0,0.090909,0.0,0.0,0.410598,0.262323,0.0,1.0,208523.0,...,0.108814,0.051294,0.108814,0.051294,0.040607,0.046069,0.087358,0.039553,0.087358,0.039553
75%,29.0,1306.0,0.466667,0.333333,0.0,0.456045,0.316434,0.0,1.0,324387.0,...,0.334125,0.24686,0.334125,0.24686,0.182322,0.202127,0.301849,0.207151,0.301849,0.207151
max,41.0,2633.0,27.0,1.0,1.0,1.0,0.7706,0.727273,1.0,407315.0,...,2.859743,2.944439,2.859743,2.944439,2.889712,2.853304,2.859743,2.944439,2.859743,2.944439


In [423]:
#loc_list = data['location_id'].value_counts().tolist()
#for i in loc_list:
 #   data.loc[data['location_id'] == i, ['PROMO2_FLAG']] = data.loc[data['location_id'] == i, ['PROMO2_FLAG']].fillna(method="ffill")
  #  data.loc[data['location_id'] == i, ['PROMO1_FLAG']] = data.loc[data['location_id'] == i, ['PROMO1_FLAG']].fillna(method="ffill")

In [424]:
data.drop('demand', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [425]:
test_id_list = X_test['id'].tolist()

In [433]:
data.loc[60000:,:]

Unnamed: 0,location_id,product_id,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id,monthday,...,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_alpha14_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_alpha28_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling
60000,27,581,0.000000,0.0,0.368310,0.288906,0.0,0.0,347723,30,...,0.150823,0.150823,0.150823,0.150823,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60002,24,448,0.333333,0.0,0.170907,0.178748,0.0,1.0,336021,30,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60003,20,681,0.333333,0.0,0.157922,0.165168,0.0,1.0,349930,30,...,0.192362,0.157325,0.192362,0.157325,0.226002,0.241262,0.258431,0.258431,0.258431,0.258431
60010,12,339,0.000000,0.0,0.307071,0.240870,0.0,0.0,28751,30,...,0.058350,0.058350,0.058350,0.058350,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60011,23,681,0.333333,0.0,0.157922,0.165168,0.0,1.0,349973,30,...,0.143997,0.096916,0.143997,0.096916,0.190765,0.211271,0.234342,0.234342,0.234342,0.234342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755503,24,1962,0.333333,0.0,0.385857,0.282492,0.0,1.0,230668,24,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1755587,3,1786,0.333333,0.0,0.394631,0.247642,0.0,1.0,214891,24,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1755653,20,1784,0.333333,0.0,0.394631,0.247642,0.0,1.0,214398,24,...,0.645518,0.693147,0.645518,0.693147,0.416421,0.517556,0.645518,0.693147,0.645518,0.693147
1755661,6,1784,0.333333,0.0,0.394631,0.247642,0.0,1.0,214384,24,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [434]:
data = data.loc[60000:,:]

In [435]:
def CV(X_train, y_train, number_folds, model, metrics, kwargs={}):

    k = int(np.floor(float(X_train.shape[0]) / number_folds))

    errors = np.zeros(number_folds-1)
 
    for i in range(2, number_folds + 1):
        print('')
        split = float(i-1)/i

        X = X_train[:(k*i)]
        y = y_train[:(k*i)]

        index = int(np.floor(X.shape[0] * split))
       
        X_trainFolds = X[:index]        
        y_trainFolds = y[:index]

        X_testFold = X[(index + 1):]
        y_testFold = y[(index + 1):]

        model.fit(X_trainFolds, y_trainFolds, **kwargs, eval_set=[(X_testFold, y_testFold)])
        errors[i-2] = metrics(model.predict(X_testFold), y_testFold)
    
    return errors.mean()

In [436]:
#def wape(y_pred, y_true):
    #res = np.sum(np.abs(y_true - y_pred)) / np.sum(y_true) * 100
    #return res

In [437]:
X_train_1 = data.drop(['id','ln_demand'], axis=1) 
y_train_1 = data.ln_demand

In [438]:
lgbm = lgb.LGBMRegressor(
                        n_estimators=350,
                        learning_rate=0.1,
                        feature_fraction=0.7,
                        subsample=0.4,
                        num_leaves=40)    

kwargs={'early_stopping_rounds':10,'verbose':2}
CV(X_train_1, y_train_1, 15, lgbm, mean_absolute_error, kwargs)


Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.0519414
[4]	valid_0's l2: 0.0449348
[6]	valid_0's l2: 0.0403374
[8]	valid_0's l2: 0.0371246
[10]	valid_0's l2: 0.0349386
[12]	valid_0's l2: 0.0336068
[14]	valid_0's l2: 0.0325952
[16]	valid_0's l2: 0.0320147
[18]	valid_0's l2: 0.0317346
[20]	valid_0's l2: 0.0317985
[22]	valid_0's l2: 0.0321951
[24]	valid_0's l2: 0.0322782
[26]	valid_0's l2: 0.0323147
[28]	valid_0's l2: 0.0321461
Early stopping, best iteration is:
[19]	valid_0's l2: 0.0316813

Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.044113
[4]	valid_0's l2: 0.0389098
[6]	valid_0's l2: 0.0356656
[8]	valid_0's l2: 0.0338589
[10]	valid_0's l2: 0.0326036
[12]	valid_0's l2: 0.0318883
[14]	valid_0's l2: 0.0313793
[16]	valid_0's l2: 0.0311552
[18]	valid_0's l2: 0.0308581
[20]	valid_0's l2: 0.0306987
[22]	valid_0's l2: 0.0305375
[24]	valid_0's l2: 0.030271
[26]	valid_0's l2: 0.0303894
[28]	valid_0's l2: 0.0303725
[30]	valid

[50]	valid_0's l2: 0.0475468
[52]	valid_0's l2: 0.047518
[54]	valid_0's l2: 0.0474198
[56]	valid_0's l2: 0.0473528
[58]	valid_0's l2: 0.0473233
[60]	valid_0's l2: 0.0472984
[62]	valid_0's l2: 0.0472059
[64]	valid_0's l2: 0.0471511
[66]	valid_0's l2: 0.0473664
[68]	valid_0's l2: 0.04717
[70]	valid_0's l2: 0.0469253
[72]	valid_0's l2: 0.0468508
[74]	valid_0's l2: 0.0467392
[76]	valid_0's l2: 0.0467671
[78]	valid_0's l2: 0.0468637
[80]	valid_0's l2: 0.0468398
[82]	valid_0's l2: 0.0468046
[84]	valid_0's l2: 0.0467458
Early stopping, best iteration is:
[75]	valid_0's l2: 0.0467386

Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.238805
[4]	valid_0's l2: 0.210489
[6]	valid_0's l2: 0.1918
[8]	valid_0's l2: 0.176405
[10]	valid_0's l2: 0.166301
[12]	valid_0's l2: 0.158922
[14]	valid_0's l2: 0.152033
[16]	valid_0's l2: 0.14309
[18]	valid_0's l2: 0.136471
[20]	valid_0's l2: 0.133064
[22]	valid_0's l2: 0.130805
[24]	valid_0's l2: 0.128677
[26]	valid_0's l2: 0.12769

[100]	valid_0's l2: 0.0369514
[102]	valid_0's l2: 0.0368308
[104]	valid_0's l2: 0.0368306
[106]	valid_0's l2: 0.036747
[108]	valid_0's l2: 0.0367326
[110]	valid_0's l2: 0.0366919
[112]	valid_0's l2: 0.0366247
[114]	valid_0's l2: 0.0366534
[116]	valid_0's l2: 0.0366353
[118]	valid_0's l2: 0.036604
[120]	valid_0's l2: 0.036607
[122]	valid_0's l2: 0.0365725
[124]	valid_0's l2: 0.0365856
[126]	valid_0's l2: 0.0366159
[128]	valid_0's l2: 0.0365635
[130]	valid_0's l2: 0.0365431
[132]	valid_0's l2: 0.0365353
[134]	valid_0's l2: 0.0365278
[136]	valid_0's l2: 0.0365075
[138]	valid_0's l2: 0.0364888
[140]	valid_0's l2: 0.0365221
[142]	valid_0's l2: 0.036506
[144]	valid_0's l2: 0.0364629
[146]	valid_0's l2: 0.0364817
[148]	valid_0's l2: 0.0364354
[150]	valid_0's l2: 0.0364245
[152]	valid_0's l2: 0.036382
[154]	valid_0's l2: 0.0362789
[156]	valid_0's l2: 0.0363135
[158]	valid_0's l2: 0.0362974
[160]	valid_0's l2: 0.0362623
[162]	valid_0's l2: 0.0362724
[164]	valid_0's l2: 0.0362333
[166]	valid_0's

[104]	valid_0's l2: 0.0833134
[106]	valid_0's l2: 0.0833149
[108]	valid_0's l2: 0.0832286
Early stopping, best iteration is:
[98]	valid_0's l2: 0.0827352


0.14725815833724612

In [439]:
data_test = data_test[data_test['id'].isin(test_id_list)]

In [440]:
data_test.describe()

Unnamed: 0,location_id,product_id,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG,id,...,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag14d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_alpha14_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_alpha28_keylocation_id_product_id_preagmean_PROMO1_FLAG_dynamic_ewm,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w28D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmean_PROMO1_FLAG_dynamic_rolling,ln_demand_lag28d_w56D_keylocation_id_product_id_preagmean_agmedian_PROMO1_FLAG_dynamic_rolling
count,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,...,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0,10169.0
mean,24.399646,2060.097945,-10.0,0.149703,0.0,0.241706,0.146276,0.013597,0.888976,243095.325991,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,12.22393,654.938097,0.0,0.170552,0.0,0.231293,0.160946,0.077478,0.314177,79756.245214,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2.0,7.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,601.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,1613.0,-10.0,0.0,0.0,0.0,0.0,0.0,1.0,213092.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,2359.0,-10.0,0.0,0.0,0.289349,0.121068,0.0,1.0,260643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,36.0,2556.0,-10.0,0.333333,0.0,0.473241,0.269664,0.0,1.0,268235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,41.0,2821.0,-10.0,0.666667,0.0,0.956133,1.0,0.818182,1.0,408054.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [441]:
final_pred = lgbm.predict(X = data_test.drop(['id','demand', 'ln_demand'], axis=1))

In [194]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) #просто
sub.head()

Unnamed: 0,id,demand
0,601,0.406425
1,697,0.339568
2,831,0.388904
3,875,0.887916
4,975,0.448378


In [213]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) # с кросс-валидацией
sub.head()

Unnamed: 0,id,demand
0,601,0.395669
1,697,0.379509
2,831,0.357557
3,875,0.801525
4,975,0.518862


In [223]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) # с новой метрикой
sub.head()

Unnamed: 0,id,demand
0,601,0.404764
1,697,0.396917
2,831,0.365454
3,875,0.598964
4,975,0.462538


In [326]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) #масштабирование признаков
sub.head()

Unnamed: 0,id,demand
0,601,0.366616
1,697,0.363527
2,831,0.34125
3,875,0.757578
4,975,0.454403


In [327]:
sub.to_csv('lt_sample_submission_1.csv', index=False)

In [382]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) #масштабирование признаков + логарифмы спроса
sub.head()

Unnamed: 0,id,demand
0,601,0.399672
1,697,0.375778
2,831,0.363467
3,875,0.602788
4,975,0.379433


In [383]:
sub['demand'] = np.exp(sub['demand']) - 1
sub.head()

Unnamed: 0,id,demand
0,601,0.491335
1,697,0.456124
2,831,0.438308
3,875,0.827205
4,975,0.461456


In [384]:
sub.to_csv('lt_sample_submission_2.csv', index=False)

In [442]:
sub = pd.DataFrame({'id': test_id_list, 'demand': final_pred}) #масштабирование признаков + логарифмы спроса + генерация фичей
sub['demand'] = np.exp(sub['demand']) - 1
sub.head()

Unnamed: 0,id,demand
0,601,1.25242
1,697,0.420964
2,831,1.03775
3,875,0.832711
4,975,1.242437


In [443]:
sub.to_csv('lt_sample_submission_3.csv', index=False)