In [59]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import holidays
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

In [2]:
os.chdir('..')

In [3]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

In [45]:
product_sales = train.groupby(['date', 'product']).sum().reset_index()

In [46]:
product_ratio_daily = product_sales.pivot(index="date", columns="product", values="num_sold")
product_ratio_daily = product_ratio_daily.apply(lambda x: x/x.sum(),axis=1)
product_ratio_daily = product_ratio_daily.stack().rename("ratios").reset_index()
product_ratio_daily = product_ratio_daily.assign(date= lambda df_ : pd.to_datetime(df_.date))

# Preprocessing

In [121]:
ext_train = train.groupby(["date"])["num_sold"].sum().reset_index()

In [122]:
ext_train = ext_train.assign(date= lambda df_ : pd.to_datetime(df_.date)
            ,day_month = lambda df_ : df_.date.dt.day
            ,month = lambda df_ : df_.date.dt.month
            ,year = lambda df_ : df_.date.dt.year
            ,day_week = lambda df_ : df_.date.dt.dayofweek
            ,day_year = lambda df_ : df_.date.dt.dayofyear
            ,weekyear = lambda df_ : df_.date.dt.week
            ,is_weekend = lambda df_ : np.where(df_.day_week.isin([5,6]),1,0)
)

  ,weekyear = lambda df_ : df_.date.dt.week


In [123]:
ext_test = test.groupby(["date"])["store"].sum().reset_index().drop('store',axis=1)

In [124]:
all_test_dates  = ext_test.copy()

In [125]:
ext_test = ext_test.assign(date= lambda df_ : pd.to_datetime(df_.date)
            ,day_month = lambda df_ : df_.date.dt.day
            ,month = lambda df_ : df_.date.dt.month
            ,year = lambda df_ : df_.date.dt.year
            ,day_week = lambda df_ : df_.date.dt.dayofweek
            ,day_year = lambda df_ : df_.date.dt.dayofyear
            ,weekyear = lambda df_ : df_.date.dt.week
            ,is_weekend = lambda df_ : np.where(df_.day_week.isin([5,6]),1,0)
)

  ,weekyear = lambda df_ : df_.date.dt.week


In [126]:
ext_train = ext_train[~((ext_train['year']== 2020) & (ext_train['month'].isin([3,4,5])))]

In [127]:
ext_train["day_year"] = ext_train.apply(lambda x: x["day_year"]-1 if (x["date"] > pd.Timestamp("2020-02-29") and x["date"] < pd.Timestamp("2021-01-01"))  else x["day_year"], axis=1)

In [128]:
daily_sales_extended = ext_train[['day_year','num_sold']]
daily_sales_extended = daily_sales_extended.groupby('day_year').sum().sort_values(by='num_sold',ascending=False).reset_index()

In [129]:
important_days = daily_sales_extended['day_year'][:50].to_list()
print(important_days,end='')

[363, 364, 365, 362, 361, 1, 2, 5, 6, 3, 4, 360, 7, 12, 8, 308, 13, 231, 54, 309, 19, 47, 40, 26, 33, 11, 41, 48, 55, 321, 49, 34, 27, 230, 307, 56, 28, 20, 21, 349, 319, 10, 347, 35, 42, 14, 311, 310, 312, 343]

In [130]:
ext_train['important_day'] = ext_train["day_year"].apply(lambda x: x if x in important_days else 0)
ext_test['important_day'] = ext_test["day_year"].apply(lambda x: x if x in important_days else 0)

In [131]:
ext_test['weekyear'] = np.where(ext_test['weekyear']==53, 1, ext_test['weekyear'])

In [132]:
ext_train["month_sin"] = np.sin(ext_train['month'] * (2 * np.pi / 12))
ext_test["month_sin"] = np.sin(ext_test['month'] * (2 * np.pi / 12))

In [133]:
ext_train["month_cos"] = np.cos(ext_train['month'] * (2 * np.pi / 12))
ext_test["month_cos"] = np.cos(ext_test['month'] * (2 * np.pi / 12))

In [135]:
ext_train = pd.get_dummies(ext_train, columns = ["important_day","day_week"], drop_first=True)
ext_test = pd.get_dummies(ext_test, columns = ["important_day","day_week"], drop_first=True)

In [136]:
ext_train.head()

Unnamed: 0,date,num_sold,day_month,month,year,day_year,weekyear,is_weekend,month_sin,month_cos,...,important_day_362,important_day_363,important_day_364,important_day_365,day_week_1,day_week_2,day_week_3,day_week_4,day_week_5,day_week_6
0,2017-01-01,15352,1,1,2017,1,52,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,1
1,2017-01-02,11578,2,1,2017,2,1,0,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-03,10884,3,1,2017,3,1,0,0.5,0.866025,...,0,0,0,0,1,0,0,0,0,0
3,2017-01-04,10445,4,1,2017,4,1,0,0.5,0.866025,...,0,0,0,0,0,1,0,0,0,0
4,2017-01-05,9795,5,1,2017,5,1,0,0.5,0.866025,...,0,0,0,0,0,0,1,0,0,0


In [152]:
SHIFT_DAY = 365

In [163]:
data = ext_train.append(ext_test.assign(test=1))

In [164]:
LAG_DAYS = [col for col in range(SHIFT_DAY,SHIFT_DAY+15)]
data = data.assign(**{
        '{}_lag_{}'.format(col, l): ext_train[col].transform(lambda x: x.shift(l))
        for l in LAG_DAYS
        for col in ['num_sold']
    })

In [165]:
print('Create rolling aggs')

for i in [7,14,30,60,180]:
    print('Rolling period:', i)
    data['rolling_mean_'+str(i)] = data['num_sold'].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
    data['rolling_std_'+str(i)]  = data['num_sold'].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)

Create rolling aggs
Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180


In [172]:
ext_train = data[data.test.isna()].drop('test', axis=1)
ext_test = data[~data.test.isna()].drop(['test','num_sold'], axis=1)

# Model

In [174]:
y = ext_train['num_sold']
X = ext_train.drop(['date','month','num_sold','weekyear'],axis=1)
X_test = ext_test.drop(['date','month', 'weekyear'],axis=1)

In [175]:
def lgbm_objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2,random_state=42)

        
    param = {'metric': 'mape', 
        'random_state': 48,
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02, 0.1, 0.04]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100),

    }
    model = LGBMRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [176]:
study = optuna.create_study(direction='minimize')
study.optimize(lgbm_objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-09-26 13:30:48,511][0m A new study created in memory with name: no-name-28ae0fe0-ac2d-48d0-8fdc-0ca3dcf8d254[0m
[32m[I 2022-09-26 13:30:48,973][0m Trial 0 finished with value: 464.09924340141635 and parameters: {'n_estimators': 572, 'reg_alpha': 0.18728157851612298, 'reg_lambda': 1.7710846380130594, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.04, 'max_depth': 10, 'num_leaves': 684, 'min_child_samples': 44, 'min_data_per_groups': 44}. Best is trial 0 with value: 464.09924340141635.[0m
[32m[I 2022-09-26 13:30:49,073][0m Trial 1 finished with value: 721.8768488879708 and parameters: {'n_estimators': 342, 'reg_alpha': 0.15751784242540715, 'reg_lambda': 0.05406062491790769, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.04, 'max_depth': 20, 'num_leaves': 200, 'min_child_samples': 162, 'min_data_per_groups': 73}. Best is trial 0 with value: 464.09924340141635.[0m
[32m[I 2022-09-26 13:30:49,201][0m Trial 2 finished with value: 817.716907

[32m[I 2022-09-26 13:30:51,262][0m Trial 8 finished with value: 957.6599383695046 and parameters: {'n_estimators': 226, 'reg_alpha': 0.08020411722858234, 'reg_lambda': 2.9503989235628927, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.008, 'max_depth': 10, 'num_leaves': 626, 'min_child_samples': 112, 'min_data_per_groups': 11}. Best is trial 0 with value: 464.09924340141635.[0m
[32m[I 2022-09-26 13:30:51,328][0m Trial 9 finished with value: 1063.2872918110274 and parameters: {'n_estimators': 185, 'reg_alpha': 5.965717218159362, 'reg_lambda': 0.004503768056630387, 'colsample_bytree': 0.4, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 10, 'num_leaves': 814, 'min_child_samples': 207, 'min_data_per_groups': 75}. Best is trial 0 with value: 464.09924340141635.[0m
[32m[I 2022-09-26 13:31:00,515][0m Trial 10 finished with value: 454.5583532986381 and parameters: {'n_estimators': 518, 'reg_alpha': 0.013184119468932057, 'reg_lambda': 0.33787993469695027, 'colsamp

[32m[I 2022-09-26 13:31:13,405][0m Trial 15 finished with value: 772.2894973297789 and parameters: {'n_estimators': 429, 'reg_alpha': 0.004297388725549927, 'reg_lambda': 0.6993816263546395, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 711, 'min_child_samples': 82, 'min_data_per_groups': 61}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:13,627][0m Trial 16 finished with value: 747.7566731481047 and parameters: {'n_estimators': 993, 'reg_alpha': 0.027359103035067632, 'reg_lambda': 0.021072466413844537, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.1, 'max_depth': 100, 'num_leaves': 897, 'min_child_samples': 285, 'min_data_per_groups': 58}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:13,823][0m Trial 17 finished with value: 721.1371716212209 and parameters: {'n_estimators': 606, 'reg_alpha': 0.8037553657667592, 'reg_lambda': 0.5614004615681534, 'colsample

[32m[I 2022-09-26 13:31:25,692][0m Trial 23 finished with value: 641.2264344156977 and parameters: {'n_estimators': 299, 'reg_alpha': 0.002928968290800951, 'reg_lambda': 0.10840901479783509, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 910, 'min_child_samples': 29, 'min_data_per_groups': 83}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:25,941][0m Trial 24 finished with value: 478.93452402588474 and parameters: {'n_estimators': 475, 'reg_alpha': 0.0020832024312802187, 'reg_lambda': 0.05759038341033254, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.1, 'max_depth': 100, 'num_leaves': 771, 'min_child_samples': 71, 'min_data_per_groups': 95}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:26,984][0m Trial 25 finished with value: 443.96274319557205 and parameters: {'n_estimators': 625, 'reg_alpha': 0.047418183155328655, 'reg_lambda': 0.9175253913061582, 'colsa

[32m[I 2022-09-26 13:31:31,422][0m Trial 31 finished with value: 418.55728223493435 and parameters: {'n_estimators': 516, 'reg_alpha': 0.01815930948400053, 'reg_lambda': 0.39754594151749606, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 946, 'min_child_samples': 11, 'min_data_per_groups': 93}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:31,814][0m Trial 32 finished with value: 701.587118411359 and parameters: {'n_estimators': 513, 'reg_alpha': 0.019801322964481675, 'reg_lambda': 0.06091279030979367, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 842, 'min_child_samples': 49, 'min_data_per_groups': 92}. Best is trial 11 with value: 394.340368058443.[0m
[32m[I 2022-09-26 13:31:32,630][0m Trial 33 finished with value: 402.82174744526657 and parameters: {'n_estimators': 474, 'reg_alpha': 0.05248438542287218, 'reg_lambda': 1.1996074229608602, 'colsampl

[32m[I 2022-09-26 13:31:35,411][0m Trial 39 finished with value: 390.5147552745336 and parameters: {'n_estimators': 560, 'reg_alpha': 0.13111909817791995, 'reg_lambda': 0.03369248856922734, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.04, 'max_depth': 100, 'num_leaves': 997, 'min_child_samples': 16, 'min_data_per_groups': 37}. Best is trial 39 with value: 390.5147552745336.[0m
[32m[I 2022-09-26 13:31:35,633][0m Trial 40 finished with value: 675.3069596050834 and parameters: {'n_estimators': 694, 'reg_alpha': 0.10129523123144903, 'reg_lambda': 0.030112451215238395, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.04, 'max_depth': 10, 'num_leaves': 954, 'min_child_samples': 183, 'min_data_per_groups': 33}. Best is trial 39 with value: 390.5147552745336.[0m
[32m[I 2022-09-26 13:31:36,952][0m Trial 41 finished with value: 343.87651423061595 and parameters: {'n_estimators': 557, 'reg_alpha': 0.24238998900098452, 'reg_lambda': 0.009035391610974696, 'cols

[32m[I 2022-09-26 13:31:41,272][0m Trial 47 finished with value: 452.032709301269 and parameters: {'n_estimators': 923, 'reg_alpha': 0.0026035977912061106, 'reg_lambda': 0.014872541857410292, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.04, 'max_depth': 10, 'num_leaves': 874, 'min_child_samples': 34, 'min_data_per_groups': 2}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:41,429][0m Trial 48 finished with value: 951.5557138848964 and parameters: {'n_estimators': 584, 'reg_alpha': 0.003515855974887722, 'reg_lambda': 0.0027137000442037412, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 100, 'num_leaves': 440, 'min_child_samples': 230, 'min_data_per_groups': 43}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:43,399][0m Trial 49 finished with value: 457.6089503198707 and parameters: {'n_estimators': 541, 'reg_alpha': 0.0014611395889117097, 'reg_lambda': 0.007835619222679639

[32m[I 2022-09-26 13:31:48,753][0m Trial 55 finished with value: 554.8098101551441 and parameters: {'n_estimators': 787, 'reg_alpha': 0.002161268015211422, 'reg_lambda': 0.02212290599186143, 'colsample_bytree': 0.5, 'subsample': 0.4, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 696, 'min_child_samples': 33, 'min_data_per_groups': 46}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:48,923][0m Trial 56 finished with value: 643.5107805153217 and parameters: {'n_estimators': 267, 'reg_alpha': 0.0013610709908095683, 'reg_lambda': 0.07763226168071731, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.04, 'max_depth': 100, 'num_leaves': 767, 'min_child_samples': 74, 'min_data_per_groups': 55}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:49,065][0m Trial 57 finished with value: 885.2054665471172 and parameters: {'n_estimators': 162, 'reg_alpha': 1.2697976101200086, 'reg_lambda': 0.04730376654898553, 'co

[32m[I 2022-09-26 13:31:52,605][0m Trial 63 finished with value: 377.65281944941825 and parameters: {'n_estimators': 863, 'reg_alpha': 0.0049649735631183715, 'reg_lambda': 0.0031167409156752516, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 911, 'min_child_samples': 56, 'min_data_per_groups': 33}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:53,208][0m Trial 64 finished with value: 383.1485326770585 and parameters: {'n_estimators': 873, 'reg_alpha': 0.009972682611009991, 'reg_lambda': 0.0029257737556020753, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 898, 'min_child_samples': 60, 'min_data_per_groups': 33}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:53,866][0m Trial 65 finished with value: 385.8718808778111 and parameters: {'n_estimators': 957, 'reg_alpha': 0.009686623884949445, 'reg_lambda': 0.0027543223122773776,

[32m[I 2022-09-26 13:31:56,677][0m Trial 71 finished with value: 376.3901528913657 and parameters: {'n_estimators': 898, 'reg_alpha': 0.0019052076347881362, 'reg_lambda': 0.005364474866860822, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 311, 'min_child_samples': 56, 'min_data_per_groups': 26}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:57,068][0m Trial 72 finished with value: 408.7409054339185 and parameters: {'n_estimators': 899, 'reg_alpha': 0.00203172830135013, 'reg_lambda': 0.001742359705733471, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 343, 'min_child_samples': 94, 'min_data_per_groups': 26}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:31:57,666][0m Trial 73 finished with value: 375.5590178830925 and parameters: {'n_estimators': 970, 'reg_alpha': 0.004181354779616571, 'reg_lambda': 0.0019853993842177857, 'co

[32m[I 2022-09-26 13:32:00,566][0m Trial 79 finished with value: 383.6245938464256 and parameters: {'n_estimators': 833, 'reg_alpha': 0.004078778155114166, 'reg_lambda': 0.00400035739185842, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 413, 'min_child_samples': 48, 'min_data_per_groups': 39}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:01,101][0m Trial 80 finished with value: 422.69553699053415 and parameters: {'n_estimators': 790, 'reg_alpha': 0.007587510714868981, 'reg_lambda': 0.0014134029547939098, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 199, 'min_child_samples': 28, 'min_data_per_groups': 36}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:01,634][0m Trial 81 finished with value: 377.55155080745345 and parameters: {'n_estimators': 877, 'reg_alpha': 0.005020172321960609, 'reg_lambda': 0.005306475988528736, 'c

[32m[I 2022-09-26 13:32:04,515][0m Trial 87 finished with value: 375.85733178223785 and parameters: {'n_estimators': 926, 'reg_alpha': 0.0040928730322059214, 'reg_lambda': 0.007897082882618835, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 482, 'min_child_samples': 86, 'min_data_per_groups': 23}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:04,974][0m Trial 88 finished with value: 564.3678765856617 and parameters: {'n_estimators': 962, 'reg_alpha': 0.001281292829413678, 'reg_lambda': 0.008419567507257333, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.017, 'max_depth': 10, 'num_leaves': 480, 'min_child_samples': 85, 'min_data_per_groups': 23}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:05,260][0m Trial 89 finished with value: 495.25432035677215 and parameters: {'n_estimators': 807, 'reg_alpha': 0.0016683329457422523, 'reg_lambda': 0.003433800704429250

[32m[I 2022-09-26 13:32:08,330][0m Trial 95 finished with value: 382.19820809517853 and parameters: {'n_estimators': 955, 'reg_alpha': 0.021750422138053102, 'reg_lambda': 0.007964312500345674, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 522, 'min_child_samples': 29, 'min_data_per_groups': 31}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:08,978][0m Trial 96 finished with value: 383.478959112964 and parameters: {'n_estimators': 881, 'reg_alpha': 0.002348845081585378, 'reg_lambda': 0.010747612128759887, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'num_leaves': 466, 'min_child_samples': 39, 'min_data_per_groups': 29}. Best is trial 41 with value: 343.87651423061595.[0m
[32m[I 2022-09-26 13:32:09,447][0m Trial 97 finished with value: 476.38416616857916 and parameters: {'n_estimators': 930, 'reg_alpha': 0.007217962629793474, 'reg_lambda': 0.0036769522564039083, 'c

Number of finished trials: 100
Best trial: {'n_estimators': 557, 'reg_alpha': 0.24238998900098452, 'reg_lambda': 0.009035391610974696, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.04, 'max_depth': 100, 'num_leaves': 867, 'min_child_samples': 15, 'min_data_per_groups': 40}


In [178]:
Best_trial=study.best_params   
Best_trial

{'n_estimators': 557,
 'reg_alpha': 0.24238998900098452,
 'reg_lambda': 0.009035391610974696,
 'colsample_bytree': 1.0,
 'subsample': 0.4,
 'learning_rate': 0.04,
 'max_depth': 100,
 'num_leaves': 867,
 'min_child_samples': 15,
 'min_data_per_groups': 40}

In [179]:
model = LGBMRegressor(**Best_trial)
model.fit(X, y)
preds = model.predict(X_test)



In [180]:
all_test_dates['num_sold'] = preds

In [181]:
product_ratio_daily_19 = product_ratio_daily.loc[product_ratio_daily["date"].dt.year == 2019].copy()
product_ratio_daily_19["mm-dd"] = product_ratio_daily_19["date"].dt.strftime('%m-%d')
product_ratio_daily_19 = product_ratio_daily_19.drop(columns="date")

In [182]:
test_df_product_ratio = test.assign(date=pd.to_datetime(test.date)).copy()
test_df_product_ratio['mm-dd'] = test_df_product_ratio['date'].dt.strftime("%m-%d")

In [183]:
test_df_product_ratio = pd.merge(test_df_product_ratio,product_ratio_daily_19,how='left',on=['mm-dd','product'])

In [184]:
sub_df = pd.merge(test,all_test_dates,how='left')
sub_df['ratios'] = test_df_product_ratio['ratios']

In [185]:
store_weights = train.groupby("store")["num_sold"].sum()/train["num_sold"].sum()
store_weights

store
KaggleMart    0.742515
KaggleRama    0.257485
Name: num_sold, dtype: float64

In [186]:
country_balanced = pd.Series(index=sub_df['country'].unique(),data =1/6)
country_balanced

Belgium    0.166667
France     0.166667
Germany    0.166667
Italy      0.166667
Poland     0.166667
Spain      0.166667
dtype: float64

In [187]:
for country in country_balanced.index:
        sub_df.loc[(sub_df["country"] == country), "num_sold"] = sub_df.loc[(sub_df["country"] == country), "num_sold"] *  country_balanced[country]
sub_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,ratios
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,2141.856235,0.279273
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started,2141.856235,0.230832
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book,2141.856235,0.188119
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,2141.856235,0.301776
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,2141.856235,0.279273


In [188]:
for store in store_weights.index:
        sub_df.loc[sub_df["store"] == store, "num_sold"] = sub_df.loc[sub_df["store"] == store, "num_sold"] * store_weights[store]
sub_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,ratios
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,1590.360038,0.279273
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started,1590.360038,0.230832
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book,1590.360038,0.188119
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,1590.360038,0.301776
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,551.496197,0.279273


In [189]:
sub_df['num_sold'] = sub_df['num_sold'] * sub_df['ratios']
sub_df['num_sold'] = sub_df['num_sold'].round()
sub_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,ratios
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,444.0,0.279273
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started,367.0,0.230832
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book,299.0,0.188119
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,480.0,0.301776
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,154.0,0.279273


In [190]:
sample_submission['num_sold'] =sub_df['num_sold']
sample_submission.head()

Unnamed: 0,row_id,num_sold
0,70128,444.0
1,70129,367.0
2,70130,299.0
3,70131,480.0
4,70132,154.0


In [191]:
sample_submission.to_csv('./data/final_submission.csv',index=False)