In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import holidays
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
import optuna

In [2]:
os.chdir('..')

In [3]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

In [4]:
train = train.assign(date=lambda df_ : pd.to_datetime(df_.date))
test = test.assign(date=lambda df_ : pd.to_datetime(df_.date))

# Preprocessing

In [18]:
data = train.append(test)

In [19]:
data = data.assign(day_month = lambda df_ : df_.date.dt.day
            ,month = lambda df_ : df_.date.dt.month
            ,year = lambda df_ : df_.date.dt.year
            ,day_week = lambda df_ : df_.date.dt.dayofweek
            ,day_year = lambda df_ : df_.date.dt.dayofyear)

In [20]:
data.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,day_month,month,year,day_week,day_year
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663.0,1,1,2017,6,1
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615.0,1,1,2017,6,1
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480.0,1,1,2017,6,1
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710.0,1,1,2017,6,1
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240.0,1,1,2017,6,1


In [21]:
enc = OrdinalEncoder()
enc.fit(data[["country","store", "product"]])

OrdinalEncoder()

In [22]:
data[["country","store", "product"]] = enc.transform(data[["country", "store", "product"]])

In [23]:
holidays_mapping = {0.0 : 'BE', 1.0 : 'FR', 2.0 : 'DE', 3.0 : 'IT', 4.0 : 'PL', 5.0 : 'ES'}

In [24]:
data['iso_country'] = data.country.map(holidays_mapping)

In [25]:
data['is_holiday'] = data.apply(lambda x: int(x.date in holidays.country_holidays(x['iso_country'])), axis=1)

# Model

In [None]:
!pip install fbprophet

In [1]:
from fbprophet import Prophet
from fbprophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()

ModuleNotFoundError: No module named 'fbprophet'

# Validation

In [231]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'mse',
    'metric': ['rmse', 'mape'],
    'learning_rate': 0.032,
    'verbose': -1,
    'n_estimators':60,
    
}

In [232]:
y_preds_eval = np.zeros(len(X_eval))

In [233]:
for i in list_products:
    X_train_temp = X_train[(X_train['product'] == i)].drop(columns=['product'])
    y_train_temp = y_train[(X_train['product'] == i)]
    X_eval_temp = X_eval[(X_eval['product'] == i)].drop(columns=['product'])
    y_eval_temp = y_eval[(X_eval['product'] == i)]
    lgb_train = lgb.Dataset(X_train_temp, y_train_temp)
    lgb_eval = lgb.Dataset(X_eval_temp, y_eval_temp)
    gbm = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=50,)
#     X_test_temp = X_test[(X_test['product'] == i)].drop(columns=['product'])
    y_preds_eval[(X_eval['product'] == i)] = gbm.predict(X_eval_temp)    



[50]	training's rmse: 35.459	training's mape: 0.200219	valid_1's rmse: 83.7414	valid_1's mape: 0.216549
[50]	training's rmse: 27.2996	training's mape: 0.191846	valid_1's rmse: 70.0149	valid_1's mape: 0.228251
[50]	training's rmse: 20.621	training's mape: 0.192664	valid_1's rmse: 52.8069	valid_1's mape: 0.226792
[50]	training's rmse: 35.0668	training's mape: 0.183604	valid_1's rmse: 96.0373	valid_1's mape: 0.208132


In [243]:
print(np.sqrt(mean_squared_error(y_preds_eval,y_eval)))

77.10324972569578


In [202]:
for i in list_products:
    for j in list_countries:
        X_train_temp = X_train[(X_train['product'] == i) & (X_train['country'] == j)].drop(columns=['product','country'])
        y_train_temp = y_train[(X_train['product'] == i) & (X_train_final['country'] == j)]
        X_eval_temp = X_eval[(X_eval['product'] == i) & (X_eval['country'] == j)].drop(columns=['product','country'])
        y_eval_temp = y_eval[(X_eval['product'] == i) & (X_eval['country'] == j)]
        lgb_train = lgb.Dataset(X_train_temp, y_train_temp)
        lgb_eval = lgb.Dataset(X_eval_temp, y_eval_temp)
        gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_eval],
                    verbose_eval=50,)
        y_preds_eval[(X_eval['product'] == i) & (X_eval['country'] == j)] = gbm.predict(X_eval_temp)    

[50]	training's rmse: 24.9055	training's mape: 0.0714299	valid_1's rmse: 53.5319	valid_1's mape: 0.200205
[50]	training's rmse: 21.7625	training's mape: 0.0720637	valid_1's rmse: 34.1951	valid_1's mape: 0.0962563




[50]	training's rmse: 24.6643	training's mape: 0.0715712	valid_1's rmse: 55.4198	valid_1's mape: 0.203957
[50]	training's rmse: 17.448	training's mape: 0.0711783	valid_1's rmse: 61.1315	valid_1's mape: 0.158572
[50]	training's rmse: 8.4838	training's mape: 0.07507	valid_1's rmse: 177.604	valid_1's mape: 0.60427
[50]	training's rmse: 15.5106	training's mape: 0.0712483	valid_1's rmse: 82.2287	valid_1's mape: 0.245402
[50]	training's rmse: 19.0082	training's mape: 0.0691241	valid_1's rmse: 35.9659	valid_1's mape: 0.188304
[50]	training's rmse: 16.1881	training's mape: 0.0679093	valid_1's rmse: 26.7943	valid_1's mape: 0.107084
[50]	training's rmse: 19.2672	training's mape: 0.0683328	valid_1's rmse: 35.3032	valid_1's mape: 0.191204
[50]	training's rmse: 14.1558	training's mape: 0.0686732	valid_1's rmse: 49.8289	valid_1's mape: 0.163112
[50]	training's rmse: 6.49451	training's mape: 0.0712804	valid_1's rmse: 144.331	valid_1's mape: 0.602512
[50]	training's rmse: 11.6904	training's mape: 0.06

In [165]:
print(mean_squared_error(y_preds_eval,y_eval))

6675.546473628595


# Create submission

In [240]:
data = {'row_id': np.arange(70128,87648),
        'num_sold': y_preds
        }

In [241]:
final_submission = pd.DataFrame(data)

In [242]:
final_submission.to_csv('./data/final_submission.csv', index=False)