In [1]:
%%capture
!pip install pycaret[full]

In [3]:
## The magic four
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

#garbage collection (clear up some RAM)
import gc

#pycaret
from pycaret.regression import *

#cuML
import cudf

%matplotlib inline

In [4]:
#this is an aesthetic choice and just removes the many warnings that some functions and comands produce
#it helps significantly declutter the workbook
import warnings
warnings.filterwarnings('ignore')

In [5]:
train = cudf.read_csv('../input/tabular-playground-series-jan-2022/train.csv', index_col = 'row_id').to_pandas()
test = cudf.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col = 'row_id').to_pandas()

In [6]:
# Credit to https://www.kaggle.com/ranjeetshrivastav/tps-jan-21-base-xgb

train['date'] = pd.to_datetime(train['date'])
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['dayofweek'] = train['date'].dt.dayofweek
train['dayofmonth'] = train['date'].dt.days_in_month
train['dayofyear'] = train['date'].dt.dayofyear
train['weekday'] = train['date'].dt.weekday

test['date'] = pd.to_datetime(test['date'])
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['dayofweek'] = test['date'].dt.dayofweek
test['dayofmonth'] = test['date'].dt.days_in_month
test['dayofyear'] = test['date'].dt.dayofyear
test['weekday'] = test['date'].dt.weekday

train.drop('date', axis = 1, inplace = True)
test.drop('date', axis = 1, inplace = True)

In [7]:
train.dtypes

country       object
store         object
product       object
num_sold       int64
year           int64
month          int64
day            int64
dayofweek      int64
dayofmonth     int64
dayofyear      int64
weekday        int64
dtype: object

In [8]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [9]:
reg = setup(data = train,
            target = 'num_sold',
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            create_clusters = True,
            use_gpu = True,
            silent = True,
            n_jobs = -1)

Unnamed: 0,Description,Value
0,session_id,8500
1,Target,num_sold
2,Original Data,"(26298, 11)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,8
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(18408, 54)"


In [10]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,cuml.linear_model.linear_regression.LinearRegr...,True
lasso,Lasso Regression,cuml.linear_model.lasso.Lasso,True
ridge,Ridge Regression,cuml.linear_model.ridge.Ridge,True
en,Elastic Net,cuml.linear_model.elastic_net.ElasticNet,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [11]:
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better = False)

Name                                                       SMAPE
Display Name                                               SMAPE
Score Function                <function SMAPE at 0x7fe72c587ef0>
Scorer               make_scorer(SMAPE, greater_is_better=False)
Target                                                      pred
Args                                                          {}
Greater is Better                                          False
Custom                                                      True
Name: SMAPE, dtype: object

In [12]:
#compare_models(sort = 'MAPE')

In [13]:
N = 3
top = compare_models(sort = 'SMAPE', n_select = N)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE,TT (Sec)
catboost,CatBoost Regressor,25.3889,2216.047,44.0207,0.9687,0.0891,0.0657,6.5617,2.962
lightgbm,Light Gradient Boosting Machine,26.9633,2341.4157,45.6251,0.9663,0.0941,0.0698,6.9493,1.033
xgboost,Extreme Gradient Boosting,26.9551,2387.314,45.7763,0.9661,0.0938,0.0701,7.0162,0.547
rf,Random Forest Regressor,29.1127,2753.6515,49.9417,0.9597,0.0968,0.0729,7.2843,2.183
et,Extra Trees Regressor,31.4609,3196.9308,54.2069,0.9518,0.1039,0.0783,7.774,10.093
gbr,Gradient Boosting Regressor,33.1467,3150.1448,53.9272,0.9524,0.1158,0.0892,8.9202,1.891
dt,Decision Tree Regressor,37.8414,4567.7,64.3158,0.9327,0.1232,0.0947,9.3625,0.141
knn,K Neighbors Regressor,77.4726,12716.2323,110.0229,0.7955,0.2626,0.2314,21.1513,0.224
huber,Huber Regressor,71.227,13861.9435,112.9531,0.7922,0.5066,0.227,26.5114,0.914
lasso,Lasso Regression,76.4842,12455.7795,109.4768,0.7966,0.5443,0.2732,30.7106,0.107


In [14]:
blend = blend_models(top)
predict_model(blend);

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
0,29.5474,2722.4543,52.1771,0.9659,0.0993,0.0719,7.1563
1,29.8972,2325.888,48.2275,0.9664,0.1062,0.0835,8.1095
2,14.1618,445.7452,21.1127,0.986,0.0558,0.0453,4.4432
3,25.0028,2183.9912,46.7332,0.9677,0.0837,0.06,6.0354
4,36.0496,4388.7771,66.2478,0.9471,0.1236,0.0849,8.7208
5,21.2189,1235.4295,35.1487,0.9719,0.0805,0.0643,6.1821
6,15.5979,496.0662,22.2725,0.9844,0.0632,0.0489,4.9432
7,30.3347,3003.3035,54.8024,0.9662,0.0869,0.0637,6.4121
8,34.673,4035.0386,63.522,0.9544,0.1114,0.0765,7.9234
9,16.1384,524.2737,22.897,0.9864,0.0595,0.0483,4.7815


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
0,Voting Regressor,42.8648,5360.1064,73.2127,0.9367,0.1216,0.0905,9.6024


In [15]:
final_blend = finalize_model(blend)
predict_model(final_blend);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
0,Voting Regressor,20.5943,1070.8094,32.7232,0.9874,0.0597,0.0473,4.7927


In [16]:
#N = 3: 4.7978
#N = 4: 4.5288
#N = 5: 3.6129
#N = 6: 3.9910

In [17]:
#tuned_top = [tune_model(i, optimize = 'MAPE', choose_better = True) for i in top]

In [18]:
#tuned_blend = blend_models(tuned_top)
#predict_model(tuned_blend);

In [19]:
#final_tuned_blend = finalize_model(tuned_blend)
#predict_model(final_tuned_blend);

In [20]:
gc.collect()
unseen_predictions_blend = predict_model(final_blend, data=test)
unseen_predictions_blend.head()

Unnamed: 0_level_0,country,store,product,year,month,day,dayofweek,dayofmonth,dayofyear,weekday,Label
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
26298,Finland,KaggleMart,Kaggle Mug,2019,1,1,1,31,1,1,386.537208
26299,Finland,KaggleMart,Kaggle Hat,2019,1,1,1,31,1,1,590.709176
26300,Finland,KaggleMart,Kaggle Sticker,2019,1,1,1,31,1,1,175.871997
26301,Finland,KaggleRama,Kaggle Mug,2019,1,1,1,31,1,1,669.206591
26302,Finland,KaggleRama,Kaggle Hat,2019,1,1,1,31,1,1,1002.992452


In [21]:
gc.collect()

assert(len(test.index)==len(unseen_predictions_blend))

sub = pd.DataFrame(list(zip(test.index, unseen_predictions_blend.Label)),columns = ['row_id', 'num_sold'])

sub.to_csv('submission.csv', index = False)

print(sub)

      row_id     num_sold
0      26298   386.537208
1      26299   590.709176
2      26300   175.871997
3      26301   669.206591
4      26302  1002.992452
...      ...          ...
6565   32863   839.922878
6566   32864   231.508540
6567   32865   987.710762
6568   32866  1502.964192
6569   32867   424.943613

[6570 rows x 2 columns]
