<a href="https://www.kaggle.com/code/ranjeetshrivastav/tps-sep-22-pycaret?scriptVersionId=104673261" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<h1 align="center" style="font-weight: bold">TPS September 2022</h1>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import plotly.graph_objects as go

In [2]:
train = pd.read_csv(r'../input/tabular-playground-series-sep-2022/train.csv')
train.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [3]:
test = pd.read_csv(r'../input/tabular-playground-series-sep-2022/test.csv')
test.head()

Unnamed: 0,row_id,date,country,store,product
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques


In [4]:
sub = pd.read_csv(r'../input/tabular-playground-series-sep-2022/sample_submission.csv')
sub.head()

Unnamed: 0,row_id,num_sold
0,70128,100
1,70129,100
2,70130,100
3,70131,100
4,70132,100


In [5]:
print(f'train set have {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'test set have {test.shape[0]} rows and {test.shape[1]} columns.') 
print(f'sample_submission set have {sub.shape[0]} rows and {sub.shape[1]} columns.') 

train set have 70128 rows and 6 columns.
test set have 17520 rows and 5 columns.
sample_submission set have 17520 rows and 2 columns.


In [6]:
train.drop('row_id',axis=1, inplace=True)
test.drop('row_id',axis=1, inplace=True)

In [7]:
train.dtypes

date        object
country     object
store       object
product     object
num_sold     int64
dtype: object

In [8]:
train.nunique()

date        1461
country        6
store          2
product        4
num_sold     699
dtype: int64

In [9]:
train.isnull().sum()

date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [10]:
# Convert date to datetime
train.date=pd.to_datetime(train.date)
test.date=pd.to_datetime(test.date)

# drop 29th Feb
train.drop(train[(train.date.dt.month==2) & (train.date.dt.day==29)].index, axis=0, inplace=True)

<h2 align="center" style="font-weight: bold">Data Visualization using Plotly</h2>

In [11]:
fig = go.Figure(data=[go.Pie(labels=train.country, hole=.4)])
fig.add_annotation(text='Country',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(colors=['darkorange','blue'], line=dict(color='#000000', width=2)))
fig.update_layout(
    font_family='monospace',
    title=dict(text='Country',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [12]:
fig = go.Figure(data=[go.Pie(labels=train.store, hole=.4)])
fig.add_annotation(text='Store',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(colors=['darkorange','blue'], line=dict(color='#000000', width=2)))
fig.update_layout(
    font_family='monospace',
    title=dict(text='Store',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [13]:
fig = px.line(train, x="date", y="num_sold", color='store')
fig.update_layout(
    font_family='monospace',
    title=dict(text='No. of product sold by store',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

* KaggleMart consistently sells more products than KaggleRama.
* There are big spikes towards the end of each year.

<h2 align="center" style="font-weight: bold">Data Preprocessing</h2>

In [14]:
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['weekday'] = train['date'].dt.weekday

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday

train.drop('date',axis=1,inplace=True)
test.drop('date',axis=1,inplace=True)

In [15]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

<h2 align="center" style="font-weight: bold">PyCaret</h2>

In [16]:
! pip install --ignore-installed --pre pycaret

Collecting pycaret
  Downloading pycaret-3.0.0rc3-py3-none-any.whl (544 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.5/544.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numba~=0.55.0
  Downloading numba-0.55.2-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting schemdraw>=0.14
  Downloading schemdraw-0.15-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<1.5.0,>=1.3.0
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pmdarima>=1.8.0
  Downloading pmdarima-2.0.1-cp37-cp37m-manylinux_

In [17]:
from pycaret.regression import setup, compare_models, blend_models, finalize_model, plot_model, predict_model, add_metric

In [18]:
def SMAPE(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [19]:
def pycaret_model(train, target, test, n_select, fold, opt ):
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          fold = fold)
    
    # add SMAPE to PyCaret
    add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better=False)
  
    print('Comparing Models....')
    best = compare_models(sort = opt,n_select=n_select, fold = fold)
    
    print('Blending Models....')
    blended = blend_models(estimator_list= best, fold=fold, optimize=opt)
    pred_holdout = predict_model(blended)
    
    print('Finallizing Models....')
    final_model = finalize_model(blended)
    print('Done...!!!')
    
    pred = predict_model(final_model, test)
    
    return pred

In [20]:
result = pycaret_model(train,'num_sold',test, 3, 5, 'SMAPE')

Setup Your Data....


Unnamed: 0,Description,Value
0,Session id,8602
1,Target,num_sold
2,Target type,Regression
3,Data shape,"(70080, 17)"
4,Train data shape,"(49055, 17)"
5,Test data shape,"(21025, 17)"
6,Numeric features,16
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


Comparing Models....


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE,TT (Sec)
catboost,CatBoost Regressor,8.8055,177.5555,13.3244,0.989,0.0583,0.0463,4.6199,6.98
xgboost,Extreme Gradient Boosting,9.2522,193.2278,13.8999,0.988,0.0635,0.05,4.9894,2.632
rf,Random Forest Regressor,10.7145,304.0775,17.4347,0.9811,0.0718,0.0549,5.4738,3.816
et,Extra Trees Regressor,11.4221,364.2983,19.0812,0.9773,0.0772,0.0583,5.8144,3.64
dt,Decision Tree Regressor,13.9624,518.1108,22.7577,0.9678,0.0947,0.0718,7.1356,0.122
gbr,Gradient Boosting Regressor,19.2523,859.6538,29.3175,0.9465,0.1984,0.1118,11.5106,1.478
knn,K Neighbors Regressor,37.4999,2574.4684,50.7388,0.8398,0.284,0.2543,21.9313,5.786
huber,Huber Regressor,45.9405,4350.6346,65.7731,0.7294,0.4136,0.2957,28.0159,0.402
lasso,Lasso Regression,42.1446,3363.4799,57.9949,0.7907,0.4217,0.2955,28.9581,0.058
lar,Least Angle Regression,43.4229,3612.3342,60.0097,0.7753,0.4426,0.3005,29.2343,0.066


Processing:   0%|          | 0/87 [00:00<?, ?it/s]

[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.


Blending Models....


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,9.0182,194.4493,13.9445,0.988,0.058,0.046,4.5862
1,8.9278,186.7439,13.6654,0.9883,0.0581,0.0464,4.6232
2,8.843,180.9747,13.4527,0.9887,0.0581,0.0465,4.6273
3,8.9532,194.9345,13.9619,0.9877,0.0579,0.046,4.5764
4,8.9398,189.8875,13.78,0.9883,0.0574,0.046,4.5803
Mean,8.9364,189.398,13.7609,0.9882,0.0579,0.0462,4.5987
Std,0.0562,5.1847,0.189,0.0003,0.0003,0.0002,0.022


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
0,Voting Regressor,8.9819,187.1784,13.6813,0.9884,0.0571,0.0459,4.5752


Finallizing Models....
Done...!!!


In [21]:
sub['num_sold'] = np.round(result.Label).astype(int)
sub.to_csv('sub.csv',index=False)