<a href="https://www.kaggle.com/code/ranjeetshrivastav/tps-sep-22-pycaret?scriptVersionId=104746886" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<h1 align="center" style="font-weight: bold">TPS September 2022</h1>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import plotly.graph_objects as go

In [2]:
train = pd.read_csv(r'../input/tabular-playground-series-sep-2022/train.csv')
train.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [3]:
test = pd.read_csv(r'../input/tabular-playground-series-sep-2022/test.csv')
test.head()

Unnamed: 0,row_id,date,country,store,product
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques


In [4]:
sub = pd.read_csv(r'../input/tabular-playground-series-sep-2022/sample_submission.csv')
sub.head()

Unnamed: 0,row_id,num_sold
0,70128,100
1,70129,100
2,70130,100
3,70131,100
4,70132,100


In [5]:
print(f'train set have {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'test set have {test.shape[0]} rows and {test.shape[1]} columns.') 
print(f'sample_submission set have {sub.shape[0]} rows and {sub.shape[1]} columns.') 

train set have 70128 rows and 6 columns.
test set have 17520 rows and 5 columns.
sample_submission set have 17520 rows and 2 columns.


In [6]:
train.drop('row_id',axis=1, inplace=True)
test.drop('row_id',axis=1, inplace=True)

In [7]:
train.dtypes

date        object
country     object
store       object
product     object
num_sold     int64
dtype: object

In [8]:
train.nunique()

date        1461
country        6
store          2
product        4
num_sold     699
dtype: int64

In [9]:
train.isnull().sum()

date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [10]:
# Convert date to datetime
train.date=pd.to_datetime(train.date)
test.date=pd.to_datetime(test.date)

# drop 29th Feb
train.drop(train[(train.date.dt.month==2) & (train.date.dt.day==29)].index, axis=0, inplace=True)

<h2 align="center" style="font-weight: bold">Data Visualization using Plotly</h2>

In [11]:
a = train.groupby(['country'], as_index=False)['num_sold'].sum()
fig = go.Figure(data=[go.Pie(labels=a.country,values = a.num_sold, hole=.4)])
fig.add_annotation(text='Country',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(colors=['darkorange','blue'], line=dict(color='#000000', width=2)))
fig.update_layout(
    font_family='monospace',
    title=dict(text='Total products sold by Country',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [12]:
b = train.groupby(['store'], as_index=False)['num_sold'].sum()
fig = go.Figure(data=[go.Pie(labels=b.store,values = b.num_sold, hole=.4)])
fig.add_annotation(text='Store',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')
fig.update_traces(hoverinfo='label+percent+value',
                  marker=dict(colors=['darkorange','blue'], line=dict(color='#000000', width=2)))
fig.update_layout(
    font_family='monospace',
    title=dict(text='Total products sold by Store',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [13]:
fig = px.scatter(train, x="date", y="num_sold", color='country')
fig.update_layout(
    font_family='monospace',
    title=dict(text='No. of product sold by country',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

In [14]:
fig = px.scatter(train, x="date", y="num_sold", color='store')
fig.update_layout(
    font_family='monospace',
    title=dict(text='Products sold by store',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='v',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

* KaggleMart consistently sells more products than KaggleRama.
* There are big spikes towards the end of each year.

In [15]:
c = train.groupby(['product','store'], as_index=False)['num_sold'].sum()
fig = px.bar(c, x="store", y="num_sold", color='product', barmode='group')
fig.update_layout(
    font_family='monospace',
    title=dict(text='No. of products sold by store',x=0.47,y=0.98,
               font=dict(color='black',size=20)),
    legend=dict(orientation='h',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))
fig.show()

<h2 align="center" style="font-weight: bold">Data Preprocessing</h2>

In [16]:
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['weekday'] = train['date'].dt.weekday

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday

train.drop('date',axis=1,inplace=True)
test.drop('date',axis=1,inplace=True)

In [17]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

<h2 align="center" style="font-weight: bold">PyCaret</h2>

In [18]:
! pip install --ignore-installed --pre pycaret

Collecting pycaret
  Downloading pycaret-3.0.0rc3-py3-none-any.whl (544 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.5/544.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-plot>=0.3.7
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting markupsafe>=2.0.1
  Downloading MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)
Collecting sktime~=0.11.4
  Downloading sktime-0.11.4-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting schemdraw>=0.14
  Downloading schemdraw-0.15-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.0
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━

In [19]:
from pycaret.regression import setup, compare_models, blend_models, finalize_model, plot_model, predict_model, add_metric

In [20]:
def SMAPE(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [21]:
def pycaret_model(train, target, test, n_select, fold, opt ):
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          fold = fold)
    
    # add SMAPE to PyCaret
    add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better=False)
  
    print('Comparing Models....')
    best = compare_models(sort = opt,n_select=n_select, fold = fold)
    
    print('Blending Models....')
    blended = blend_models(estimator_list= best, fold=fold, optimize=opt)
    
    print('Finallizing Models....')
    final_model = finalize_model(blended)
    print('Done...!!!')
    
    pred = predict_model(final_model, test)
    
    return pred

In [22]:
result = pycaret_model(train,'num_sold',test, 2, 5, 'SMAPE')

Setup Your Data....


Unnamed: 0,Description,Value
0,Session id,4579
1,Target,num_sold
2,Target type,Regression
3,Data shape,"(70080, 17)"
4,Train data shape,"(49055, 17)"
5,Test data shape,"(21025, 17)"
6,Numeric features,16
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


Comparing Models....


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE,TT (Sec)
catboost,CatBoost Regressor,8.8515,178.9451,13.3759,0.9889,0.0585,0.0466,4.6413,9.51
xgboost,Extreme Gradient Boosting,9.2893,193.9071,13.9233,0.9879,0.0636,0.0502,5.0053,3.67
rf,Random Forest Regressor,10.6838,299.6376,17.3084,0.9814,0.0718,0.0549,5.4684,6.31
et,Extra Trees Regressor,11.3562,351.6391,18.748,0.9781,0.0766,0.0579,5.7834,5.952
dt,Decision Tree Regressor,13.8932,516.5704,22.7225,0.9679,0.0944,0.0715,7.11,0.164
gbr,Gradient Boosting Regressor,18.9098,853.1634,29.1997,0.947,0.1779,0.1071,10.9094,2.006
knn,K Neighbors Regressor,37.4791,2555.4231,50.5494,0.8411,0.2849,0.2552,21.988,8.512
huber,Huber Regressor,46.0857,4405.9569,66.164,0.726,0.3896,0.2927,26.8639,0.436
lasso,Lasso Regression,42.1303,3357.7921,57.9394,0.7913,0.4236,0.2954,28.9343,0.072
lar,Least Angle Regression,42.9232,3520.277,59.2555,0.7812,0.4277,0.2996,29.0933,0.07


Processing:   0%|          | 0/86 [00:00<?, ?it/s]

[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.


Blending Models....


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,SMAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,8.9246,181.6831,13.479,0.9889,0.0591,0.047,4.6873
1,8.8592,173.2758,13.1634,0.9893,0.0589,0.0469,4.6863
2,8.7071,174.0604,13.1932,0.9889,0.0581,0.0464,4.6313
3,8.9581,183.8641,13.5596,0.9885,0.0596,0.0474,4.725
4,8.9215,183.7935,13.557,0.9886,0.0581,0.0462,4.5983
Mean,8.8741,179.3354,13.3905,0.9888,0.0588,0.0468,4.6656
Std,0.0894,4.6998,0.1759,0.0003,0.0006,0.0004,0.045


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Finallizing Models....
Done...!!!


In [23]:
sub['num_sold'] = np.round(result.Label).astype(int)
sub.to_csv('sub.csv',index=False)