# Importing Libraries

In [None]:
import os 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import plotly
import plotly.express as px
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit ,train_test_split
from datetime import datetime
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so use whichever name this
# installation provides (falling back to the default style if neither exists).
_whitegrid_styles = [
    name
    for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
    if name in plt.style.available
]
plt.style.use(_whitegrid_styles[0] if _whitegrid_styles else "default")

# Figure-level defaults shared by every plot in the notebook.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=17,
    titleweight='bold',
)
# Axes-level defaults: bold labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
import warnings
warnings.filterwarnings('ignore')

# Reading Data

In [None]:
# Load the competition files; train and test use their row_id column as the index.
train = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv",index_col="row_id")
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv",index_col="row_id")
# The sample submission is read without an index column, so row_id stays a regular column.
sample = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv")

# EDA

In [None]:
display(train.head() , test.head())

In [None]:
# DataFrame.info() prints its report and returns None, so call it on its own
# rather than handing the None result to display().
train.info()
display(train.shape)

In [None]:
display(train.isnull().sum())

The train data has 26298 rows and 5 columns with no missing values. All the columns are of object type except the number sold (our target), which is int64.

In [None]:
print('Start Data and End Date')
print(train.date.min() , train.date.max())

Before converting the date values, let's check that all of them follow the same format (ideally one with month, day and a four-digit year) by counting the string lengths.

In [None]:
def len_data_count(column):
    """Tally how many entries of a string column have each character length.

    Returns the ``value_counts`` Series of the per-entry string lengths:
    the index holds a length, the value is how many entries have it.
    """
    lengths = column.str.len()
    return lengths.value_counts()

print(len_data_count(train.date))
print(len_data_count(test.date))

In [None]:
# all are 10 chars long

train['date'] = pd.to_datetime(train['date'])
test['date']  = pd.to_datetime(test['date'])

In [None]:
# Pick the categorical columns by name. The positional slice 1:-1 always drops
# the last column, which for test (no target column) is a feature column, so
# the two summaries did not cover the same columns.
cat_features = ['country' , 'store' , 'product']
display(train[cat_features].nunique() , '' , test[cat_features].nunique())

**Counts(Values) of stores , country and products.**

In [None]:
train.country.value_counts().index
# train.country.value_counts()


In [None]:
plt.figure(figsize=(8,5))
sns.countplot(x='country' , data=train , order=train.country.value_counts().index , palette='flare_r')
plt.title('Total Counts of Country')
plt.show()


A similar plot can be made for the product and store counts; below, the counts are shown as tables instead of plots.

In [None]:
# Frequency table for every object-dtype column of train
categorical_cols = list(train.select_dtypes('object').columns)

for col in categorical_cols:
    display(train[col].value_counts().to_frame())

All Features are Balanced.

**Number of Sales Analysis**

In [None]:
plt.figure(figsize=(20,7))
sns.lineplot(x='date' , y='num_sold' , data=train , err_style=None , linewidth=1)
plt.title('Number of Sales over Time.')
plt.show()

5 peaks are observed. These peaks appear around the 1st of January, presumably because people buy a lot of things around this date.

In [None]:
plt.figure(figsize=(12 , 7))
# Aggregate only the target: a bare .sum() also tries to add up the date and
# text columns, which is meaningless and can raise on recent pandas versions.
sld_cont = train.groupby('country' , as_index=False)['num_sold'].sum()
ax = sns.barplot(x='country' , y='num_sold' , data=sld_cont , palette='rocket_r')
ax.bar_label(ax.containers[0] , fontweight='bold')
ax.set_title('Numbers Sold by Country' , fontweight='bold')
# Render the figure explicitly so the cell does not echo the Text object repr.
plt.show()

In [None]:
# Total units sold per store. Only num_sold is summed; a bare .sum() would also
# try to aggregate the date and text columns.
sld_store = train.groupby('store' , as_index=False)['num_sold'].sum()
sld_store


In [None]:
plt.figure(figsize=(12 , 7))
ax = sns.barplot(x='store' , y='num_sold' , data=sld_store , palette='Blues')
ax.bar_label(ax.containers[0] , fontweight='bold')
ax.set_title('Numbers Sold by Each Store' , fontweight='bold')

In [None]:
# Total units sold per product. Only num_sold is summed; a bare .sum() would
# also try to aggregate the date and text columns.
sld_prod = train.groupby('product' , as_index=False)['num_sold'].sum()
sld_prod

In [None]:
plt.figure(figsize=(14 , 7))
ax = sns.barplot(x='product' , y='num_sold' , data=sld_prod , palette='Reds')
ax.bar_label(ax.containers[0] , fontweight='bold')
ax.set_title('Numbers Sold by Each Product' , fontweight='bold')

In [None]:
# plt.figure(figsize=(16,8))
# sns.heatmap(train.corr() , annot=True , linewidth = 2)
# plt.xticks(fontweight='bold')
# plt.yticks(fontweight='bold')
# plt.show()

In [None]:
train.head()

In [None]:
train['year'] = train.date.dt.year
train['month'] = train.date.dt.month
train['day'] = train.date.dt.day
train.head()

In [None]:
train.drop('date' , axis=1 , inplace=True)
train.head()

**Country Wise Analysis 🚩**

Finland

In [None]:
finland = train[train['country'] == 'Finland']
finland.head()

In [None]:
# Units sold per store in Finland. Restrict the sum to num_sold so the text and
# calendar columns are not aggregated along with it.
finland_store = finland.groupby('store' , as_index=False)['num_sold'].sum()
finland_store.head()

In [None]:
# Units sold per product in Finland. Restrict the sum to num_sold so the text
# and calendar columns are not aggregated along with it.
finland_prod = finland.groupby('product' , as_index=False)['num_sold'].sum()
finland_prod.head()

In [None]:
# Units sold per year in Finland. Restrict the sum to num_sold so the text,
# month and day columns are not aggregated along with it.
finland_year = finland.groupby('year' , as_index=False)['num_sold'].sum()
finland_year.head()

In [None]:
# Side-by-side totals for Finland: per store (left) and per product (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

sns.barplot(x='store' , y='num_sold' , data=finland_store , ax=ax[0] , palette='rocket_r')
sns.barplot(x='product' , y='num_sold' , data=finland_prod , ax=ax[1] , palette='rocket_r')

# Use real axes titles instead of text placed at a hard-coded y value, so the
# heading stays above the bars whatever the totals are.
ax[0].set_title('Store Distribution in Finland' , fontweight='bold' , size=18)
ax[1].set_title('Products Distribution in Finland' , fontweight='bold' , size=18)

for axis in ax:
    # Print each bar's total on top of it.
    axis.bar_label(axis.containers[0] , fontweight='bold')
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(12)
        label.set_fontweight('bold')

plt.show()

In [None]:
# Yearly totals for Finland as bars (left) and as a trend line (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

plt.suptitle('Year trend for Numbers Sold in Finland' , fontweight='bold' , size=16)

sns.barplot(x='year' , y='num_sold' , data=finland_year , ax=ax[0] , palette='rocket_r')
# No hue is mapped on the line plot, so a palette would have no effect there.
sns.lineplot(x='year' , y='num_sold' , data=finland_year , ax=ax[1])
# One tick per year; the default numeric ticks can fall on fractional years.
ax[1].set_xticks(finland_year['year'])

ax[0].bar_label(ax[0].containers[0] , fontweight='bold')

for axis in ax:
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(10)
        label.set_fontweight('bold')

plt.show()



**Norway**

In [None]:
norway = train[train['country'] == 'Norway']
norway.head()

In [None]:
# Norway totals by store, product and year. Only num_sold is summed so the
# text and calendar columns are not aggregated along with it.
norway_store = norway.groupby('store' , as_index=False)['num_sold'].sum()

norway_prod = norway.groupby('product' , as_index=False)['num_sold'].sum()

norway_year = norway.groupby('year' , as_index=False)['num_sold'].sum()


In [None]:
# Side-by-side totals for Norway: per store (left) and per product (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

sns.barplot(x='store' , y='num_sold' , data=norway_store , ax=ax[0] , palette='rocket_r')
sns.barplot(x='product' , y='num_sold' , data=norway_prod , ax=ax[1] , palette='rocket_r')

# Use real axes titles instead of text placed at a hard-coded y value, so the
# heading stays above the bars whatever the totals are.
ax[0].set_title('Store Distribution in Norway' , fontweight='bold' , size=18)
ax[1].set_title('Products Distribution in Norway' , fontweight='bold' , size=18)

for axis in ax:
    # Print each bar's total on top of it.
    axis.bar_label(axis.containers[0] , fontweight='bold')
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(12)
        label.set_fontweight('bold')

plt.show()

In [None]:
# Yearly totals for Norway as bars (left) and as a trend line (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

plt.suptitle('Year trend for Numbers Sold in Norway' , fontweight='bold' , size=16)

sns.barplot(x='year' , y='num_sold' , data=norway_year , ax=ax[0] , palette='rocket_r')
# No hue is mapped on the line plot, so a palette would have no effect there.
sns.lineplot(x='year' , y='num_sold' , data=norway_year , ax=ax[1])
# One tick per year; the default numeric ticks can fall on fractional years.
ax[1].set_xticks(norway_year['year'])

ax[0].bar_label(ax[0].containers[0] , fontweight='bold')

for axis in ax:
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(10)
        label.set_fontweight('bold')

plt.show()



**SWEDEN**

In [None]:
sweden = train[train['country'] == 'Sweden']
sweden.head()

In [None]:
# Sweden totals by store, product and year. Only num_sold is summed so the
# text and calendar columns are not aggregated along with it.
sweden_store = sweden.groupby('store' , as_index=False)['num_sold'].sum()

sweden_prod = sweden.groupby('product' , as_index=False)['num_sold'].sum()

sweden_year = sweden.groupby('year' , as_index=False)['num_sold'].sum()


In [None]:
# Side-by-side totals for Sweden: per store (left) and per product (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

sns.barplot(x='store' , y='num_sold' , data=sweden_store , ax=ax[0] , palette='rocket_r')
sns.barplot(x='product' , y='num_sold' , data=sweden_prod , ax=ax[1] , palette='rocket_r')

# Use real axes titles instead of text placed at a hard-coded y value, so the
# heading stays above the bars whatever the totals are.
ax[0].set_title('Store Distribution in Sweden' , fontweight='bold' , size=18)
ax[1].set_title('Products Distribution in Sweden' , fontweight='bold' , size=18)

for axis in ax:
    # Print each bar's total on top of it.
    axis.bar_label(axis.containers[0] , fontweight='bold')
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(12)
        label.set_fontweight('bold')

plt.show()

In [None]:
# Yearly totals for Sweden as bars (left) and as a trend line (right).
f , ax = plt.subplots(nrows = 1 , ncols=2 , figsize=(18,10))

plt.suptitle('Year trend for Numbers Sold in Sweden' , fontweight='bold' , size=16)

sns.barplot(x='year' , y='num_sold' , data=sweden_year , ax=ax[0] , palette='rocket_r')
# No hue is mapped on the line plot, so a palette would have no effect there.
sns.lineplot(x='year' , y='num_sold' , data=sweden_year , ax=ax[1])
# One tick per year; the default numeric ticks can fall on fractional years.
ax[1].set_xticks(sweden_year['year'])

ax[0].bar_label(ax[0].containers[0] , fontweight='bold')

for axis in ax:
    for label in (axis.get_xticklabels() + axis.get_yticklabels()):
        label.set_fontsize(10)
        label.set_fontweight('bold')

plt.show()



**Year By Year Analysis**

In [None]:
# Units sold per year across all countries. Only num_sold is summed so the
# text, month and day columns are not aggregated along with it.
Year = train.groupby('year' , as_index=False)['num_sold'].sum()
Year.head()

In [None]:
# Pie chart of each year's share of the total units sold.
fig , ax = plt.subplots(figsize=(10,5))

# Same background colour for the axes area and the surrounding figure.
ax.set_facecolor("#F2EDD7FF")
fig.patch.set_facecolor("#F2EDD7FF")

colors=['yellowgreen', 'gold', 'lightskyblue', 'lightcoral','lightpink','teal','green']
ax.pie(
    Year['num_sold'],
    colors=colors,
    labels=Year.year,
    autopct='%1.2f%%',
    shadow=True,
)
ax.set_title("Distribution of sales from 2015 to 2018",fontweight='bold',fontsize=16)
# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
fig.tight_layout()
plt.show()

**Total Number Sold by Year for each country**

In [None]:
# Units sold per country and year. Only num_sold is summed so the text, month
# and day columns are not aggregated along with it.
pt1 = train.groupby(['country' , 'year'] , as_index=False)['num_sold'].sum()
pt1.head()

In [None]:
fig = px.line(pt1 , x='year' , y='num_sold' , title='Total Number Sold by Year in Each Country' , color='country')
fig.show()

# Preprocessing and Modelling

In [None]:
test.info()

In [None]:
test['year'] = test.date.dt.year
test['month'] = test.date.dt.month
test['day'] = test.date.dt.day

test.drop('date' , axis=1 , inplace = True)
test.head()

In [None]:
cols = train.select_dtypes('object').columns.tolist()

for i in cols:
    # Learn the label -> integer mapping once, on train, and reuse it for test
    # so both frames share the same codes. Refitting on test could assign
    # different integers if its set of labels differed; transform() instead
    # raises on a label that train never contained.
    le = LabelEncoder()
    train[i] = le.fit_transform(train[i])
    test[i] = le.transform(test[i])

In [None]:
train.info()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import LabelEncoder

In [None]:
y = train['num_sold']
x = train.drop('num_sold' , axis=1)

x.head()

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=1)

display(x_train.shape , y_train.shape , x_test.shape , y_test.shape)

In [None]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Each point contributes ``|predicted - actual| / ((|actual| + |predicted|) / 2)``;
    the mean of those ratios is multiplied by 100.

    Parameters
    ----------
    actual, predicted : array-like
        Equal-length numeric sequences. They are converted to float arrays and
        compared element by element in positional order.

    Returns
    -------
    float
        The SMAPE score, between 0 and 200.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # A point where both values are zero is a perfect prediction: let it
    # contribute 0 instead of the NaN that 0 / 0 would inject into the mean.
    ratio = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)

    return np.mean(ratio)*100

In [None]:
# Baseline regressors with library-default settings, keyed by the name that
# appears in the printed log.
dic_models = {
    'RandomForestRegressor' : RandomForestRegressor() ,
    'GradientBoosting' : GradientBoostingRegressor() ,
    'LightGBM' : lgb.LGBMRegressor() ,
    'XGradientBoosting' : xgb.XGBRegressor() ,
}
#'CBR' : cb.CatBoostRegressor()

for i , estimator in dic_models.items():
    print(f'Training with {i} model. \n')
    model = estimator.fit(x_train , y_train)

    print(f'Predicting with {i} model. \n')
    pred = model.predict(x_test)

    # Score the hold-out predictions with SMAPE
    print(f"SMAPE of {i} Model is " , smape(y_test , pred))
    print("------------------------------------------------------------------")
    print()

In [None]:
from xgboost import plot_importance

# XGBoost with a large tree budget; training stops once the score on the last
# eval_set entry (the hold-out split) has not improved for 25 rounds.
model = xgb.XGBRegressor(n_estimators=1000)
model.fit(x_train , y_train , eval_set=[(x_train,y_train),(x_test, y_test)],
        early_stopping_rounds=25,
       verbose=False)

pred = model.predict(x_test)

# Name the model explicitly in the message: it previously reused `i`, a loop
# variable left over from another cell, so the label depended on run order.
print("SMAPE of " + "XGBoost (early stopping)" + " Model is ", smape(y_test,pred))
print("------------------------------------------------------------------")
print()

In [None]:
features = x_train.columns
feat_importance = model.feature_importances_
#print(feat_importance)

sns.barplot(y= features , x=feat_importance)

In [None]:
results = model.evals_result()

plt.figure(figsize=(10, 8))
plt.plot(results['validation_0']['rmse'], label='train')
plt.plot(results['validation_1']['rmse'], label='test')
# show the legend
plt.legend()
plt.xlabel('iterations')
plt.ylabel('rmse')
# show the plot
plt.show()

In [None]:
from catboost import CatBoostRegressor
cat = CatBoostRegressor()
cat.fit(x_train,y_train,eval_set=(x_test,y_test) ,early_stopping_rounds=500,verbose=False)

pred = cat.predict(x_test)
 
# Using SMAPE for predicting models
print("SMAPE of " + 'CatBoost' + " Model is ", smape(y_test,pred))
print("------------------------------------------------------------------")
print()    

**Parameter Tuning XGB (Taken best parameters from https://www.kaggle.com/satoshiss/tps-jan-with-xgboost)**

In [None]:
#params ={'lambda': 0.0012338191278124635, 'alpha': 3.284395992431614, 'eta': 0.09886834650237164, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.018, 'n_estimators': 2000, 'max_depth': 5, 'min_child_weight': 3}

import xgboost as xgb
#Can use GridSearch
model = xgb.XGBRegressor(n_estimators = 1500 , alpha=3.28)

model.fit(x_train, y_train,
        eval_set=[(x_train,y_train),(x_test, y_test)],
        early_stopping_rounds=60,
       verbose=False)

pred = model.predict(x_test)
 
# Using SMAPE for predicting models
print("SMAPE of " + 'Tuned XGB Model is ' , smape(y_test,pred))
print("------------------------------------------------------------------")
print()    

In [None]:
# Overlay the tuned model's hold-out predictions on the true values.
rang = range(len(pred))
plt.figure(figsize=(25,7))
plt.plot(rang , pred , label='predicted')
plt.plot(rang , y_test , 'red' , label='actual')
# Label the figure so it can be read on its own.
plt.xlabel('hold-out sample')
plt.ylabel('num_sold')
plt.title('Predicted vs Actual Sales on the Hold-out Set')
plt.legend()
plt.show()

# Submission

In [None]:
test_preds = model.predict(test)

In [None]:
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
submission.num_sold = np.ceil(test_preds) # rounding up
submission

In [None]:
submission.to_csv('submission.csv' , index=False)