<a id=0></a>
## Table of Contents ⏩

* [Basic Overview of Dataset 📺](#1)

* [Exploratory Data Analysis📊 ](#2) 
  * [Country Wise Analysis 🚩](#2.1)
  * [Year by Year Analysis 🕡](#2.2)
  

* [Modelling and Preprocessing ✌](#4)
  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

[Slide to Top](#0)
<a id=1></a>
## Basic Overview of Dataset 📺

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv"),
    pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv"),
)

In [None]:
# Report the (rows, columns) shape of both splits.
print('**** Shape of training dataset **** ', train.shape)
print()
print('**** Shape of test dataset **** ',test.shape)


In [None]:
# The header announces ten rows, so request ten explicitly
# (DataFrame.head() defaults to five).
print("***** First Ten rows of Training Dataset ****")
print()
train.head(10)


In [None]:
# The header announces ten rows, so request ten explicitly
# (DataFrame.head() defaults to five).
print("***** First Ten rows of Test Dataset ****")
print()
test.head(10)


In [None]:
# Header followed by one blank line, then the column index of the training frame.
print("***** Columns of the Dataset *****", end="\n\n")
print(train.columns)

In [None]:
# Header followed by one blank line; the dtypes Series is the cell's displayed value.
print("**** Datatypes present in Dataset *****", end="\n\n")
train.dtypes

In [None]:
# Per-column count of missing values in the training frame.
print("***** Null values is there or not in training dataset? *****", end="\n\n")
print(train.isna().sum())


In [None]:
# Per-column count of missing values in the test frame.
print("***** Null values is there or not in test dataset? *****", end="\n\n")
print(test.isna().sum())


In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
print("***** Basic Description of Dataset *****", end="\n\n")
train.describe(include='all')

In [None]:
# Frequency table for each categorical column, each followed by a blank line.
cols = ['country','store','product']
for column_name in cols:
    print("Value counts of "+column_name+" are :", end="\n\n")
    print(train[column_name].value_counts(), end="\n\n")
    

In [None]:
# Working copies without the identifier column; `train` / `test` stay untouched.
train_df = train.drop(columns=['row_id'])
test_df = test.drop(columns=['row_id'])

[Slide to Top](#0)
<a id=2></a>
## Exploratory Data Analysis 📊

In [None]:
#Extracting more features from 'date' columns
train_df['date']= pd.to_datetime(train_df['date'])
train_dates = train_df['date'].dt

train_df['year'] = train_dates.year
train_df['month'] = train_dates.month
train_df['day'] = train_dates.day
train_df['weekday'] = train_dates.weekday
train_df['week_of_year'] = train_dates.isocalendar().week
train_df['day_of_year'] = train_dates.dayofyear
train_df['quarter'] = train_dates.quarter
# weekday is 0=Monday ... 6=Sunday, so > 4 selects Saturday and Sunday.
# The threshold must match the one used for test_df, otherwise the feature
# means different things at training and prediction time.
train_df['is_weekend'] = train_dates.weekday > 4
train_df.head()

In [None]:
# Derive the calendar features for the test frame from its 'date' column.
test_df['date'] = pd.to_datetime(test_df['date'])
test_dates = test_df['date'].dt

# Columns are added in this exact order.
for feature_name, feature_values in (
    ('year', test_dates.year),
    ('month', test_dates.month),
    ('day', test_dates.day),
    ('weekday', test_dates.weekday),
    ('week_of_year', test_dates.isocalendar().week),
    ('day_of_year', test_dates.dayofyear),
    ('quarter', test_dates.quarter),
    ('is_weekend', test_dates.weekday > 4),  # Saturday (5) and Sunday (6)
):
    test_df[feature_name] = feature_values

test_df.head()

In [None]:
#Holiday Feature Add
import holidays

# Years covered by the lookup.
holiday_years = [2015,2016,2017,2018,2019]

# Collect the holiday dates of all three countries into one list.
# NOTE(review): the flag is the union across countries, so a date that is a
# holiday in any of them marks rows of every country -- confirm this is intended.
days=[]
for country_calendar in (holidays.Norway, holidays.Finland, holidays.Sweden):
    days.extend(country_calendar(years = holiday_years).keys())

# The calendar keys are date objects while the 'date' columns are datetime64.
# Convert explicitly so isin() compares like with like rather than relying on
# an implicit cast.
holiday_dates = pd.to_datetime(days)

train_df['is_holiday']= train_df['date'].isin(holiday_dates)
test_df['is_holiday'] = test_df['date'].isin(holiday_dates)


In [None]:
#Adding GDP of given countries

# GDP figure per (country, year). The values are the ones previously written
# row-by-row: 2015-2018 were applied to the training frame, 2019 to the test frame.
gdp_by_country_year = {
    ("Norway", 2015): 38.58016, ("Finland", 2015): 23.44401, ("Sweden", 2015): 50.51038,
    ("Norway", 2016): 36.88271, ("Finland", 2016): 24.06079, ("Sweden", 2016): 51.56547,
    ("Norway", 2017): 38.8394,  ("Finland", 2017): 25.50165, ("Sweden", 2017): 54.10187,
    ("Norway", 2018): 43.69997, ("Finland", 2018): 27.58496, ("Sweden", 2018): 55.54554,
    ("Norway", 2019): 40.551,   ("Finland", 2019): 26.89661, ("Sweden", 2019): 53.12833,
}

for gdp_frame in (train_df, test_df):
    # Initialise the same column that is filled below. The earlier version
    # initialised a misspelled 'GDP_USD)' column, which left a constant-zero
    # extra feature in both frames.
    gdp_frame['GDP_USD'] = 0.0
    for (gdp_country, gdp_year), gdp_value in gdp_by_country_year.items():
        row_mask = (gdp_frame.country == gdp_country) & (gdp_frame.year == gdp_year)
        gdp_frame.loc[row_mask, 'GDP_USD'] = gdp_value



In [None]:
# How many test rows fall on a flagged holiday versus not.
test_df['is_holiday'].value_counts()

In [None]:
# How many training rows fall on a flagged holiday versus not.
train_df['is_holiday'].value_counts()

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
train_df = train_df.drop(columns=['date'])
test_df = test_df.drop(columns=['date'])

In [None]:
# Display the training frame after the 'date' column was dropped.
train_df

In [None]:
# Per-country totals of the numeric columns. numeric_only=True keeps the text
# columns (store, product) out of the sum regardless of the pandas default.
country = train_df.groupby('country').sum(numeric_only=True)
country.head()

In [None]:
def with_hue(data,feature,ax):
    """Write a percentage label on top of every bar of a hue-grouped bar plot.

    The bars in ``ax.patches`` are split into groups by taking every n-th patch,
    where n is the number of non-NaN unique values of ``data[feature]``; each
    label is the bar's share of its group's total height.

    Parameters
    ----------
    data : mapping/DataFrame indexable by ``feature`` whose values expose ``.unique()``.
    feature : column name used to count the categories.
    ax : axes-like object providing ``patches`` and ``text``.
    """
    # NaN is the only value that is not equal to itself, so this drops NaNs.
    categories = [value for value in data[feature].unique() if value == value]
    stride = len(categories)

    patches = ax.patches

    for offset in range(stride):
        # e.g. 8 X categories, 4 hues => patches 0, 8, 16, 24 form the first group
        group = patches[offset:][::stride]
        heights = [patch.get_height() for patch in group]
        # Group total is the denominator for the percentages.
        group_total = sum(heights)
        for patch, height in zip(group, heights):
            label = '{:.1f}%'.format(100 * height/group_total)
            x_center = patch.get_x()+patch.get_width()/2.0
            ax.text(x_center,
                    height,
                    label,
                    ha="center",va="bottom",fontweight='bold',fontsize=10)
    

    
def without_hue(data,feature,ax):
    """Annotate each bar of a plain bar plot with "<height> , <share>%".

    The share is the bar's height as a percentage of the summed height of all
    patches on ``ax``. When that sum is zero the share is reported as 0.0%
    instead of dividing by zero.

    Parameters
    ----------
    data, feature : accepted for signature symmetry with ``with_hue``; neither
        is read by this function.
    ax : axes-like object providing ``patches`` and ``text``.
    """
    bars_plot = ax.patches
    total = sum(bar.get_height() for bar in bars_plot)

    for bar in bars_plot:
        height = bar.get_height()
        # Guard the all-zero case (e.g. an empty aggregate).
        share = 100 * height/total if total else 0.0
        percentage = '{:.1f}%'.format(share)
        x = bar.get_x() + bar.get_width()/2.0
        # The label sits 2 data units above the bar top.
        ax.text(x, height+2, str(int(height))+" , "+percentage,
                ha='center',fontweight='bold',fontsize=12)


In [None]:
# Bar chart of total units sold per country, annotated with count and share.
sns.set_theme(style='white',context='notebook')
fig = plt.figure(figsize=(16,8))
ax = plt.axes()

background = "#F2EDD7FF"
fig.patch.set_facecolor(background)
ax.set_facecolor(background)

# Keep only the bottom spine and add dashed horizontal guide lines.
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.grid(linestyle="--",axis="y",color='gray')

a = sns.barplot(x=country.index, y=country['num_sold'], palette='rocket_r')

plt.xlabel("Country",fontweight='bold')
plt.ylabel("Number of products sold",fontweight='bold')
plt.xticks(fontweight='bold')
plt.yticks(fontweight='bold')

without_hue(country,'country',a)
plt.show()



In [None]:
# Correlation heatmap of the numeric/boolean features.
fig=plt.figure(figsize=(16,8))

ax=plt.axes()
ax.set_facecolor("#F2EDD7FF")
fig.patch.set_facecolor("#F2EDD7FF")

ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(linestyle="--",axis="y",color='gray')

# train_df still holds text columns (country, store, product) at this point.
# Select the numeric and boolean columns explicitly so corr() does not depend
# on the pandas version silently dropping (or refusing) non-numeric data.
numeric_features = train_df.select_dtypes(include=['number', 'bool'])
a=sns.heatmap(numeric_features.corr(),annot=True,linewidth=2)

plt.xticks(fontweight='bold')
plt.yticks(fontweight='bold')
plt.show()

[Slide to Top](#0)
<a id=2.1></a>
### Country Wise Analysis 🚩

#### Finland

In [None]:
# Rows of the training frame that belong to Finland.
is_finland = train_df['country'] == "Finland"
finland = train_df[is_finland]
finland.head()

In [None]:
# Finland totals per store; numeric_only=True keeps text columns out of the sum.
finland_store = finland.groupby('store').sum(numeric_only=True)
finland_store.head()

In [None]:
# Finland totals per product; numeric_only=True keeps text columns out of the sum.
finland_prod = finland.groupby('product').sum(numeric_only=True)
finland_prod.head()

In [None]:
# Finland: units sold per store (left) and per product (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,8))
f.patch.set_facecolor('#F2EDD7FF')

for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a = sns.barplot(x=finland_store.index, y=finland_store['num_sold'], ax=ax[0], palette='rocket_r')
b = sns.barplot(x=finland_prod.index, y=finland_prod['num_sold'], ax=ax[1], palette='rocket_r')

# Count / share labels on every bar.
without_hue(finland_store,'country',a)
without_hue(finland_prod,'country',b)

# Panel captions placed at fixed data coordinates.
ax[0].text(0,1800000 , "Kaggle store distribution in Finland",fontweight='bold')
ax[1].text(0,1500000 , "Kaggle product distribution in Finland",fontweight='bold')

plt.show()


In [None]:
# Finland totals per year; numeric_only=True keeps text columns out of the sum.
finland_year = finland.groupby('year').sum(numeric_only=True)
finland_year.head()

In [None]:
# Finland: yearly units sold as bars (left) and as a line (right).
f,ax =plt.subplots(nrows=1, ncols=2 ,figsize=(16,8))

f.patch.set_facecolor('#F2EDD7FF')
for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a=sns.barplot(x=finland_year.index,y=finland_year['num_sold'],ax=ax[0],palette='rocket_r')
# No palette here: lineplot only applies a palette together with a hue variable.
b=sns.lineplot(x=finland_year.index,y=finland_year['num_sold'],ax=ax[1])

# Label the bars; pass the frame that was actually plotted.
without_hue(finland_year,'year',a)

plt.show()


#### Norway

In [None]:
# Norway subset and its totals per store, product and year.
# numeric_only=True keeps text columns out of the sums.
norway= train_df[train_df['country']=="Norway"]
norway_store = norway.groupby('store').sum(numeric_only=True)
norway_prod = norway.groupby('product').sum(numeric_only=True)
norway_year = norway.groupby('year').sum(numeric_only=True)


In [None]:
# Norway: units sold per store (left) and per product (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,8))
f.patch.set_facecolor('#F2EDD7FF')

for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a = sns.barplot(x=norway_store.index, y=norway_store['num_sold'], ax=ax[0], palette='rocket_r')
b = sns.barplot(x=norway_prod.index, y=norway_prod['num_sold'], ax=ax[1], palette='rocket_r')

# Count / share labels on every bar.
without_hue(norway_store,'country',a)
without_hue(norway_prod,'country',b)

# Panel captions placed at fixed data coordinates.
ax[0].text(0,3000000 , "Kaggle store distribution in Norway",fontweight='bold')
ax[1].text(0,2600000 , "Kaggle product distribution in Norway",fontweight='bold')

plt.show()


In [None]:
# Norway: yearly units sold as bars (left) and as a line (right).
f,ax =plt.subplots(nrows=1, ncols=2 ,figsize=(16,8))

f.patch.set_facecolor('#F2EDD7FF')
for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a=sns.barplot(x=norway_year.index,y=norway_year['num_sold'],ax=ax[0],palette='rocket_r')
# No palette here: lineplot only applies a palette together with a hue variable.
b=sns.lineplot(x=norway_year.index,y=norway_year['num_sold'],ax=ax[1])

# Label the bars; pass the frame that was actually plotted.
without_hue(norway_year,'year',a)

plt.show()


#### Sweden 

In [None]:
# Sweden subset and its totals per store, product and year.
# numeric_only=True keeps text columns out of the sums.
sweden= train_df[train_df['country']=="Sweden"]
sweden_store = sweden.groupby('store').sum(numeric_only=True)
sweden_prod = sweden.groupby('product').sum(numeric_only=True)
sweden_year = sweden.groupby('year').sum(numeric_only=True)


In [None]:
# Sweden: units sold per store (left) and per product (right).
f,ax =plt.subplots(nrows=1, ncols=2 ,figsize=(16,8))

f.patch.set_facecolor('#F2EDD7FF')
for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a=sns.barplot(x=sweden_store.index,y=sweden_store['num_sold'],ax=ax[0],palette='rocket_r')
b=sns.barplot(x=sweden_prod.index,y=sweden_prod['num_sold'],ax=ax[1],palette='rocket_r')

# Panel captions; these panels show the Sweden aggregates, so the captions
# name Sweden (they were previously copy-pasted as "Norway").
ax[0].text(0,2200000 , "Kaggle store distribution in Sweden",fontweight='bold')
ax[1].text(0,1850000 , "Kaggle product distribution in Sweden",fontweight='bold')

without_hue(sweden_store,'country',a)
without_hue(sweden_prod,'country',b)

plt.show()


In [None]:
# Sweden: yearly units sold as bars (left) and as a line (right).
f,ax =plt.subplots(nrows=1, ncols=2 ,figsize=(16,8))

f.patch.set_facecolor('#F2EDD7FF')
for axis in ax:
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

a=sns.barplot(x=sweden_year.index,y=sweden_year['num_sold'],ax=ax[0],palette='rocket_r')
# No palette here: lineplot only applies a palette together with a hue variable.
b=sns.lineplot(x=sweden_year.index,y=sweden_year['num_sold'],ax=ax[1])

# Label the bars; pass the frame that was actually plotted.
without_hue(sweden_year,'year',a)

plt.show()


[Slide to Top](#0)
<a id=2.2></a>
### Year by Year Analysis 🕡 

In [None]:
# Totals per year across all countries; numeric_only=True keeps text columns out of the sum.
Year = train_df.groupby('year').sum(numeric_only=True)
Year.head()

In [None]:
# Pie chart: each year's share of the total units sold.
fig = plt.figure(figsize=(10,5))
ax = plt.axes()

background = "#F2EDD7FF"
fig.patch.set_facecolor(background)
ax.set_facecolor(background)

colors=['yellowgreen', 'gold', 'lightskyblue', 'lightcoral','lightpink','teal','green']
plt.pie(
    Year['num_sold'],
    labels=Year.index,
    colors=colors,
    autopct='%1.2f%%',
    shadow=True,
)
plt.title("Distribution of sales from 2015 to 2018",fontweight='bold',fontsize=16)
plt.axis('equal')  # keep the pie circular
plt.tight_layout()
plt.show()


In [None]:
# One training-frame slice per calendar year.
year2015, year2016, year2017, year2018 = (
    train_df.loc[train_df.year == calendar_year]
    for calendar_year in (2015, 2016, 2017, 2018)
)


In [None]:
# Per-year totals by store and by product.
# numeric_only=True keeps the text columns out of the sums.
year2015_store= year2015.groupby('store').sum(numeric_only=True)
year2015_prod= year2015.groupby('product').sum(numeric_only=True)

year2016_store= year2016.groupby('store').sum(numeric_only=True)
year2016_prod= year2016.groupby('product').sum(numeric_only=True)

year2017_store= year2017.groupby('store').sum(numeric_only=True)
year2017_prod= year2017.groupby('product').sum(numeric_only=True)

year2018_store= year2018.groupby('store').sum(numeric_only=True)
year2018_prod= year2018.groupby('product').sum(numeric_only=True)


In [None]:
# 4x2 grid: one row per year, store totals on the left, product totals on the right.
f,ax =plt.subplots(nrows=4, ncols=2 ,figsize=(16,20))
f.tight_layout()
f.patch.set_facecolor('#F2EDD7FF')
for axis in ax.ravel():
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

# (frame, grid row, grid column, caption x, caption y, caption)
# Caption coordinates are in data units of the respective panel.
yearly_panels = [
    (year2015_store, 0, 0,  0,   1700000, "Kaggle store distribution in 2015"),
    (year2015_prod,  0, 1,  0.3, 1400000, "Kaggle product distribution in 2015"),
    (year2016_store, 1, 0, -0.4, 1400000, "Kaggle store distribution in 2016"),
    (year2016_prod,  1, 1,  0.5, 1200000, "Kaggle product distribution in 2016"),
    (year2017_store, 2, 0, -0.4, 1400000, "Kaggle store distribution in 2017"),
    (year2017_prod,  2, 1,  0.5, 1200000, "Kaggle product distribution in 2017"),
    (year2018_store, 3, 0, -0.4, 1500000, "Kaggle store distribution in 2018"),
    (year2018_prod,  3, 1,  0.5, 1200000, "Kaggle product distribution in 2018"),
]

# Drawing in a loop also avoids rebinding `f` (the figure) to an Axes, which
# the former one-variable-per-panel version did.
for panel_frame, grid_row, grid_col, caption_x, caption_y, caption in yearly_panels:
    panel_ax = ax[grid_row][grid_col]
    sns.barplot(x=panel_frame.index, y=panel_frame['num_sold'], ax=panel_ax, palette='rocket_r')
    without_hue(panel_frame, 'country', panel_ax)
    panel_ax.text(caption_x, caption_y, caption, fontweight='bold')

plt.show()


### YEAR 2015

In [None]:
# 2015 totals by month, weekday, quarter, week_of_year and day_of_year,
# each split by store and by product.
# numeric_only=True keeps the remaining text columns out of the sums.

#MONTH
year2015_month_store = year2015.groupby(['month','store']).sum(numeric_only=True).reset_index()
year2015_month_prod = year2015.groupby(['month','product']).sum(numeric_only=True).reset_index()

#WEEKDAY
year2015_wd_store = year2015.groupby(['weekday','store']).sum(numeric_only=True).reset_index()
year2015_wd_prod = year2015.groupby(['weekday','product']).sum(numeric_only=True).reset_index()

#QUARTER
year2015_quar_store = year2015.groupby(['quarter','store']).sum(numeric_only=True).reset_index()
year2015_quar_prod = year2015.groupby(['quarter','product']).sum(numeric_only=True).reset_index()

#WEEK OF YEAR
year2015_woy_store = year2015.groupby(['week_of_year','store']).sum(numeric_only=True).reset_index()
year2015_woy_prod = year2015.groupby(['week_of_year','product']).sum(numeric_only=True).reset_index()

#DAY OF YEAR
year2015_doy_store = year2015.groupby(['day_of_year','store']).sum(numeric_only=True).reset_index()
year2015_doy_prod = year2015.groupby(['day_of_year','product']).sum(numeric_only=True).reset_index()



In [None]:
import warnings
warnings.filterwarnings("ignore")

# 5x2 grid for 2015: rows are month, weekday, quarter, week of year, day of year;
# left column is split by store, right column by product.
f,ax =plt.subplots(nrows=5, ncols=2 ,figsize=(16,20))
f.tight_layout()
f.patch.set_facecolor('#F2EDD7FF')
for axis in ax.ravel():
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

# (frame, x column, hue column), listed in the row-major order of the grid.
panels_2015 = [
    (year2015_month_store, 'month', 'store'),
    (year2015_month_prod, 'month', 'product'),
    (year2015_wd_store, 'weekday', 'store'),
    (year2015_wd_prod, 'weekday', 'product'),
    (year2015_quar_store, 'quarter', 'store'),
    (year2015_quar_prod, 'quarter', 'product'),
    (year2015_woy_store, 'week_of_year', 'store'),
    (year2015_woy_prod, 'week_of_year', 'product'),
    (year2015_doy_store, 'day_of_year', 'store'),
    (year2015_doy_prod, 'day_of_year', 'product'),
]

# x and y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
for panel_ax, (panel_frame, x_col, hue_col) in zip(ax.ravel(), panels_2015):
    sns.lineplot(x=panel_frame[x_col], y=panel_frame['num_sold'],
                 hue=panel_frame[hue_col], ax=panel_ax)

plt.show()

### YEAR 2016

In [None]:
# 2016 totals by month, weekday, quarter, week_of_year and day_of_year,
# each split by store and by product.
# numeric_only=True keeps the remaining text columns out of the sums.

#MONTH
year2016_month_store = year2016.groupby(['month','store']).sum(numeric_only=True).reset_index()
year2016_month_prod = year2016.groupby(['month','product']).sum(numeric_only=True).reset_index()

#WEEKDAY
year2016_wd_store = year2016.groupby(['weekday','store']).sum(numeric_only=True).reset_index()
year2016_wd_prod = year2016.groupby(['weekday','product']).sum(numeric_only=True).reset_index()

#QUARTER
year2016_quar_store = year2016.groupby(['quarter','store']).sum(numeric_only=True).reset_index()
year2016_quar_prod = year2016.groupby(['quarter','product']).sum(numeric_only=True).reset_index()

#WEEK OF YEAR
year2016_woy_store = year2016.groupby(['week_of_year','store']).sum(numeric_only=True).reset_index()
year2016_woy_prod = year2016.groupby(['week_of_year','product']).sum(numeric_only=True).reset_index()

#DAY OF YEAR
year2016_doy_store = year2016.groupby(['day_of_year','store']).sum(numeric_only=True).reset_index()
year2016_doy_prod = year2016.groupby(['day_of_year','product']).sum(numeric_only=True).reset_index()


In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
import warnings
warnings.filterwarnings("ignore")

# 5x2 grid for 2016: rows are month, weekday, quarter, week of year, day of year;
# left column is split by store, right column by product.
f,ax =plt.subplots(nrows=5, ncols=2 ,figsize=(16,20))
f.tight_layout()
f.patch.set_facecolor('#F2EDD7FF')
for axis in ax.ravel():
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

# (frame, x column, hue column), listed in the row-major order of the grid.
panels_2016 = [
    (year2016_month_store, 'month', 'store'),
    (year2016_month_prod, 'month', 'product'),
    (year2016_wd_store, 'weekday', 'store'),
    (year2016_wd_prod, 'weekday', 'product'),
    (year2016_quar_store, 'quarter', 'store'),
    (year2016_quar_prod, 'quarter', 'product'),
    (year2016_woy_store, 'week_of_year', 'store'),
    (year2016_woy_prod, 'week_of_year', 'product'),
    (year2016_doy_store, 'day_of_year', 'store'),
    (year2016_doy_prod, 'day_of_year', 'product'),
]

# x and y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
for panel_ax, (panel_frame, x_col, hue_col) in zip(ax.ravel(), panels_2016):
    sns.lineplot(x=panel_frame[x_col], y=panel_frame['num_sold'],
                 hue=panel_frame[hue_col], ax=panel_ax)

plt.show()

### YEAR 2017

In [None]:
# 2017 totals by month, weekday, quarter, week_of_year and day_of_year,
# each split by store and by product.
# numeric_only=True keeps the remaining text columns out of the sums.

#MONTH
year2017_month_store = year2017.groupby(['month','store']).sum(numeric_only=True).reset_index()
year2017_month_prod = year2017.groupby(['month','product']).sum(numeric_only=True).reset_index()

#WEEKDAY
year2017_wd_store = year2017.groupby(['weekday','store']).sum(numeric_only=True).reset_index()
year2017_wd_prod = year2017.groupby(['weekday','product']).sum(numeric_only=True).reset_index()

#QUARTER
year2017_quar_store = year2017.groupby(['quarter','store']).sum(numeric_only=True).reset_index()
year2017_quar_prod = year2017.groupby(['quarter','product']).sum(numeric_only=True).reset_index()

#WEEK OF YEAR
year2017_woy_store = year2017.groupby(['week_of_year','store']).sum(numeric_only=True).reset_index()
year2017_woy_prod = year2017.groupby(['week_of_year','product']).sum(numeric_only=True).reset_index()

#DAY OF YEAR
year2017_doy_store = year2017.groupby(['day_of_year','store']).sum(numeric_only=True).reset_index()
year2017_doy_prod = year2017.groupby(['day_of_year','product']).sum(numeric_only=True).reset_index()


In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
import warnings
warnings.filterwarnings("ignore")

# 5x2 grid for 2017: rows are month, weekday, quarter, week of year, day of year;
# left column is split by store, right column by product.
f,ax =plt.subplots(nrows=5, ncols=2 ,figsize=(16,20))
f.tight_layout()
f.patch.set_facecolor('#F2EDD7FF')
for axis in ax.ravel():
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

# (frame, x column, hue column), listed in the row-major order of the grid.
panels_2017 = [
    (year2017_month_store, 'month', 'store'),
    (year2017_month_prod, 'month', 'product'),
    (year2017_wd_store, 'weekday', 'store'),
    (year2017_wd_prod, 'weekday', 'product'),
    (year2017_quar_store, 'quarter', 'store'),
    (year2017_quar_prod, 'quarter', 'product'),
    (year2017_woy_store, 'week_of_year', 'store'),
    (year2017_woy_prod, 'week_of_year', 'product'),
    (year2017_doy_store, 'day_of_year', 'store'),
    (year2017_doy_prod, 'day_of_year', 'product'),
]

# x and y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
for panel_ax, (panel_frame, x_col, hue_col) in zip(ax.ravel(), panels_2017):
    sns.lineplot(x=panel_frame[x_col], y=panel_frame['num_sold'],
                 hue=panel_frame[hue_col], ax=panel_ax)

plt.show()

### YEAR 2018

In [None]:
# 2018 totals by month, weekday, quarter, week_of_year and day_of_year,
# each split by store and by product.
# numeric_only=True keeps the remaining text columns out of the sums.

#MONTH
year2018_month_store = year2018.groupby(['month','store']).sum(numeric_only=True).reset_index()
year2018_month_prod = year2018.groupby(['month','product']).sum(numeric_only=True).reset_index()

#WEEKDAY
year2018_wd_store = year2018.groupby(['weekday','store']).sum(numeric_only=True).reset_index()
year2018_wd_prod = year2018.groupby(['weekday','product']).sum(numeric_only=True).reset_index()

#QUARTER
year2018_quar_store = year2018.groupby(['quarter','store']).sum(numeric_only=True).reset_index()
year2018_quar_prod = year2018.groupby(['quarter','product']).sum(numeric_only=True).reset_index()

#WEEK OF YEAR
year2018_woy_store = year2018.groupby(['week_of_year','store']).sum(numeric_only=True).reset_index()
year2018_woy_prod = year2018.groupby(['week_of_year','product']).sum(numeric_only=True).reset_index()

#DAY OF YEAR
year2018_doy_store = year2018.groupby(['day_of_year','store']).sum(numeric_only=True).reset_index()
year2018_doy_prod = year2018.groupby(['day_of_year','product']).sum(numeric_only=True).reset_index()


In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
import warnings
warnings.filterwarnings("ignore")

# 5x2 grid for 2018: rows are month, weekday, quarter, week of year, day of year;
# left column is split by store, right column by product.
f,ax =plt.subplots(nrows=5, ncols=2 ,figsize=(16,20))
f.tight_layout()
f.patch.set_facecolor('#F2EDD7FF')
for axis in ax.ravel():
    axis.set_facecolor('#F2EDD7FF')
    for side in ('top', 'left', 'right'):
        axis.spines[side].set_visible(False)
    axis.grid(linestyle="--",axis="y",color='gray')

# (frame, x column, hue column), listed in the row-major order of the grid.
panels_2018 = [
    (year2018_month_store, 'month', 'store'),
    (year2018_month_prod, 'month', 'product'),
    (year2018_wd_store, 'weekday', 'store'),
    (year2018_wd_prod, 'weekday', 'product'),
    (year2018_quar_store, 'quarter', 'store'),
    (year2018_quar_prod, 'quarter', 'product'),
    (year2018_woy_store, 'week_of_year', 'store'),
    (year2018_woy_prod, 'week_of_year', 'product'),
    (year2018_doy_store, 'day_of_year', 'store'),
    (year2018_doy_prod, 'day_of_year', 'product'),
]

# x and y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
for panel_ax, (panel_frame, x_col, hue_col) in zip(ax.ravel(), panels_2018):
    sns.lineplot(x=panel_frame[x_col], y=panel_frame['num_sold'],
                 hue=panel_frame[hue_col], ax=panel_ax)

plt.show()

[Slide to Top](#0)
<a id=4></a>
### Modelling and Preprocessing✌

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import LabelEncoder


In [None]:
# Replace each categorical / boolean column of the training frame by integer codes.
# One encoder object is re-fitted per column, so afterwards `le` only remembers
# the classes of the last column.
le = LabelEncoder()
col_encode = ['country','store','product','is_weekend','is_holiday']
for column_name in col_encode:
    encoded_values = le.fit_transform(train_df[column_name])
    train_df[column_name] = encoded_values

In [None]:
# Inspect the training frame after label encoding.
train_df

In [None]:
# 'year' is removed from the model features; 'week_of_year' is kept.
train_df = train_df.drop(columns=['year'])
#train_df=train_df.drop(['week_of_year'],axis=1)

In [None]:
#train_df=train_df.drop(['day_of_year'],axis=1)

In [None]:
# Feature matrix (everything except the target) and target vector.
x = train_df.drop(columns=['num_sold'])
y = train_df['num_sold']

In [None]:
# Cast week_of_year to a plain integer dtype, then hold out 20% of the rows
# with a fixed seed.
x['week_of_year'] = x['week_of_year'].astype(int)
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report the shape of each of the four split parts.
for shape_message, split_part in (
    ("Shape of x_train is :", x_train),
    ("Shape of x_test is :", x_test),
    ("Shape of y_train is :", y_train),
    ("Shape of y_test is :", y_test),
):
    print(shape_message, split_part.shape)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Each element contributes 200 * |y_true - y_pred| / (|y_true| + |y_pred|);
    an element where both values are zero contributes 0. The result is the
    mean over all elements.

    Parameters
    ----------
    y_true, y_pred : array-likes (lists, NumPy arrays, pandas Series) of equal
        length, compared position by position.

    Returns
    -------
    float
        Mean SMAPE, between 0 and 200.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute values on both terms keep the denominator non-negative even for
    # negative inputs.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the initial 0.0, which is the convention for a 0/0 term.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Baseline comparison: fit every regressor with its default settings on the
# training split and report SMAPE on the held-out split.
dic_model = {"RandomForest":RandomForestRegressor(),
             'GradientBoosting':GradientBoostingRegressor(),
            'XGradientBoosting':xgb.XGBRegressor(),
            'CatBoostRegressor':cb.CatBoostRegressor(),
            'LightGBM': lgb.LGBMRegressor()}

for model_name, estimator in dic_model.items():
    # Fit
    print("Training with ",model_name+" Algorithm....")
    print()
    model = estimator.fit(x_train,y_train)

    # Predict on the held-out split
    print("Predicting with ",model_name+" Model....")
    print()
    prediction = model.predict(x_test)

    # Score the predictions with SMAPE
    print("SMAPE of ",model_name+" Model is ",SMAPE(y_test,prediction))
    print("------------------------------------------------------------------")
    print()

### XGBOOST WITH OPTUNA 🚀

In [None]:
import optuna
def objective(trial,data=x,target=y):
    """Optuna objective: validation SMAPE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : feature matrix; defaults to the notebook-level ``x`` as it was bound
        when this function was defined.
    target : target vector; defaults to the notebook-level ``y`` as it was bound
        when this function was defined.

    Returns
    -------
    SMAPE of the fitted model on a fixed 15% hold-out of ``data`` / ``target``.
    """
    # Split the arguments, not the globals, so callers can pass their own data.
    x_train, x_test, y_train , y_test = train_test_split(data, target, test_size=0.15,random_state=42)
    #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
    #'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),

    param = {
        # suggest_float(..., log=True) is the current spelling of suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 2,20),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    model = xgb.XGBRegressor(**param)

    # NOTE(review): the hold-out set is used both for early stopping and for the
    # returned score, so the score is optimistic. Newer xgboost releases expect
    # early_stopping_rounds on the estimator constructor -- confirm against the
    # installed version.
    model.fit(x_train,y_train,eval_set=[(x_test,y_test)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(x_test)

    return SMAPE(y_test,preds)

In [None]:
# Search for the hyper-parameters that minimise the value returned by
# `objective`, using 50 trials.
study_xgb= optuna.create_study(direction='minimize')
study_xgb.optimize(objective, n_trials=50)

In [None]:
# Best trial of the XGBoost study: print its objective value, then its parameters.
trial_xgb = study_xgb.best_trial
for best_summary in (trial_xgb.value, trial_xgb.params):
    print(best_summary)

In [None]:
# Refit XGBoost with the best trial's parameters on the full feature matrix
# and target (x, y).
model_xgb = xgb.XGBRegressor(**trial_xgb.params)
model_xgb.fit(x,y)

In [None]:
# Feature importances of the refitted XGBoost model, printed and drawn as
# horizontal bars (one bar per feature column).
feat_importance = model_xgb.feature_importances_
features = x_train.columns
print(feat_importance)

sns.barplot(x=feat_importance, y=features)

## LGBM 🐱‍🏍

In [None]:
def objective_lgbm(trial):
    """Optuna objective: validation SMAPE of a LightGBM regressor.

    Samples the hyper-parameters from ``trial``, fits on a fixed 85% split of the
    notebook-level ``x`` / ``y`` with early stopping on the remaining 15%, and
    returns the SMAPE on that same 15%.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.15, random_state=42
    )

    # Dict entries are evaluated top to bottom, so the suggest_* calls happen in
    # the order listed here.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': int(trial.suggest_int('max_depth', 2, 50)),
        'learning_rate': trial.suggest_loguniform('learning_rate',0.001,1),
        'colsample_bytree': trial.suggest_loguniform("colsample_bytree",0.1, 1),
        'num_leaves': trial.suggest_int('num_leaves',10,300),
        'reg_alpha': trial.suggest_loguniform('reg_alpha',0.1,1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda',0.1,1),
        'min_split_gain': trial.suggest_loguniform('min_split_gain',0.1,1),
        'subsample': trial.suggest_loguniform('subsample',0.1,1),
    }
    clf = lgb.LGBMRegressor(**params)

    clf.fit(x_train,y_train,eval_set=[(x_test,y_test)],early_stopping_rounds=100,verbose=False)

    # Score on the held-out split.
    return SMAPE(y_test, clf.predict(x_test))

In [None]:
# Search for the hyper-parameters that minimise the value returned by
# `objective_lgbm`, using 50 trials.
study_lgbm= optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=50)

In [None]:
# Best trial of the LightGBM study: print its objective value, then its parameters.
trial_lgbm = study_lgbm.best_trial
for best_summary in (trial_lgbm.value, trial_lgbm.params):
    print(best_summary)

In [None]:
# Refit LightGBM with the best trial's parameters on the full feature matrix
# and target (x, y).
model_lgbm = lgb.LGBMRegressor(**trial_lgbm.params)
model_lgbm.fit(x,y)

In [None]:
# Feature importances of the refitted LightGBM model, printed and drawn as
# horizontal bars (one bar per feature column).
feat_importance = model_lgbm.feature_importances_
features = x_train.columns
print(feat_importance)

sns.barplot(x=feat_importance, y=features)

## TEST DATASET

In [None]:
#Preprocessing test dataset
le = LabelEncoder()
col_encode = ['country','store','product','is_weekend','is_holiday']
for i in col_encode:
    test_df[i] = le.fit_transform(test_df[i])

In [None]:
# Same column treatment as the training features: integer week_of_year, no 'year'.
test_df['week_of_year'] = test_df['week_of_year'].astype(int)
#test_df=test_df.drop(['week_of_year'],axis=1)
test_df = test_df.drop(columns=['year'])

In [None]:
#test_df=test_df.drop(['day_of_year'],axis=1)

In [None]:
# Preview the preprocessed test features that are fed to the model.
test_df.head()

In [None]:
# Preview the raw test frame as loaded from CSV (it still carries row_id).
test.head()

In [None]:
# Predict the test set with the tuned XGBoost model.
pred_f=model_xgb.predict(test_df)
# Alternative: predictions from the tuned LightGBM model.
#pred_lgbm = model_lgbm.predict(test_df)


In [None]:
# Submission frame: the original row_id next to the predicted num_sold.
submission_columns = {"row_id":test['row_id'],'num_sold':pred_f}
dataframe = pd.DataFrame(submission_columns)
#dataframe=pd.DataFrame({"row_id":test['row_id'],'num_sold':pred_lgbm})
dataframe

In [None]:
# Round every prediction up to the next whole number.
rounded_up = np.ceil(dataframe['num_sold'])
dataframe['num_sold'] = rounded_up
dataframe

In [None]:
# Write the submission CSV without the index column.
# Earlier output name, kept for reference:
#dataframe.to_csv("Output_xgb_addedfeatures.csv",index=False)
dataframe.to_csv("Output_xgb_addedfeatures+Holiday+GDP+ceil.csv",index=False)

* **Thanks for scrolling all the way down; please upvote this notebook if you found it useful 😴**

* **If you have any advice, techniques, or feedback that could improve my performance, please leave a comment below ✌**
