In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Apply seaborn's theme; color_codes=True remaps matplotlib's shorthand colour codes to the seaborn palette.
sns.set(color_codes=True)
# Ten-colour viridis palette stored in `pal`; it is not referenced in the cells shown here.
pal = sns.color_palette("viridis", 10)
# Use the "muted" palette as the default colour cycle for the plots below.
sns.set_palette('muted')

In [None]:
# Load the train and test CSVs from the tabular-playground-series-jan-2022 input directory.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# Preview the first 20 rows of the training data.
train.head(20)

In [None]:
# Summarise column dtypes and non-null counts.
train.info()

In [None]:
# Count missing values in each column.
train.isnull().sum()

* No null values.
* Only **2 numerical columns**.
* row_id is just a unique id referring to each row.
* date column spans from **1st jan 2015** to **31st Dec 2018**. 
* There are **2 stores** namely ***KaggleMart*** and ***KaggleRama***.
* There are **3 products** in order every 3 rows i.e. ***Mug, Hat, Sticker***.
* There are 3 countries namely ***Norway, Sweden, Finland***
* ***num_sold*** is our dependent variable

In [None]:
# Number of training rows for each country.
train['country'].value_counts()

In [None]:
# Number of training rows for each store.
train['store'].value_counts()

In [None]:
# Number of training rows for each product.
train['product'].value_counts()

### The data is equally distributed for every country, product and store per date.

## Date preprocessing

In [None]:
# Before getting into EDA and visualizations, handle the date column:
# parse it once, then derive year/month/day with the vectorised `.dt`
# accessor instead of calling a Python lambda on every row.
# The same steps are applied to both frames so train and test stay in sync.
# NOTE(review): the integer width of the derived columns may differ from the
# per-row lambda version depending on the pandas version; the values are the same.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'], format="%Y-%m-%d")
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day

In [None]:
# Display the frame to confirm the new year/month/day columns were added.
train

## Exploratory Data Analysis

## 1. Which country buys the most?

In [None]:
# Total units sold per country: print the numbers, then draw them as bars.
# `series` is reused by the pie chart in the next cell.
series = train.groupby(['country'])['num_sold'].sum()
print(series.to_string())
ax = sns.barplot(x=series.index, y=series.values)
ax.set_title('Sales per country')
ax.set_xlabel('country')
ax.set_ylabel('No. of Sales')


In [None]:
# Share of total sales per country, using `series` computed in the previous cell.
fig, ax = plt.subplots()
ax.pie(series.values, labels=series.index, autopct='%0.1f%%')
ax.set_title('Sales share per country')


### Norway clearly has the best sales and Finland has the worst.

## 2. Let's Check the sales distribution

In [None]:
# Density of daily sales, one curve per country.
# The title previously read 'Desity plot'; spelled correctly to match the
# store-level density plot further down.
sns.kdeplot(x=train['num_sold'], hue=train['country'])
plt.title('Density plot')
plt.show()

In [None]:
# Spread of num_sold within each country.
plt.figure(dpi=100)
ax = sns.boxplot(data=train, x='country', y='num_sold')
ax.set_title('Boxplot of sales per country')
plt.show()

* Sales distribution is slightly **right skewed** in all 3 countries
* As with the total number of sales, Norway has the highest median here too.

## 3. Which Store has the most sales?

### 3.1 Exploring overall Store Sales

In [None]:
# Share of total units sold by each store.
series = train.groupby('store')['num_sold'].sum()
fig, ax = plt.subplots()
ax.pie(series.values, labels=series.index, autopct='%0.2f%%')
ax.set_title('Store Sales')
plt.show()

In [None]:
# Density of daily sales, one curve per store.
# The values are passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, and a bare vector there is read as
# wide-form data, which cannot be combined with `hue`.
sns.kdeplot(x=train['num_sold'], hue=train['store'])
plt.title('Density plot of Store sales')
plt.show()

In [None]:
# Spread of num_sold within each store.
plt.figure(dpi=100)
ax = sns.boxplot(data=train, x='store', y='num_sold')
ax.set_title('Boxplot of sales per stores')
plt.show()

### 3.2 Which is the Best Selling Store in each country?

In [None]:
# Total units sold for every (country, store) pair.
train.groupby(['country','store']).num_sold.sum()

In [None]:
# One pie per country (laid out on a 2x2 grid) showing each store's share of sales.
fig = plt.figure(figsize=(12, 10))
countries = ['Finland', 'Norway', 'Sweden']
for position, country in enumerate(countries, start=1):
    in_country = train[train['country'] == country]
    series = in_country.groupby('store')['num_sold'].sum()
    ax = fig.add_subplot(2, 2, position)
    ax.pie(series.values, labels=series.index, autopct='%0.2f%%')
    ax.title.set_text(f'Store Sales Share in {country}')

### Almost similar trends in all countries as Overall trend

## 4. Exploring Product Sales

### 4.1 Overall Product sales

In [None]:
# Share of total units sold by each product.
series = train.groupby('product')['num_sold'].sum()
fig, ax = plt.subplots()
ax.pie(series.values, labels=series.index, autopct='%0.2f%%')
ax.set_title('Sales Share of various Products')
plt.show()

In [None]:
# Spread of num_sold within each product.
plt.figure(dpi=100)
ax = sns.boxplot(data=train, x='product', y='num_sold')
ax.set_title('Boxplot of sales per Product')
plt.show()

* Kaggle Hat is the most popular product and Kaggle Sticker is the least popular.

### 4.2 Product Sales per Country

In [None]:
# One pie per country (laid out on a 2x2 grid) showing each product's share of sales.
fig = plt.figure(figsize=(12, 10))
countries = ['Finland', 'Norway', 'Sweden']
for position, country in enumerate(countries, start=1):
    in_country = train[train['country'] == country]
    series = in_country.groupby('product')['num_sold'].sum()
    ax = fig.add_subplot(2, 2, position)
    ax.pie(series.values, labels=series.index, autopct='%0.2f%%')
    ax.title.set_text(f'Product Sales Share in {country}')

### 4.3 Product Sales per Store

In [None]:
# One pie per store showing each product's share of that store's sales.
# The two pies occupy the top row of a 2x2 grid.
fig = plt.figure(figsize=(12, 10))

for position, store in enumerate(['KaggleRama', 'KaggleMart'], start=1):
    series = train[train['store'] == store].groupby('product').num_sold.sum()
    ax = fig.add_subplot(2, 2, position)
    ax.pie(series.values, labels=series.index, autopct='%0.2f%%')
    ax.title.set_text(f'Product Sales Share in {store}')


* ***Hat*** is the **most dominant** product irrespective of store or region, accounting for more than half of sales.
* ***Countrywise*** Product Sales Ratio is **similar** to Global Sales ratio.
* ***Storewise*** Product Sales Ratio is **similar** to Global Sales ratio.

## 5. What is the seasonality trend?

### 5.1 Yearly Sales Trend

In [None]:
# Daily sales for each year, one panel per year.
# `train` holds several rows per date (one per country/store/product), so
# plotting the raw rows as a single line zig-zags between unrelated series.
# Aggregate to one total per date first so each panel shows a single curve.
fig = plt.figure(figsize=(20, 13))
daily_sales = train.groupby(['year', 'date'])['num_sold'].sum().reset_index()
for position, year in enumerate(range(2015, 2019), start=1):
    in_year = daily_sales[daily_sales['year'] == year]
    ax = fig.add_subplot(2, 2, position)
    ax.plot(in_year['date'], in_year['num_sold'])
    ax.title.set_text(f'Sales Trend in {year}')
    ax.set_ylabel('Sales')
    ax.set_xlabel('Date')

* Can't really interpret clearly, but we can see sales shoot up in mid **April-May** and in Holidays season of **December**.
* We can see some local peaks that can be **weekend sales**

### 5.2 Let's explore Average Sales Per Month trend

In [None]:
# Mean num_sold per (country, year, month); one panel per year, one line per country.
df = train.groupby(['country', 'year', 'month']).num_sold.mean().reset_index()
fig = plt.figure(figsize=(20, 13))
for position, year in enumerate(range(2015, 2019), start=1):
    ax = fig.add_subplot(2, 2, position)
    # Plot order is kept fixed so line colours and legend order stay stable.
    for country in ('Norway', 'Sweden', 'Finland'):
        subset = df[(df['year'] == year) & (df['country'] == country)]
        ax.plot(subset['month'], subset['num_sold'], label=country)
    ax.title.set_text(f'Avg Monthly Sales Trend in {year}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

* Sales follow **same trends** every year in all countries.
* The Sweden and Finland trends are more similar to each other than to that of Norway.
* Global peaks come at the end of year **December**, new year and in **April**

### 5.3 Seasonal trend in Products


In [None]:
# Mean num_sold per (product, year, month); one panel per year, one line per product.
df = train.groupby(['product', 'year', 'month']).num_sold.mean().reset_index()
fig = plt.figure(figsize=(20, 13))
for position, year in enumerate(range(2015, 2019), start=1):
    ax = fig.add_subplot(2, 2, position)
    for product in ('Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker'):
        subset = df[(df['year'] == year) & (df['product'] == product)]
        ax.plot(subset['month'], subset['num_sold'], label=product)
    ax.title.set_text(f'Avg Monthly Sales by Product {year}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

In [None]:
# Mean num_sold per (product, country, month) pooled over all years;
# one panel per country, one line per product.
df = train.groupby(['product', 'country', 'month']).num_sold.mean().reset_index()
fig = plt.figure(figsize=(20, 13))
countries = ['Finland', 'Norway', 'Sweden']
for position, country in enumerate(countries, start=1):
    ax = fig.add_subplot(2, 2, position)
    for product in ('Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker'):
        subset = df[(df['country'] == country) & (df['product'] == product)]
        ax.plot(subset['month'], subset['num_sold'], label=product)
    ax.title.set_text(f'Avg Monthly Sales by Product in {country}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Month')
    ax.legend()

* All years have similar trends in products. Just varying 1 month forward or backward sometime.
* **Hats** peak in **April** and **December** and have minimum sales in **September-October**.
* **Mugs** Peak in **December** and the sales dips in **July-August**.
* **Stickers** follow almost **same sales through out** the Year irrespective of country, store etc.

## 6. Monthly Trends in Sales?

### 6.1 Over all trend of every month

In [None]:
# Mean num_sold per calendar day; one panel per month, one line per year.
df = train.groupby(['year', 'month', 'day']).num_sold.mean().reset_index()
fig = plt.figure(figsize=(25, 40))
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
for month, month_name in enumerate(months, start=1):
    ax = fig.add_subplot(6, 2, month)
    for y in range(2015, 2019):
        subset = df[(df['year'] == y) & (df['month'] == month)]
        ax.plot(subset['day'], subset['num_sold'], label=str(y))
    ax.title.set_text(f'Sales trends in {month_name}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Days of Month')
    ax.legend()

* There is a similar trend but with different peaks, which suggests that the weekdays are affecting the trends.
* There is a great chance that during **weekends** the sales go up and vice versa

## 7. Adding a new Feature (Day of the week)

In [None]:
# Add a `weekday` column of integers, where 0 means Monday and 6 means Sunday.
# The vectorised `.dt.weekday` accessor replaces a per-row lambda call.
train['weekday'] = train['date'].dt.weekday
test['weekday'] = test['date'].dt.weekday

### 7.1 Exploring weekly trends

In [None]:
# Average sales per weekday, shown as a bar chart (left) and as shares (right).
fig = plt.figure(figsize=(16, 13))

# Mean num_sold per weekday (0 = Monday ... 6 = Sunday); computed once and
# shared by both panels.
series = train.groupby('weekday').num_sold.mean()

ax1 = fig.add_subplot(121)
ax1.bar(x=series.index, height=series.values)
ax1.title.set_text('Sales trends in Week Overall')
ax1.set_ylabel('Average Sales')
ax1.set_xlabel('Days of Week')

# Variable name kept as-is (misspelling included) in case other cells reference it.
days_of_weak = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
ax2 = fig.add_subplot(122)
# Translate the integer weekday index into day names for the wedge labels.
ax2.pie(series.values, labels=[days_of_weak[i] for i in series.index], autopct='%0.1f%%')
ax2.title.set_text('Sales Share per Days of Week')

* Clearly we can see that there's a significant difference in average sales at 5,6 i.e **Saturday** and **Sunday**

### 7.2 Weekly trends of Various Products

In [None]:
# Average sales per weekday, one bar chart per product on a 2x2 grid.
# A single loop replaces three copy-pasted stanzas, and the labels now
# spell "Week" correctly.
fig = plt.figure(figsize=(16, 13))
for position, product in enumerate(['Hat', 'Mug', 'Sticker'], start=1):
    # Product names in the data carry a "Kaggle " prefix.
    series = train[train['product'] == f'Kaggle {product}'].groupby('weekday').num_sold.mean()
    ax = fig.add_subplot(2, 2, position)
    ax.bar(x=series.index, height=series.values)
    ax.title.set_text(f'Sales trends in Week for {product}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Days of Week')

* Similar trend for all products Constant on **Monday** to **Thursday**
* Little Growth on **Friday**
* Peak on **Saturday-Sunday**

### 7.3 Weekly trends in Countries

In [None]:
# Average sales per weekday, one bar chart per country on a 2x2 grid.
# A single loop replaces three copy-pasted stanzas, and the labels now
# spell "Week" correctly.
fig = plt.figure(figsize=(16, 13))
for position, country in enumerate(['Norway', 'Sweden', 'Finland'], start=1):
    series = train[train['country'] == country].groupby('weekday').num_sold.mean()
    ax = fig.add_subplot(2, 2, position)
    ax.bar(x=series.index, height=series.values)
    ax.title.set_text(f'Sales trends in Week in {country}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Days of Week')

* Same weekly trend in every country; the country doesn't affect weekend sales.

### 7.4 Weekly trends in Different Stores

In [None]:
# Average sales per weekday, one bar chart per store (top row of a 2x2 grid).
# A single loop replaces two copy-pasted stanzas, and the labels now
# spell "Week" correctly.
fig = plt.figure(figsize=(16, 13))
for position, store in enumerate(['KaggleRama', 'KaggleMart'], start=1):
    series = train[train['store'] == store].groupby('weekday').num_sold.mean()
    ax = fig.add_subplot(2, 2, position)
    ax.bar(x=series.index, height=series.values)
    ax.title.set_text(f'Sales trends in Week for {store}')
    ax.set_ylabel('Average Sales')
    ax.set_xlabel('Days of Week')
    

* No difference at all other than the number of sales; the trend is the same. The store doesn't impact the weekly trend.

## Model Training

## 1. Preparing Data for training

In [None]:
# Build the modelling frames: drop identifier/date columns and replace the
# three categorical columns with integer codes.
train_df = train.drop(['row_id','date','year'],axis = 1)
test_df = test.drop(['row_id','date','year'],axis = 1)
country_dict = {
    'Finland':1,
    'Sweden' :2,
    'Norway' :3
}
store_dict = {
    'KaggleMart':1,
    'KaggleRama':2
}
product_dict = {
    'Kaggle Sticker':1,
    'Kaggle Mug':2,
    'Kaggle Hat':3
}

# Column name -> lookup table used to encode that column.
encoders = {
    'country': country_dict,
    'store': store_dict,
    'product': product_dict,
}

# Encode with the vectorised Series.map instead of appending to Python lists
# row by row. `map` would silently turn an unknown category into NaN, so the
# explicit check keeps the fail-loudly behaviour (KeyError) of a dict lookup.
for frame in (train_df, test_df):
    for column, codes in encoders.items():
        unknown = set(frame[column].unique()) - set(codes)
        if unknown:
            raise KeyError(f"Unmapped {column} value(s): {sorted(map(str, unknown))}")
        frame[column] = frame[column].map(codes)

train_df.head()

In [None]:
# Every column except the target `num_sold` is a model input.
is_feature = train_df.columns != 'num_sold'
train_cols = train_df.columns[is_feature].tolist()
X = train_df.loc[:, train_cols]
y = train_df.loc[:, 'num_sold']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state fixes the split for reproducibility.
# NOTE(review): no shuffle argument is passed, so rows are presumably shuffled before splitting.
# Rows from the same dates would then land on both sides; a date-based holdout may be a fairer check — confirm.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

## 2. Training baseline XGBoost regressor

In [None]:
from xgboost import XGBRegressor

In [None]:
def smape(true, preds):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``mean(2 * |preds - true| / (|true| + |preds|)) * 100``.

    Both inputs are converted to NumPy float arrays first, so values are
    paired by position even when pandas objects with different indexes are
    passed. A pair in which both the true and the predicted value are zero
    contributes 0 to the mean instead of producing NaN from 0/0.

    Parameters
    ----------
    true : array-like
        Observed values.
    preds : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    float
        SMAPE as a percentage (between 0 and 200 for finite inputs).
    """
    true = np.asarray(true, dtype=float)
    preds = np.asarray(preds, dtype=float)
    abs_error = np.abs(preds - true)
    scale = np.abs(true) + np.abs(preds)
    # scale is 0 only when both values are 0, in which case the error is 0
    # too; leave those terms at 0 rather than dividing 0 by 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return float(np.sum(2 * ratio * 100) / len(true))

In [None]:
# Fit a default XGBoost regressor and score it on the held-out rows.
# NOTE(review): no eval_set is passed to fit(), so eval_metric presumably has no effect here — confirm.
xgb = XGBRegressor(eval_metric='mape', random_state=42)
xgb.fit(x_train, y_train)

preds = xgb.predict(x_test)
score = smape(y_test, preds)
print(score)

# Side-by-side view of actual vs. predicted values for a quick sanity check.
pred_df = pd.DataFrame(
    {
        'True value': y_test,
        'Predicted value': preds,
    }
)
pred_df.head(40)

In [None]:
# Baseline submission: predict on the encoded test frame and write row_id/num_sold to CSV.
preds = xgb.predict(test_df)
submission_df = pd.DataFrame(
    {
        'row_id': test['row_id'],
        'num_sold': preds,
    }
)
submission_df.to_csv('submit_baseline.csv', index=False)

### Model tuning/selection is loading.....
### Don't forget to upvote 😊