# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it will be really appreciated and will motivate me to offer more content to the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np
import warnings


# Silence every warning category for the rest of the notebook.
warnings.simplefilter("ignore")

# Read both competition splits in one pass; the unpacking order matches the path order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2022/train.csv",
        "../input/tabular-playground-series-jan-2022/test.csv",
    )
)
# Last expression of the cell: show the training frame.
train

# Data Cleaning

In [None]:
# Number of missing entries in each column of the training frame.
missing_per_column = train.isna().sum()
missing_per_column

# Feature Engineering

In [None]:
# Parse the date column once, then derive the calendar features from a single .dt accessor.
train['date'] = pd.to_datetime(train['date'])
date_parts = train['date'].dt

train['year'] = date_parts.year
train['month'] = date_parts.month
train['day'] = date_parts.day
train['dayofweek'] = date_parts.dayofweek
# NOTE: despite its name, this column stores the number of days in the month
# (days_in_month), not the day of the month -- that value is in 'day' above.
train['dayofmonth'] = date_parts.days_in_month
train['dayofyear'] = date_parts.dayofyear
# 'weekday' carries the same values as 'dayofweek'; it is kept because later cells
# list it explicitly in their feature lists.
train['weekday'] = date_parts.weekday
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. The cast restores the plain int64 dtype that
# weekofyear produced (assumes the date column has no missing values -- the cast
# would fail on NaT).
train['weekofyear'] = date_parts.isocalendar().week.astype('int64')

# Exploratory Data Analysis

In [None]:
# Total units sold per country, returned as a one-column DataFrame.
train.groupby('country')[['num_sold']].sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the num_sold distribution of each country on one shared axis so they can be
# compared directly.
fig, ax = plt.subplots(figsize=(24, 12))
for country in ('Finland', 'Norway', 'Sweden'):
    # sns.distplot is deprecated; histplot with kde=True and stat='density' is the
    # supported way to draw a density-normalised histogram with a KDE curve.
    sns.histplot(
        x=train.loc[train['country'] == country, 'num_sold'],
        kde=True,
        stat='density',
        label=country,
        ax=ax,
    )
plt.legend()
plt.show()

In [None]:
# One box plot of num_sold per categorical column, laid out side by side.
fig, ax = plt.subplots(1, 3, figsize=(24, 12))
for panel, category in zip(ax, ('country', 'store', 'product')):
    sns.boxplot(data=train, x=category, y='num_sold', ax=panel)
plt.show()

In [None]:
# num_sold over time, split by each categorical column in turn (one panel per column).
fig, ax = plt.subplots(3, 1, figsize=(24, 12))
for panel, group_col in zip(ax, ('country', 'store', 'product')):
    sns.lineplot(data=train, x='date', y='num_sold', hue=group_col, ax=panel)
    # Rotate the tick labels matplotlib already generated. Passing train['date'] to
    # set_xticklabels would overwrite them with the first rows of the column, which do
    # not correspond to the tick positions.
    panel.tick_params(axis='x', labelrotation=90)
plt.show()

## Correlations

In [None]:
# For each product, show how every date feature correlates with num_sold.
fig, ax = plt.subplots(3, 1, figsize=(12, 4))
plt.subplots_adjust(hspace=1)
feature_list = ['year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekday', 'weekofyear', 'num_sold']
for panel, product in zip(ax, ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker')):
    # num_sold is the last entry of feature_list, so the last row of the correlation
    # matrix holds the correlation of every feature with num_sold.
    corr_with_sales = train.loc[train['product'] == product, feature_list].corr().iloc[-1:, :]
    # Correlations range over [-1, 1]. A symmetric colour scale with a diverging
    # colormap keeps negative correlations visually distinct instead of clipping them
    # to the same colour as zero.
    sns.heatmap(corr_with_sales, annot=True, vmin=-1, vmax=1, cmap='coolwarm', ax=panel)
    panel.set_title(product)
plt.show()

## P-values

In [None]:
from scipy.stats import pearsonr

p_feature_list = ['year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekday', 'weekofyear']

# Slice the rows of each product once and reuse the slice for every feature.
mug_rows = train.loc[train['product'] == 'Kaggle Mug']
hat_rows = train.loc[train['product'] == 'Kaggle Hat']
sticker_rows = train.loc[train['product'] == 'Kaggle Sticker']

# pearsonr(...)[1] is the p-value of the correlation between num_sold and the feature;
# each value is rounded to four decimals.
p_mug_list = [round(pearsonr(mug_rows['num_sold'], mug_rows[c])[1], 4) for c in p_feature_list]
p_hat_list = [round(pearsonr(hat_rows['num_sold'], hat_rows[c])[1], 4) for c in p_feature_list]
p_sticker_list = [round(pearsonr(sticker_rows['num_sold'], sticker_rows[c])[1], 4) for c in p_feature_list]

# One column per product, one row per feature.
p_dict = {
    'Kaggle Mug': p_mug_list,
    'Kaggle Hat': p_hat_list,
    'Kaggle Sticker': p_sticker_list,
}
p_values_df = pd.DataFrame(p_dict, columns=p_dict.keys(), index=p_feature_list)

def p_value_warning_background(cell_value):
    """Return the CSS string used to style one cell of the p-value table.

    Parameters
    ----------
    cell_value : number
        The value of the cell, compared against the 0.05 threshold.

    Returns
    -------
    str
        'background-color: lightcoral;' when ``cell_value`` is strictly greater
        than 0.05, otherwise an empty string (no styling).
    """
    return 'background-color: lightcoral;' if cell_value > 0.05 else ''

# Render the p-value table, styling every cell with p_value_warning_background.
p_values_styler = p_values_df.style
p_values_styler.applymap(p_value_warning_background)

## Distributions

In [None]:
pip install pmdarima

### Kaggle Mug

In [None]:
from pmdarima import auto_arima
import statsmodels.api as sm

# The product filter alone keeps the rows of every country and store, so the raw column
# holds several observations per day. Fitting SARIMAX on it would make the seasonal
# period of 7 mean "7 rows" rather than "7 days". Summing per date yields one
# observation per day (total units of the product sold that day), ordered by date.
mug_daily_sales = (
    train.loc[train['product'] == 'Kaggle Mug']
    .groupby('date')['num_sold']
    .sum()
)

# SARIMA(1,1,1)x(1,1,1,7): weekly seasonality on the daily series.
mod = sm.tsa.statespace.SARIMAX(mug_daily_sales,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 1, 7))
results = mod.fit(disp=False)
# tables[1] of the summary is the coefficient table.
display(results.summary().tables[1])

sns.set(font_scale=1.5)
results.plot_diagnostics(figsize=(16, 8))
plt.show()

### Kaggle Hat

In [None]:
# The product filter alone keeps the rows of every country and store, so the raw column
# holds several observations per day. Fitting SARIMAX on it would make the seasonal
# period of 7 mean "7 rows" rather than "7 days". Summing per date yields one
# observation per day (total units of the product sold that day), ordered by date.
hat_daily_sales = (
    train.loc[train['product'] == 'Kaggle Hat']
    .groupby('date')['num_sold']
    .sum()
)

# SARIMA(1,1,1)x(1,1,1,7): weekly seasonality on the daily series.
mod = sm.tsa.statespace.SARIMAX(hat_daily_sales,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 1, 7))
results = mod.fit(disp=False)
# tables[1] of the summary is the coefficient table.
display(results.summary().tables[1])

sns.set(font_scale=1.5)
results.plot_diagnostics(figsize=(16, 8))
plt.show()

### Kaggle Sticker

In [None]:
# The product filter alone keeps the rows of every country and store, so the raw column
# holds several observations per day. Fitting SARIMAX on it would make the seasonal
# period of 7 mean "7 rows" rather than "7 days". Summing per date yields one
# observation per day (total units of the product sold that day), ordered by date.
sticker_daily_sales = (
    train.loc[train['product'] == 'Kaggle Sticker']
    .groupby('date')['num_sold']
    .sum()
)

# SARIMA(1,1,1)x(1,1,1,7): weekly seasonality on the daily series.
mod = sm.tsa.statespace.SARIMAX(sticker_daily_sales,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 1, 7))
results = mod.fit(disp=False)
# tables[1] of the summary is the coefficient table.
display(results.summary().tables[1])

sns.set(font_scale=1.5)
results.plot_diagnostics(figsize=(16, 8))
plt.show()