In [1]:
import pandas as pd
import numpy as np
import dateutil.easter as easter
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

In [2]:
# Single location of the competition files, so the path only has to be edited once.
DATA_DIR = '/home/onyxia/work/Forecasting_Sticker_Sales'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# The dates are read as strings and must be converted to datetimes.
# The converted column then becomes the index; drop=False keeps it as a
# regular column as well, so both df.index and df.date are available.
for df in [train_df, test_df]:
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True, drop=False)
train_df

Unnamed: 0_level_0,id,date,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
2010-01-01,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2010-01-01,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
2010-01-01,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
2010-01-01,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
2016-12-31,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
2016-12-31,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
2016-12-31,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
2016-12-31,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


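There are 6 countries, 3 stores and 5 products. For each of the 90 combinations of these, we have one row per day for 2557 days. The 2557 days are all the days of the seven years 2010, 2011, 2012, 2013, 2014, 2015 and 2016. There are no missing dates; the `num_sold` column, however, does contain missing values (for example, the Canada / Discount Stickers / Holographic Goose series shows NaN in the summaries below).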

* Countries: 6
* Stores: 3
* Products: 5

6×3×5=90

In [3]:
# Number of rows for each (country, store, product) series
series_keys = ['country', 'store', 'product']
print(train_df.groupby(series_keys)['date'].count())

# Date range covered by the training data
first_day, last_day = train_df['date'].min(), train_df['date'].max()
print("First day:", first_day, "   Last day:", last_day)

# Seven years of 365 days, plus one extra day for each of the two leap years
print("Number of days in seven years:", 7 * 365 + 2)

# Expected row count (90 series x 2557 days), actual shape, number of missing dates
print(90 * 2557, train_df.shape, train_df['date'].isna().sum())

country    store              product           
Canada     Discount Stickers  Holographic Goose     2557
                              Kaggle                2557
                              Kaggle Tiers          2557
                              Kerneler              2557
                              Kerneler Dark Mode    2557
                                                    ... 
Singapore  Stickers for Less  Holographic Goose     2557
                              Kaggle                2557
                              Kaggle Tiers          2557
                              Kerneler              2557
                              Kerneler Dark Mode    2557
Name: date, Length: 90, dtype: int64
First day: 2010-01-01 00:00:00    Last day: 2016-12-31 00:00:00
Number of days in seven years: 2557
230130 (230130, 6) 0


In [4]:
# Min / max / mean of num_sold per (country, store, product): first 45 series
(
    train_df
    .groupby(['country', 'store', 'product'])['num_sold']
    .agg(['min', 'max', 'mean'])
    .head(45)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min,max,mean
country,store,product,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Canada,Discount Stickers,Holographic Goose,,,
Canada,Discount Stickers,Kaggle,508.0,1216.0,715.652718
Canada,Discount Stickers,Kaggle Tiers,412.0,1002.0,592.579194
Canada,Discount Stickers,Kerneler,200.0,534.0,323.602113
Canada,Discount Stickers,Kerneler Dark Mode,211.0,660.0,377.781776
Canada,Premium Sticker Mart,Holographic Goose,200.0,449.0,248.382177
Canada,Premium Sticker Mart,Kaggle,1146.0,2749.0,1712.174032
Canada,Premium Sticker Mart,Kaggle Tiers,1004.0,2484.0,1417.701995
Canada,Premium Sticker Mart,Kerneler,470.0,1291.0,773.023465
Canada,Premium Sticker Mart,Kerneler Dark Mode,527.0,1493.0,904.39343


In [5]:
# Min / max / mean of num_sold per (country, store, product): last 45 series
(
    train_df
    .groupby(['country', 'store', 'product'])['num_sold']
    .agg(['min', 'max', 'mean'])
    .tail(45)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min,max,mean
country,store,product,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kenya,Discount Stickers,Holographic Goose,,,
Kenya,Discount Stickers,Kaggle,10.0,30.0,17.160735
Kenya,Discount Stickers,Kaggle Tiers,8.0,25.0,14.109894
Kenya,Discount Stickers,Kerneler,5.0,14.0,7.57899
Kenya,Discount Stickers,Kerneler Dark Mode,5.0,17.0,8.824335
Kenya,Premium Sticker Mart,Holographic Goose,5.0,11.0,5.941915
Kenya,Premium Sticker Mart,Kaggle,25.0,73.0,41.732499
Kenya,Premium Sticker Mart,Kaggle Tiers,22.0,60.0,34.460696
Kenya,Premium Sticker Mart,Kerneler,10.0,36.0,18.615565
Kenya,Premium Sticker Mart,Kerneler Dark Mode,11.0,42.0,21.827141


In [6]:
# First and last day covered by the test set
(test_df['date'].min(), test_df['date'].max())

(Timestamp('2017-01-01 00:00:00'), Timestamp('2019-12-31 00:00:00'))

In [7]:
# Mean num_sold per (country, product), with one column per store
kk = (
    train_df
    .groupby(['country', 'store', 'product'])['num_sold']
    .mean()
    .unstack(level='store')
)

# Pairwise ratios of the per-store means: (new column, numerator, denominator)
store_ratios = [
    ('Discount:StickersForLess', 'Discount Stickers', 'Stickers for Less'),
    ('Discount:Premium', 'Discount Stickers', 'Premium Sticker Mart'),
    ('StickersForLess:Premium', 'Stickers for Less', 'Premium Sticker Mart'),
]
for ratio_name, numerator, denominator in store_ratios:
    kk[ratio_name] = kk[numerator] / kk[denominator]

kk

Unnamed: 0_level_0,store,Discount Stickers,Premium Sticker Mart,Stickers for Less,Discount:StickersForLess,Discount:Premium,StickersForLess:Premium
country,product,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Canada,Holographic Goose,,248.382177,228.816653,,,0.921228
Canada,Kaggle,715.652718,1712.174032,1449.02151,0.493887,0.417979,0.846305
Canada,Kaggle Tiers,592.579194,1417.701995,1198.617912,0.494385,0.417986,0.845465
Canada,Kerneler,323.602113,773.023465,654.836918,0.494172,0.418619,0.847111
Canada,Kerneler Dark Mode,377.781776,904.39343,763.250293,0.494964,0.417718,0.843936
Finland,Holographic Goose,98.001564,235.260461,198.955808,0.49258,0.416566,0.845683
Finland,Kaggle,703.688698,1684.023074,1424.984357,0.493822,0.417862,0.846179
Finland,Kaggle Tiers,581.462652,1388.125147,1175.055925,0.494838,0.418883,0.846506
Finland,Kerneler,317.497849,758.843176,641.964803,0.494572,0.418397,0.845978
Finland,Kerneler Dark Mode,372.158389,890.979664,752.648025,0.494465,0.417696,0.844742


### The NaN entries appear because the `num_sold` column contains missing values