In [None]:
# This is my commonly-used standard data science libraries
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

# Basic package
import re
import itertools
import datetime
from dateutil import parser
import os
import operator

# DS basic
from scipy import stats
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O


# Viz
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Show up to 50 columns when displaying wide frames.
# The fully qualified key is required: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on newer pandas releases, where it
# raises OptionError ("Pattern matched multiple keys").
# Pass None instead of 50 to remove the column limit entirely.
pd.set_option('display.max_columns', 50)
# Adopt the 'bmh' matplotlib theme and expose its color sequence two ways:
#   color_pal   - a list, for picking a color by position
#   color_cycle - an endless iterator, for "give me the next color" usage
plt.style.use('bmh')
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Cycle over a copy so the iterator does not share state with color_pal.
color_cycle = itertools.cycle(list(color_pal))

# Acknowledgements

1. [M5 Competition : EDA + Models 📈 ](https://www.kaggle.com/tarunpaparaju/m5-competition-eda-models) ~ by Tarun Paparaju
2. [EDA and Baseline Model ](https://www.kaggle.com/rdizzl3/eda-and-baseline-model)
3. [Mean encodings and PCA options](https://www.kaggle.com/kyakovlev/m5-custom-features)
4. [Lags and rolling lags](https://www.kaggle.com/kyakovlev/m5-lags-features)
5. [Base Grid and base features (calendar/price/etc)](https://www.kaggle.com/kyakovlev/m5-simple-fe)


# Understand data and the task
* There are two parallel competitions: Accuracy and Uncertainty
     + The accuracy competition will use the metric: Weighted Root Mean Squared Scaled Error (WRMSSE)
     + The uncertainty competition will use the metric: Weighted Scaled Pinball Loss (WSPL)


* The sales data from Wal-Mart which covers stores in three US States (California, Texas, and Wisconsin) includes item level, department, product categories, and store details.

* Structure of the sales file: each row is one item (physical product) in one store (30490 rows in total), uniquely identified by an id built from the item type, state, and store; exact item names are not given. Apart from the basic and redundant identifier columns (again: the item type, state, and store), each column holds the sales for one day.

* In the calendar and sell_price files, it provides explanatory variables such as price, promotions, day of the week, and special events.

* I transpose the rows and columns because, for sequential data, this makes it easier to analyse the time-relevant characteristics of each sales series. This is especially true once real dates are used as the index: we can then use the existing time features and also extract more of them (day of week, holidays, etc.) with the pandas library.

* For submission, sales for the next 28 days (days 1914 to 1941) are forecast. The forecast for days 1942 to 1969 will also be used for evaluation later.

* As expected, the sales data is very erratic, owing to the fact that so many factors affect the sales on a given day. On certain days, the sales quantity is zero, which indicates that a certain product may not be available on that day (as noted by Rob in his kernel).

In [None]:
# Build a date-indexed sales table: one row per calendar date, one column per
# series id. The day columns are transposed into the index, matched against
# the calendar's 'd' key to pick up the 'date' column, and 'date' then becomes
# the index.
# NOTE(review): `sale_df`, `cal_df` and `d_cols` are not defined in any cell
# visible above this one (`d_cols` is assigned in a later cell, and later cells
# use the names `stv` / `cal`) - confirm this cell works after Restart & Run All.
past_sales = (
    sale_df.set_index('id')[d_cols]
    .T
    .merge(cal_df.set_index('d')['date'], left_index=True, right_index=True)
    .set_index('date')
)
past_sales.head()

# Exploratory Data Analysis
* Distribution of sales values(Good Discussion in [EDA and Baseline Model ](https://www.kaggle.com/rdizzl3/eda-and-baseline-model)), including
    + the distribution of zeros for each of the series has a mean around 0.8 which means there is a lot of intermittent data!
    + the distribution of max number of sales for each of the series is also explored and  'a lot of the items have a max number of sales **between 2 and 12**. There are also some items with a very high number of sales for a particular item. It might be fruitful to investigate these items and whether it was a holiday or not.'
* Sales by items
    + a lot of the items have intermittent demand. These are series that have many zeros with bursts of demand in between. This will be one of the biggest challenges in this competition.
    + In the analysis above it looks like a lot of the time series data start with leading zeros. I believe we can characterize these leading zeros as items that were not selling or available to sell for those periods of time. This might not be a good assumption for every series. We can investigate the distribution of leading zeros; this could help us bring down the large data size (although it may not be a good choice for algorithms such as ARIMA).
(as noted in https://www.kaggle.com/rdizzl3/eda-and-baseline-model)
* Total Sales by Categories
* Sales broken down by time variables

* Distribution of price

* Sales Price

* Summary:
    + there is a trend in the sales data; therefore, when modelling with tree models such as LightGBM, the predictions may need to be multiplied by a constant (e.g. 1.1) to follow the trend.
    + seasonality in week.
    + many 0.

In [None]:
## Distribution of sales values
# Widgets used by the interactive "Sales by items" browser below.
from ipywidgets import widgets, interactive, interact
from IPython.display import display

# Names of the daily sales columns, d_1 ... d_1913.
# Defined before the histogram because the histogram indexes `stv` with them;
# previously they were only assigned further down in this same cell.
days = range(1, 1913 + 1)
d_cols = [f'd_{i}' for i in days]

# For every series, the position of its first non-zero day, i.e. the number of
# leading zero-sales days. argmax returns 0 both when the very first day has
# sales and when the whole row is zero, so all-zero series land in the 0 bin.
pd.Series((stv[d_cols].values != 0).argmax(axis=1)).hist(figsize=(25, 5), bins=100)


## Sales by items

# Offer a random sample of at most 1000 series in the dropdown.
# Sampling is done without replacement so that no id is listed twice, and the
# sample size is capped at the number of distinct ids so the draw cannot fail
# on a smaller frame.
unique_ids = stv['id'].unique()
ids = np.random.choice(unique_ids, size=min(1000, len(unique_ids)), replace=False)

series_ids = widgets.Dropdown(
    options=ids,
    value=ids[0],
    description='series_ids:'
)

def plot_data(series_ids):
    """Plot one series' daily sales with its 7-day rolling mean and overall mean.

    Parameters
    ----------
    series_ids : str
        Value of the ``id`` column of ``stv`` selecting the series to draw.
        The parameter name must stay ``series_ids`` because the widget wiring
        passes the dropdown by that keyword.

    Notes
    -----
    Reads the module-level ``stv`` frame and ``d_cols`` column list.
    """
    sales = pd.Series(stv.loc[stv['id'] == series_ids, d_cols].values.flatten())

    # Keep a handle on the Axes so every later artist lands on the same plot.
    ax = sales.plot(figsize=(20, 10), lw=2, marker='*')
    sales.rolling(7).mean().plot(ax=ax, lw=2, marker='o', color='orange')
    ax.axhline(sales.mean(), lw=3, color='red')
    # Ask for the grid explicitly: a bare grid() call *toggles* visibility,
    # which switches the grid off under styles that already enable it
    # (such as the 'bmh' style this notebook selects).
    ax.grid(True)
    
# Wire the dropdown to plot_data (the widget is passed under the keyword that
# matches plot_data's parameter name) and render the resulting control.
w = interactive(plot_data, series_ids=series_ids)
display(w)


## Total Sales by Categories
# `stv.groupby('cat_id').sum().T.plot()` would be shorter, but `stv` carries no
# real dates, so the date-indexed `past_sales` table is used instead.
cat_ids = stv['cat_id'].unique()

# Draw on a dedicated figure so the lines cannot end up on axes left over from
# an earlier plot in the same cell.
fig, ax = plt.subplots(figsize=(15, 5))
for cat_id in cat_ids:
    # Series columns are ids, and each id contains its category name.
    items_col = [c for c in past_sales.columns if cat_id in c]
    past_sales[items_col] \
        .sum(axis=1) \
        .plot(alpha=0.8,
              title='Total Sales by Item Type',
              ax=ax)
# Lines were added in `cat_ids` order, so the labels line up positionally.
ax.legend(cat_ids)
plt.show()


## Sales broken down by time variables
# - Now that we have our example item lets see how it sells by:
#     - Day of the week
#     - Month
#     - Year

def _build_example(item_id):
    """Return the daily sales of one series joined with the calendar.

    Parameters
    ----------
    item_id : str
        Series name without the ``_validation`` suffix, e.g.
        ``'FOODS_3_090_CA_3'``. It also becomes the name of the sales column.

    Returns
    -------
    pandas.DataFrame
        One row per day column in ``d_cols``: a ``d`` key column, the sales
        column named ``item_id``, and every column of ``cal`` matched on the
        columns the two frames share.

    Raises
    ------
    ValueError
        If ``stv`` does not contain exactly one row for the requested series.
    """
    sales = stv.loc[stv['id'] == f'{item_id}_validation', d_cols].T
    if sales.shape[1] != 1:
        raise ValueError(
            f'expected exactly one row for {item_id}_validation, found {sales.shape[1]}'
        )
    # Name the single column directly instead of renaming a hard-coded row
    # label, so the result does not depend on where the row sits in `stv`.
    sales.columns = [item_id]
    return (
        sales.reset_index()
        .rename(columns={'index': 'd'})  # expose the day key as column "d"
        .merge(cal, how='left', validate='1:1')
    )


# Merge calendar on our items' data
example = _build_example('FOODS_3_090_CA_3')
# Select more top selling examples
example2 = _build_example('HOBBIES_1_234_CA_3')
example3 = _build_example('HOUSEHOLD_1_118_CA_3')


examples = ['FOODS_3_090_CA_3','HOBBIES_1_234_CA_3','HOUSEHOLD_1_118_CA_3']
example_df = [example, example2, example3]

# One panel per calendar grouping: (group-by column, panel title, line color).
time_panels = [
    ('wday', 'average sale: day of week', color_pal[0]),
    ('month', 'average sale: month', color_pal[4]),
    ('year', 'average sale: year', color_pal[2]),
]

for item_name, item_df in zip(examples, example_df):
    fig, axes = plt.subplots(1, 3, figsize=(15, 3))
    for ax, (time_col, panel_title, line_color) in zip(axes, time_panels):
        # Select the sales column BEFORE averaging: averaging the whole frame
        # also tries the string/date calendar columns, which fails on pandas
        # versions where numeric_only defaults to False, and is wasted work
        # elsewhere.
        item_df.groupby(time_col)[item_name].mean() \
            .plot(kind='line',
                  title=panel_title,
                  lw=5,
                  color=line_color,
                  ax=ax)
    fig.suptitle(f'Trends for item: {item_name}',
                 size=20,
                 y=1.1)
    plt.tight_layout()
    plt.show()
    
    

## Sales Price
# Price history of one item, one colored point series per store.
fig, ax = plt.subplots(figsize=(15, 5))
stores = []
for store, store_prices in sellp.query('item_id == "FOODS_3_090"').groupby('store_id'):
    # `label` names the series for the legend. The previous `legend=store`
    # passed the store id to a boolean flag, leaving every series labeled
    # 'sell_price'. The figure size is already fixed by plt.subplots above.
    store_prices.plot(x='wm_yr_wk',
                      y='sell_price',
                      style='.',
                      color=next(color_cycle),
                      title='FOODS_3_090 sale price over time',
                      ax=ax,
                      label=store)
    stores.append(store)
# Build the legend once, after every store has been drawn.
ax.legend()
plt.show()



## Distribution of price
# Histogram of log-transformed prices, one panel per product category.

# The category is the part of item_id before the first underscore.
sellp['Category'] = sellp['item_id'].str.split('_', n=1).str[0]

prices_by_cat = sellp.groupby('Category')
# Size the grid from the data instead of assuming exactly three categories;
# squeeze=False keeps `axs` two-dimensional even when there is a single panel.
fig, axs = plt.subplots(1, prices_by_cat.ngroups, figsize=(15, 4), squeeze=False)
for ax, (cat, cat_prices) in zip(axs.flat, prices_by_cat):
    np.log1p(cat_prices['sell_price']) \
        .plot(kind='hist',
              bins=20,
              title=f'Distribution of {cat} prices',
              ax=ax,
              color=next(color_cycle))
    # The transform is log1p, i.e. log(1 + price), so label it as such.
    ax.set_xlabel('Log(1 + price)')
plt.tight_layout()





## TODO
# - Simple prediction based on historical average sale by day of week
# - Facebook prophet model
# - lgbm/xgb model based on day features

# Baseline submission: every forecast column of a series is filled with that
# series' mean sales over the last 30 day columns.
thirty_day_avg_map = stv.set_index('id')[d_cols[-30:]].mean(axis=1).to_dict()
fcols = [f for f in ss.columns if 'F' in f]

# The forecast is the same for every horizon column, so compute it once.
# Ids in `ss` that have no row in `stv` get a forecast of 0.
baseline_forecast = ss['id'].map(thirty_day_avg_map).fillna(0)
for f in fcols:
    ss[f] = baseline_forecast

ss.to_csv('submission.csv', index=False)