In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from itertools import cycle
from tqdm import tqdm
# Show up to 50 columns when displaying wide frames. The fully qualified key is
# required: a bare 'max_columns' also matches 'styler.render.max_columns' in
# pandas >= 1.4 and makes set_option raise OptionError.
pd.set_option('display.max_columns', 50)
# Switch to the 'bmh' style sheet, then build an endless iterator over the
# colours of the active property cycle.
plt.style.use('bmh')
color_cycle = cycle(
    plt.rcParams['axes.prop_cycle'].by_key()['color']
)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the five competition tables from the Kaggle input directory.
INPUT_DIR = '/kaggle/input/m5-forecasting-uncertainty/'
cal, st_eval, st_valid, ss, sellp = (
    pd.read_csv(f'{INPUT_DIR}/{csv_name}')
    for csv_name in (
        'calendar.csv',
        'sales_train_evaluation.csv',
        'sales_train_validation.csv',
        'sample_submission.csv',
        'sell_prices.csv',
    )
)

In [None]:
# Quantile labels, kept as strings.
quants = [
    '0.005', '0.025', '0.165',
    '0.250', '0.500', '0.750',
    '0.835', '0.975', '0.995',
]
# The two suffixes combined with the quantile labels.
val_eval = ['validation', 'evaluation']
# Day numbers 1..1941 and the matching 'd_<n>' column labels.
days = range(1, 1942)
time_series_columns = ['d_' + str(day) for day in days]

def CreateSales(train_sales, name_list, group):
    '''Sum the daily sales of ``train_sales`` for every value of ``group``.

    Parameters
    ----------
    train_sales : pandas.DataFrame
        Frame holding the grouping column(s) and one ``d_<n>`` column per day.
    name_list : list
        Not used by the computation; kept so existing callers keep working.
    group : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group and one column per ``d_<n>`` column of
        ``train_sales``, in the frame's own column order.
    '''
    # Take the day columns from the frame itself instead of a fixed global
    # list, so a frame with a different number of days (fewer or more than
    # the hard-coded range) aggregates without a KeyError.
    day_columns = [
        col for col in train_sales.columns
        if isinstance(col, str) and col.startswith('d_') and col[2:].isdigit()
    ]
    return train_sales.groupby(group)[day_columns].sum()

def createTrainSet(sales_train_s, train_sales, name, group_level, X=False):
    '''Append one aggregation level of ``train_sales`` to ``sales_train_s``.

    Parameters
    ----------
    sales_train_s : pandas.DataFrame
        Rows accumulated so far (may be a completely empty frame).
    train_sales : pandas.DataFrame
        Raw sales frame handed to ``CreateSales``.
    name : list
        Passed through to ``CreateSales`` as its ``name_list`` argument.
    group_level : str or list of str
        Column(s) to aggregate by.
    X : bool, default False
        When true, ``'_X'`` is appended to every index label of the new rows.

    Returns
    -------
    pandas.DataFrame
        ``sales_train_s`` followed by the rows of the new level.
    '''
    sales_total = CreateSales(train_sales, name, group_level)
    if X:
        sales_total = sales_total.rename(index=lambda s: s + '_X')
    # A frame with no rows and no columns contributes nothing; returning the
    # new rows directly avoids concatenating with an empty frame.
    if sales_train_s.shape == (0, 0):
        return sales_total
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([sales_train_s, sales_total])

def get_agg_df(train_sales):
    '''Stack the daily sales totals of twelve aggregation levels into one frame.

    The levels are appended in this order: Total, state, store, category,
    department, state x category, state x department, store x category,
    store x department, item, state x item, item x store. Index labels of the
    Total, state, store, category, department and item levels carry an
    ``'_X'`` suffix.

    Side effect: the helper grouping columns (``Total``, ``state_cat``,
    ``state_dept``, ``store_cat``, ``store_dept``, ``state_item``,
    ``item_store``) are added to ``train_sales`` in place.

    Parameters
    ----------
    train_sales : pandas.DataFrame
        Sales frame with ``state_id``, ``store_id``, ``cat_id``, ``dept_id``
        and ``item_id`` columns plus the daily sales columns.

    Returns
    -------
    pandas.DataFrame
        One row per aggregated series, plus a trailing ``id`` column that
        repeats the index.
    '''
    # Helper grouping keys, written onto the caller's frame.
    train_sales['Total'] = 'Total'
    train_sales['state_cat'] = train_sales.state_id + "_" + train_sales.cat_id
    train_sales['state_dept'] = train_sales.state_id + "_" + train_sales.dept_id
    train_sales['store_cat'] = train_sales.store_id + "_" + train_sales.cat_id
    train_sales['store_dept'] = train_sales.store_id + "_" + train_sales.dept_id
    train_sales['state_item'] = train_sales.state_id + "_" + train_sales.item_id
    train_sales['item_store'] = train_sales.item_id + "_" + train_sales.store_id

    # Name lists handed to createTrainSet for each level.
    total = ['Total']
    states = ['CA', 'TX', 'WI']
    num_stores = [('CA', 4), ('TX', 3), ('WI', 3)]
    stores = [x[0] + "_" + str(y + 1) for x in num_stores for y in range(x[1])]
    cats = ['FOODS', 'HOBBIES', 'HOUSEHOLD']
    num_depts = [('FOODS', 3), ('HOBBIES', 2), ('HOUSEHOLD', 2)]
    depts = [x[0] + "_" + str(y + 1) for x in num_depts for y in range(x[1])]
    state_cats = [state + "_" + cat for state in states for cat in cats]
    state_depts = [state + "_" + dept for state in states for dept in depts]
    store_cats = [store + "_" + cat for store in stores for cat in cats]
    store_depts = [store + "_" + dept for store in stores for dept in depts]
    prods = list(train_sales.item_id.unique())
    prod_state = [prod + "_" + state for prod in prods for state in states]
    prod_store = [prod + "_" + store for prod in prods for store in stores]

    # (name list, grouping column, add '_X' suffix) in output order.
    levels = [
        (total, 'Total', True),              # 1
        (states, 'state_id', True),          # 2
        (stores, 'store_id', True),          # 3
        (cats, 'cat_id', True),              # 4
        (depts, 'dept_id', True),            # 5
        (state_cats, 'state_cat', False),    # 6
        (state_depts, 'state_dept', False),  # 7
        (store_cats, 'store_cat', False),    # 8
        (store_depts, 'store_dept', False),  # 9
        (prods, 'item_id', True),            # 10
        (prod_state, 'state_item', False),   # 11
        (prod_store, 'item_store', False),   # 12
    ]

    sales_train_s = pd.DataFrame()
    for names, group_level, add_suffix in levels:
        sales_train_s = createTrainSet(
            sales_train_s, train_sales, names, group_level, X=add_suffix
        )
    sales_train_s['id'] = sales_train_s.index
    return sales_train_s
    

In [None]:
'''
Valid values for the `pattern` argument of get_sub_df:
    Total,
    state_id, store_id, cat_id, dept_id, item_id,
    state_cat, state_dept, store_cat, store_dept, state_item, item_store
'''
def get_sub_df(agg_df, st_valid, pattern):
    """Return a copy of the rows of ``agg_df`` that belong to one level.

    The row labels are the distinct values of ``st_valid[pattern]``; for the
    Total, state, store, category, department and item levels ``'_X'`` is
    appended to each label first. Every column of ``agg_df`` except the last
    one is kept.
    """
    suffixed_levels = ('Total', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id')
    value_columns = agg_df.columns[:-1].tolist()
    labels = st_valid[pattern].unique()
    if pattern in suffixed_levels:
        labels = [label + "_X" for label in labels]
    return agg_df.loc[labels, value_columns].copy()

def separate_weekend(df, ind_l):
    """Split ``df`` by its ``wday`` column and keep only the columns ``ind_l``.

    Returns a pair ``(weekend, weekday)``: rows whose ``wday`` is 1 or 2, and
    rows whose ``wday`` is 3, 4, 5, 6 or 7.
    """
    day_codes = df['wday']
    weekend_rows = df[day_codes.isin([1, 2])]
    weekday_rows = df[day_codes.isin([3, 4, 5, 6, 7])]
    return weekend_rows[ind_l], weekday_rows[ind_l]

from scipy import stats
def check_poisson(df):
    """Chi-square goodness-of-fit p-value of a sample against a Poisson law.

    The Poisson rate is estimated by the sample mean. The statistic sums
    ``(observed - expected)**2 / expected`` over the distinct values that
    occur in the sample and is referred to a chi-square distribution with
    ``n_distinct - 2`` degrees of freedom.

    Parameters
    ----------
    df : pandas.Series
        Sample of counts (a single series, despite the parameter name).

    Returns
    -------
    float
        The p-value, or ``nan`` when the sample has fewer than three distinct
        values (no positive degrees of freedom).

    NOTE(review): values that never occur in the sample contribute no term,
    and cells with small expected counts are not pooled, so this is not a
    textbook Pearson test - confirm that this is intended before relying on
    the p-values.
    """
    counts = df.value_counts()  # distinct values and their frequencies
    chi2_dof = len(counts) - 2  # chi2 degrees of freedom
    if chi2_dof <= 0:
        # The chi-square distribution is undefined here; say so explicitly.
        return float('nan')

    mu_hat = df.mean()
    # One vectorised pmf call instead of one scipy call per distinct value.
    expected = stats.poisson.pmf(counts.index.to_numpy(), mu_hat) * len(df)
    observed = counts.to_numpy()
    chi2_obs = ((observed - expected) ** 2 / expected).sum()
    # The survival function keeps precision in the upper tail, where
    # 1 - cdf rounds to exactly 0.0.
    return stats.chi2.sf(chi2_obs, df=chi2_dof)

In [None]:
# Aggregate the evaluation sales over every level. get_agg_df also adds its
# helper grouping columns to st_eval in place.
agg_df = get_agg_df(train_sales=st_eval)

In [None]:
# Item-level series, transposed to one row per day and joined to the calendar.
sub_df = get_sub_df(agg_df, st_eval, 'item_id')
ind_l = sub_df.index
sub_df = (
    sub_df.T.copy()
    .reset_index()
    .rename(columns={'index': 'd'})
)
merged_df = sub_df.merge(cal, on='d', how='left')

In [None]:
# Integer labels covering the last `window_size` positions up to and
# including the final index label of merged_df.
window_size = 365
tail = 1 + merged_df.index[-1]
start = tail - window_size
window_index = np.arange(start, tail)

In [None]:
# Threshold compared with the p-values, and the columns of the result tables.
alpha = 0.01
col_list = ["pval", "is_poisson", "mean", "median", "var"]

# Keep the rows whose original index label lies in the window, then split
# them with separate_weekend.
split_df = merged_df.reset_index().query('index in @window_index')
wend_df_all, wday_df_all = separate_weekend(split_df, ind_l)

In [None]:
# Weekend rows: goodness-of-fit p-value and summary statistics per series.
wend_result_df = pd.DataFrame(index=ind_l, columns=col_list)
for data_type in tqdm(ind_l):
    wend_df = wend_df_all[data_type]
    pval = check_poisson(wend_df)
    # One .loc[row, cols] call: the chained form .loc[row][cols] = ... assigns
    # into an intermediate object and is not guaranteed to update the frame.
    wend_result_df.loc[data_type, col_list] = [
        pval, pval >= alpha, wend_df.mean(), wend_df.median(), wend_df.var()
    ]

In [None]:
# Weekday rows: goodness-of-fit p-value and summary statistics per series.
wday_result_df = pd.DataFrame(index=ind_l, columns=col_list)
for data_type in tqdm(ind_l):
    wday_df = wday_df_all[data_type]
    pval = check_poisson(wday_df)
    # One .loc[row, cols] call: the chained form .loc[row][cols] = ... assigns
    # into an intermediate object and is not guaranteed to update the frame.
    wday_result_df.loc[data_type, col_list] = [
        pval, pval >= alpha, wday_df.mean(), wday_df.median(), wday_df.var()
    ]

In [None]:
# Inspect a single series: its test results and its empirical distributions.
wend_df_all, wday_df_all = separate_weekend(split_df, ind_l)
check_item = 'HOBBIES_1_002_X'

# The original cell printed `pval_df`, which is never defined in this
# notebook (NameError on a fresh kernel). Show the rows of the two result
# tables that are actually computed instead.
print(wend_result_df.loc[check_item])
print(wday_result_df.loc[check_item])

plt.title('item {} wend'.format(check_item))
wend_df_all[check_item].hist(bins=100, density=True)
plt.show()
plt.title('item {} wday'.format(check_item))
wday_df_all[check_item].hist(bins=100, density=True)
plt.show()