In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tqdm

from ipywidgets import widgets, interactive, interact
import ipywidgets as widgets
from IPython.display import display

import os

In [2]:
# Model output to post-process (KNN here). To use another model, swap the file name for
# one of: submission_rf.csv, submission_xgb.csv, submission_lgm.csv, submission_mlp.csv.
sub = pd.read_csv(filepath_or_buffer="submission_knn.csv")

In [3]:
# Historical sales table and the submission template (its `id` column and column labels are used later).
train_sales = pd.read_csv(filepath_or_buffer='../datasets/sales_train_validation.csv')
submission_file = pd.read_csv(filepath_or_buffer='../datasets/sample_submission.csv')


In [None]:
# Relabel the model output as `id` followed by the 28 day columns d_1914 .. d_1941,
# so that it lines up with the d_<n> naming of the sales table.
sub_col = ['id'] + ['d_' + str(day) for day in range(1914, 1914 + 28)]
sub.columns = sub_col

In [None]:
# Append the 28 relabelled model-output columns to the sales history, matching rows on `id`.
# Default (inner) join: rows whose `id` is missing on either side are dropped.
train_sales = pd.merge(left=train_sales, right=sub, on='id')

In [None]:
total = ['Total']

# Add one group-by key column per aggregation level: a constant 'Total' key plus
# "<first>_<second>" string combinations of the existing id columns.
train_sales = train_sales.assign(
    Total='Total',
    state_cat=train_sales['state_id'] + "_" + train_sales['cat_id'],
    state_dept=train_sales['state_id'] + "_" + train_sales['dept_id'],
    store_cat=train_sales['store_id'] + "_" + train_sales['cat_id'],
    store_dept=train_sales['store_id'] + "_" + train_sales['dept_id'],
    state_item=train_sales['state_id'] + "_" + train_sales['item_id'],
    item_store=train_sales['item_id'] + "_" + train_sales['store_id'],
)

In [None]:
val_eval = ['validation', 'evaluation']

# Name lists for the coarse aggregation levels.
total = ['Total']
states = ['CA', 'TX', 'WI']
num_stores = [('CA', 4), ('TX', 3), ('WI', 3)]  # (state, number of stores in that state)
stores = [
    f"{state}_{store_no}"
    for state, store_count in num_stores
    for store_no in range(1, store_count + 1)
]
print(stores)  # store names, e.g. CA_1

In [None]:
cats = ['FOODS', 'HOBBIES', 'HOUSEHOLD']
num_depts = [('FOODS', 3), ('HOBBIES', 2), ('HOUSEHOLD', 2)]  # (category, number of departments)
depts = [
    f"{cat}_{dept_no}"
    for cat, dept_count in num_depts
    for dept_no in range(1, dept_count + 1)
]

# Cross products of the geographic levels (state / store) with the product levels (category / department).
state_cats = [f"{state}_{cat}" for state in states for cat in cats]
state_depts = [f"{state}_{dept}" for state in states for dept in depts]
store_cats = [f"{store}_{cat}" for store in stores for cat in cats]
store_depts = [f"{store}_{dept}" for store in stores for dept in depts]

print(state_cats)  # state + category
print("=================================================================")
print(store_depts)  # store (includes the state) + department (includes the category)

In [None]:
# Distinct item ids, in order of first appearance in the sales table.
prods = list(train_sales['item_id'].unique())
# item id + state
prod_state = ["_".join((item, state)) for item in prods for state in states]
# item id + store (the store name includes the state)
prod_store = ["_".join((item, store)) for item in prods for store in stores]

In [None]:
# Quantile levels, kept as strings because they are spliced verbatim into the prediction ids.
quants = ['0.005', '0.025', '0.165', '0.250', '0.500', '0.750', '0.835', '0.975', '0.995']
# Day numbers 1 .. 1941: the 1913 historical days plus the 28 appended days.
days = range(1, 1913 + 28 + 1)
time_series_columns = ['d_' + str(day) for day in days]

In [None]:
def create_sales(name_list, group):
    '''
    Aggregate the daily sales of the global ``train_sales`` frame by ``group``.

    name_list: not used by this function; kept so existing callers keep working.
    group: column name (or other groupby key) of ``train_sales`` to aggregate on.

    Returns a DataFrame indexed by the group key, with one summed column per
    entry of the global ``time_series_columns``.
    '''
    grouped = train_sales.groupby(group)
    # Per the original author's note, the sum is not strictly needed at the lowest level.
    daily_totals = grouped[time_series_columns].sum()
    return daily_totals

In [None]:
def create_quantile_dict(name_list = stores, group = 'store_id', X = False, n_recent_days = 84):
    '''
    Build quantile predictions for one aggregation level and write them to the global dictionary my_dict.

    For every series of the level (one row of ``create_sales(name_list, group)``):
      1. keep only the last ``n_recent_days`` daily columns;
      2. compute each quantile in the global ``quants`` over that window;
      3. compute 7 day-of-week factors: mean of every 7th column (offset 0..6)
         divided by the mean of the whole window;
      4. repeat the 7 factors 4 times (28 values) and scale them by each quantile.

    Each 28-value array is stored in ``my_dict`` under
    ``<series><sep><quantile>_validation`` and ``<series><sep><quantile>_evaluation``
    (both keys reference the same array), where ``<sep>`` is "_X_" when ``X`` is
    truthy and "_" otherwise.

    Parameters
    ----------
    name_list : forwarded to ``create_sales`` (which does not use it).
    group : group-by key of the global ``train_sales`` defining the level.
    X : whether ids of this level use the "_X_" separator.
    n_recent_days : size of the trailing window, a positive multiple of 7 so the
        day-of-week offsets stay aligned with the end of the window. The default
        of 84 reproduces the previous hard-coded ``iloc[:, 1857:]`` slice for the
        notebook's 1941 daily columns.

    Raises
    ------
    ValueError : if ``n_recent_days`` is not a positive multiple of 7.

    Notes
    -----
    A series whose window mean is zero produces NaN/inf factors (division by
    zero); they are written to ``my_dict`` unchanged.
    '''
    if n_recent_days <= 0 or n_recent_days % 7 != 0:
        raise ValueError("n_recent_days must be a positive multiple of 7")

    sales = create_sales(name_list, group)
    sales = sales.iloc[:, -n_recent_days:]  # trailing window only

    # One row per quantile, one column per series: shape (len(quants), n_series).
    quantile_values = np.quantile(sales.to_numpy(), [float(q) for q in quants], axis = 1)

    # Mean over the whole window, and mean per position within the 7-day cycle.
    full_mean = sales.mean(axis = 1)
    daily_means = pd.DataFrame(index = sales.index)
    for weekday in range(7):
        daily_means[str(weekday)] = sales.iloc[:, weekday::7].mean(axis = 1)

    # Row-wise ratio "day-of-week mean / overall mean", tiled to 4 weeks = 28 days.
    daily_factors = daily_means.div(full_mean, axis = 0)
    horizon_factors = np.tile(daily_factors.to_numpy(), (1, 4))

    separator = "_X_" if X else "_"
    for row, series_name in enumerate(tqdm(sales.index)):
        row_factors = horizon_factors[row]
        for q_pos, q in enumerate(quants):
            v = quantile_values[q_pos, row] * row_factors
            my_dict[series_name + separator + q + "_validation"] = v
            my_dict[series_name + separator + q + "_evaluation"] = v

In [None]:
my_dict = {}

# The 12 aggregation levels, processed in order:
# (series names, group-by column, whether ids use the "_X_" separator).
aggregation_levels = [
    (total, 'Total', True),             # 1
    (states, 'state_id', True),         # 2
    (stores, 'store_id', True),         # 3
    (cats, 'cat_id', True),             # 4
    (depts, 'dept_id', True),           # 5
    (state_cats, 'state_cat', False),   # 6
    (state_depts, 'state_dept', False), # 7
    (store_cats, 'store_cat', False),   # 8
    (store_depts, 'store_dept', False), # 9
    (prods, 'item_id', True),           # 10
    (prod_state, 'state_item', False),  # 11
    (prod_store, 'item_store', False),  # 12
]

# Each call adds that level's predictions to my_dict.
for level_names, level_group, level_uses_x in aggregation_levels:
    create_quantile_dict(level_names, level_group, X=level_uses_x)

In [None]:
# One row per prediction id, 28 positional columns (0..27).
pred_df = pd.DataFrame(my_dict).T
pred_df_reset = pred_df.reset_index()  # the prediction ids move into an 'index' column

# Keep only ids present in the submission template, in the template's order,
# then drop the duplicated key column.
final_pred = (
    submission_file[['id']]
    .merge(pred_df_reset, left_on='id', right_on='index')
    .drop(columns='index')
)

# Positional columns 0..27 become the submission labels F1..F28.
final_pred = final_pred.rename(columns={pos: f'F{pos + 1}' for pos in range(28)})

In [None]:
# Replace missing predictions with 0 (NaNs presumably come from series whose recent mean
# is zero, giving a 0/0 day-of-week factor; TODO confirm).
final_pred = final_pred.fillna(value=0)

In [None]:
# Bare expression: the notebook renders the assembled prediction table as this cell's output.
final_pred

In [None]:
# Write the final predictions without the row index. Keep the file name in step with the
# model output loaded at the top of the notebook; the alternatives are
# submission_rf_final_small.csv, submission_xgb_final_small.csv,
# submission_lgm_final_small.csv and submission_mlp_final_small.csv.
final_pred.to_csv(path_or_buf='submission_knn_final_small.csv', index=False)
