In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
import xgboost as xgb

  import pandas.util.testing as tm


In [None]:
%config ZMQInteractiveShell.ast_node_interactivity='all'

In [None]:
# Column dtypes handed to pd.read_csv for calendar.csv: event/weekday labels are
# read as categoricals, calendar keys as int16 and the SNAP flags as float32.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}

# Column dtypes handed to pd.read_csv for sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [None]:
# Forecasting constants shared by the cells below.
h = 28  # presumably the forecast horizon in days -- TODO confirm (not read by any code visible here)
max_lags = 57  # number of history days kept before a prediction day so lag/rolling features can be built
tr_last = 1913  # index of the last day of history (column d_1913)
fday = datetime(year=2016, month=4, day=25)  # calendar date of the first day to predict
fday

In [None]:
# Load the raw price table using the compact dtypes declared in PRICE_DTYPES.
# NOTE(review): create_dt() reads this file again on its own; no later cell visible here uses `prices`.
prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)

In [None]:
# Load the raw calendar table using the compact dtypes declared in CAL_DTYPES.
# NOTE(review): create_dt() reads this file again on its own; no later cell visible here uses `cal`.
cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)

In [None]:
# Load the full wide-format sales history with pandas' default dtypes.
# NOTE(review): create_dt() reads this file again with explicit dtypes; no later cell visible here uses this `dt`.
dt = pd.read_csv("sales_train_validation.csv")

In [None]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format table (one row per item id and day) used for modelling.

    Reads ``sell_prices.csv``, ``calendar.csv`` and ``sales_train_validation.csv``,
    integer-encodes their categorical columns, melts the wide ``d_<n>`` sales
    columns into ``d`` / ``sales`` rows and joins calendar and price information.

    Parameters
    ----------
    is_train : bool
        When True, history starts at ``first_day``. When False, history starts at
        ``max(tr_last - max_lags, first_day)`` and ``h`` extra day columns
        (``d_{tr_last+1}`` .. ``d_{tr_last+h}``) are appended with NaN sales so
        that predictions can be written into them later.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file (limits the item rows read).
        NOTE(review): store_id / item_id codes are derived independently for the
        price table and the sales table; with ``nrows`` set the sales table may
        contain fewer distinct ids, so the codes may no longer line up for the
        price merge -- verify before relying on ``nrows``.
    first_day : int
        Earliest day index to load.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with id/category codes, ``d``, ``sales``, the calendar
        columns and ``sell_price``. Both merges use pandas' default join type.

    Relies on the module-level names ``PRICE_DTYPES``, ``CAL_DTYPES``, ``tr_last``,
    ``max_lags`` and ``h``.
    """
    prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            # Label-encode store_id / item_id as small integers.
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            # Label-encode the event / weekday labels. Missing values get code -1,
            # so subtracting the minimum maps NaN to 0 and real labels to 1, 2, ...
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    # First day of history to load (not the first day to predict).
    start_day = max(1 if is_train else tr_last - max_lags, first_day)

    # Read only the id columns plus the needed d_<start_day>..d_<tr_last> columns.
    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("sales_train_validation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col != "id":
            # Label-encode every categorical column except the string id.
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the days to forecast; uses the shared horizon
        # constant `h` instead of a second hard-coded 28.
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    # Wide -> long: one row per (item id, day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [None]:
# Training history starts at day d_1500 (passed to create_dt as first_day).
FIRST_DAY = 1500
df = create_dt(is_train=True, first_day= FIRST_DAY)

In [None]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Long-format frame containing at least ``id``, ``sales`` and ``date``
        (datetime64). Rows are shifted per ``id`` in their existing order, so the
        frame is expected to be ordered by date within each id.

    Adds the columns
      * ``lag_7`` / ``lag_28``: ``sales`` of the same id 7 / 28 rows earlier;
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows of ``lag_<lag>``,
        per id, for lag and win in {7, 28};
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday`` as int16.
        A column that already exists is only cast; otherwise it is derived from
        ``date``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        # Sales of the same id `lag` rows (days) earlier.
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            # E.g. lag=7, win=28: mean of the 28 lagged-sales values ending 7 days
            # before the current row.
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # Feature name -> attribute of the ``.dt`` accessor that produces it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
            # its replacement (same ISO week number). Older pandas without
            # isocalendar() falls through to the getattr branch below.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")
            

# Add the lag / rolling-mean / date feature columns to the training frame
# (create_fea modifies df in place and returns nothing).
create_fea(df)

In [None]:
# Drop, in place, every row containing a missing value (presumably mostly the
# early rows of each id whose lag/rolling features are NaN -- verify).
df.dropna(inplace = True)
# NOTE(review): cat_feats is not referenced by any later cell visible here.
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Identifier, target and raw calendar-key columns that are excluded from the feature matrix.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[train_cols]
y_train = df["sales"]

# Fixed seed so the random hold-out sample below is reproducible.
np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 1_000_000, replace = False)  # draw 1,000,000 distinct index labels at random as a hold-out set
print(fake_valid_inds)
print(len(fake_valid_inds))

In [None]:
# Training rows are all index labels that were not drawn into the hold-out sample.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = xgb.DMatrix(X_train.loc[train_inds] , label = y_train.loc[train_inds])
# The validation set is a plain random sample; no time-based train/validation split is applied here.
fake_valid_data = xgb.DMatrix(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],)

In [None]:
# Parameter dictionary passed to xgb.train below.
params = {'eta':0.1,
          'gamma':0.1,
          'max_depth':8, 
          'subsample':1,
          'lambda': 0.1,
          'alpha':0.1,
          'objective':'count:poisson', # Poisson objective for count targets (per the XGBoost parameter docs)
          'eval_metric':'rmse', # metric reported on the watch list (mlogloss / logloss would be the classification counterparts)
          # 'num_class':3, # number of label classes; not needed for binary problems
          'min_child_weight':1,
          'colsample_bytree':1,
          'colsample_bylevel':1,
          # 'scale_pos_weight':0.8, # balances positive/negative samples, default 1
          # 'max_delta_step':0,
          # 'max_leaves': 0
          'seed':666,    
}  

In [None]:
# Evaluate on both the training split and the random hold-out split each round.
watch_list = [(train_data,'train'), (fake_valid_data,'valid')]
# Train for at most 5000 rounds with early_stopping_rounds=20.
m_xgb = xgb.train(params, train_data, num_boost_round=5000, evals=watch_list, early_stopping_rounds=20) 

In [None]:
# Persist the trained booster to disk.
# NOTE(review): the ".lgb" suffix suggests LightGBM although this is an XGBoost model;
# if it is renamed, the file name in the loading cell must be changed in the same step.
m_xgb.save_model("xgb_model.lgb")

In [None]:
# Reload the booster from the file written by the save cell (same "xgb_model.lgb" path).
m_xgb = xgb.Booster(model_file='xgb_model.lgb')

In [None]:
# Multipliers applied to the raw model predictions; one full forecast run is made per value.
alphas = [1.035, 1.03, 1.025]
# Blending weights for those runs: currently every run gets the same share.
weights = [1/len(alphas) for _ in alphas]

In [None]:
# Recursive day-by-day forecast, repeated once per alpha, then blended with `weights`
# and written out as the point-forecast submission.
sub = 0.

# Names of the 28 forecast columns expected in the submission (F1..F28).
cols = [f"F{i}" for i in range(1,29)]

# create_dt(False) only reads the CSV files and always yields the same frame, so it
# is built once here and copied per alpha instead of re-reading the files every run.
te_base = create_dt(False)

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
    # `te` holds history plus the forecast days; predictions are written back into
    # it so that later days can use earlier predictions as lag features.
    te = te_base.copy()

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)  # calendar date being predicted in this step
        print(icount, day)
        # Only the last max_lags days up to `day` are needed to build the features.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = xgb.DMatrix(tst.loc[tst.date == day , train_cols])  # feature rows of the day to predict
        te.loc[te.date == day, "sales"] = alpha*m_xgb.predict(tst) # magic multiplier by kyakovlev

    # Keep only the forecast days and reshape to one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        # Both frames are sorted by id with a fresh RangeIndex, so rows align.
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# Duplicate the validation rows under the matching "..._evaluation" ids.
sub2 = sub.copy()
# "validation$" is a regular expression (the "$" anchors the suffix). pandas >= 2.0
# treats the pattern literally unless regex=True is passed, which would leave the
# ids unchanged and produce duplicate validation rows.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission_xgb_1.csv",index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tqdm

from ipywidgets import widgets, interactive, interact
import ipywidgets as widgets
from IPython.display import display

import os

In [None]:
# Load the inputs for the quantile (uncertainty) submission built below.
train_sales = pd.read_csv('sales_train_validation.csv')
# NOTE(review): calendar_df and sell_prices are not referenced by any later cell visible here.
calendar_df = pd.read_csv('calendar.csv')
# Submission template: its id column and its column names after the first are used further down.
submission_file = pd.read_csv('sample_submission.csv')
sell_prices = pd.read_csv('sell_prices.csv')
# Point forecasts written by the XGBoost part of this notebook.
xgb_sub = pd.read_csv("submission_xgb_1.csv")

In [None]:
# Relabel the forecast columns of xgb_sub as the day columns d_1914..d_1941 so they
# continue the d_<n> naming of the sales history.
xgb_col = ['id'] + [f'd_{day}' for day in range(1914, 1914+28)]
xgb_sub.columns = xgb_col

# time_series_columns spans d_1..d_1941 and create_sales() selects exactly those
# columns from train_sales, which as loaded only carries the history columns
# (presumably up to d_1913 -- verify). Without attaching the forecast days the
# aggregation raises a KeyError, so append them here.
# The guard keeps the cell idempotent on re-run; drop_duplicates protects against
# repeated ids in the forecast file multiplying rows in the left join.
forecast_cols = [col for col in xgb_col[1:] if col not in train_sales.columns]
if forecast_cols:
    train_sales = train_sales.merge(
        xgb_sub[['id'] + forecast_cols].drop_duplicates(subset='id'),
        on='id',
        how='left',
    )

In [None]:
# Add the grouping keys for the aggregation levels: a constant 'Total' key plus
# "<left>_<right>" combinations of two existing id columns.
total = ['Total']
train_sales['Total'] = 'Total'
for combined_key, left_key, right_key in [
    ('state_cat', 'state_id', 'cat_id'),
    ('state_dept', 'state_id', 'dept_id'),
    ('store_cat', 'store_id', 'cat_id'),
    ('store_dept', 'store_id', 'dept_id'),
    ('state_item', 'state_id', 'item_id'),
    ('item_store', 'item_id', 'store_id'),
]:
    train_sales[combined_key] = train_sales[left_key] + "_" + train_sales[right_key]

In [None]:
# The two id suffixes used in the submission file.
val_eval = ['validation', 'evaluation']

# Name lists for the different aggregation levels.
total = ['Total']
states = ['CA', 'TX', 'WI']
# (state, number of stores in that state)
num_stores = [('CA',4), ('TX',3), ('WI',3)]
# Store names "<state>_<k>" with k counting from 1 within each state.
stores = [f"{state_code}_{store_no}"
          for state_code, store_count in num_stores
          for store_no in range(1, store_count + 1)]
print(stores)  # store names

In [None]:
# Category and department name lists, plus their combinations with states and stores.
cats = ['FOODS', 'HOBBIES', 'HOUSEHOLD']
# (category, number of departments in that category)
num_depts = [('FOODS',3), ('HOBBIES',2), ('HOUSEHOLD',2)]
# Department names "<category>_<k>" with k counting from 1 within each category.
depts = [f"{cat_name}_{dept_no}"
         for cat_name, dept_count in num_depts
         for dept_no in range(1, dept_count + 1)]
state_cats = [f"{state}_{cat}" for state in states for cat in cats]
state_depts = [f"{state}_{dept}" for state in states for dept in depts]
store_cats = [f"{store}_{cat}" for store in stores for cat in cats]
store_depts = [f"{store}_{dept}" for store in stores for dept in depts]
print(state_cats)  # state + category
print("=================================================================")
print(store_depts)  # store (includes the state) + department (includes the category)

In [None]:
# Distinct item ids in order of first appearance, and their combinations with
# every state and every store.
prods = list(train_sales["item_id"].unique())
prod_state = [f"{prod}_{state}" for prod in prods for state in states]  # item + state
prod_store = [f"{prod}_{store}" for prod in prods for store in stores]  # item + store (includes the state)

In [None]:
# Quantile levels, kept as strings because they are concatenated into row ids.
quants = ['0.005', '0.025', '0.165', '0.250', '0.500', '0.750', '0.835', '0.975', '0.995']
# Day indices 1..1941, i.e. d_1 through d_(1913 + 28).
days = range(1, 1913 + 29)
time_series_columns = ['d_' + str(day_no) for day_no in days]

In [None]:
def create_sales(name_list, group):
    '''
    Return the daily sales summed per value of the `group` column.

    Groups the module-level `train_sales` frame by `group` and sums every column
    listed in `time_series_columns`. `name_list` is accepted but not used here.
    '''
    grouped = train_sales.groupby(group)
    # At the lowest aggregation level each group is a single row, so the sum only copies it.
    return grouped[time_series_columns].sum()

In [None]:
def create_quantile_dict(name_list = stores, group = 'store_id' ,X = False):
    '''
    Compute quantile forecasts for one aggregation level and store them in the
    global dictionary my_dict.

    For every group of `group` (via create_sales) the function:
      1. keeps the day columns from positional index 1864 onward;
      2. takes each quantile in `quants` across those days;
      3. derives seven day-of-week factors (mean of every 7th column divided by the
         overall mean) and repeats them four times to cover the 28 forecast
         columns of `submission_file`;
      4. writes quantile * factors into my_dict under
         "<group>_X_<q>_validation|evaluation" when X is True, otherwise
         "<group>_<q>_validation|evaluation".

    Returns nothing; my_dict is updated in place.
    '''
    recent = create_sales(name_list, group).iloc[:, 1864:]  # only the most recent days

    # One column per quantile level, one row per group.
    quantile_table = pd.DataFrame(index = recent.index)
    for q in quants:
        quantile_table[q] = np.quantile(recent, float(q), axis = 1)

    # Single-column frame with each group's mean over the retained days.
    overall_mean = pd.DataFrame(np.mean(recent, axis = 1))

    # Mean of every 7th retained day, for each of the seven offsets.
    weekday_means = pd.DataFrame(index = recent.index)
    for offset in range(7):
        weekday_means[str(offset)] = np.mean(recent.iloc[:, offset::7], axis = 1)

    weekly_factors = weekday_means / np.array(overall_mean)

    # Four repetitions of the 7 factors give the 28 forecast columns.
    tiled_factors = pd.concat([weekly_factors] * 4, axis = 1)
    factor_df = pd.DataFrame(np.array(tiled_factors), columns = submission_file.columns[1:])
    factor_df.index = tiled_factors.index

    infix = "_X_" if X else "_"
    for key in tqdm(quantile_table.index):
        row_factors = np.array(factor_df.loc[key, :])
        for q in quants:
            scaled = quantile_table.loc[key, q] * row_factors
            # The same forecast is used for the validation and the evaluation id.
            my_dict[key + infix + q + "_validation"] = scaled
            my_dict[key + infix + q + "_evaluation"] = scaled

In [None]:
# Global store filled by create_quantile_dict: submission row id -> array of 28 forecast values.
my_dict = {}
# Add predictions to my_dict for all 12 aggregation levels. X=True selects the
# "<name>_X_<quantile>_..." id pattern, the default the "<name>_<quantile>_..." pattern.
create_quantile_dict(total, 'Total', X=True) # 1: all sales
create_quantile_dict(states, 'state_id', X=True) # 2: per state
create_quantile_dict(stores, 'store_id', X=True) # 3: per store
create_quantile_dict(cats, 'cat_id', X=True) # 4: per category
create_quantile_dict(depts, 'dept_id', X=True) # 5: per department
create_quantile_dict(state_cats, 'state_cat') # 6: per state and category
create_quantile_dict(state_depts, 'state_dept') # 7: per state and department
create_quantile_dict(store_cats, 'store_cat') # 8: per store and category
create_quantile_dict(store_depts, 'store_dept') # 9: per store and department
create_quantile_dict(prods, 'item_id', X=True) # 10: per item
create_quantile_dict(prod_state, 'state_item') # 11: per state and item
create_quantile_dict(prod_store, 'item_store') # 12: per item and store

In [None]:
# Turn my_dict into a frame with one row per submission id and 28 value columns (0..27).
pred_df = pd.DataFrame(my_dict).transpose()
pred_df_reset = pred_df.reset_index()
# Keep the row order and the id set of the submission template.
final_pred = pd.merge(pd.DataFrame(submission_file.id), pred_df_reset, left_on = 'id', right_on = 'index')
final_pred = final_pred.drop(columns = 'index')
# Positional column labels 0..27 become the submission names F1..F28.
final_pred = final_pred.rename(columns = {position: f'F{position + 1}' for position in range(28)})
final_pred = final_pred.fillna(0)

In [None]:
# Write the quantile submission to disk without the DataFrame index.
final_pred.to_csv('submission_xgb_final.csv', index=False)