# CODE SUMMARY (ㅁㅁㅁㅉ)

# Import Libraries

In [1]:
# General imports
import warnings
warnings.filterwarnings("ignore")

# data
import pickle
import datetime
import numpy as np
import pandas as pd
import random

# model
from lightgbm import LGBMRegressor

# visualize
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# sklearn
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


## Variables

In [None]:
# target
# Name of the column holding the value the models are trained to predict.
TARGET = '취급액'

# directories
# Every path is built from LOCAL_DIR by plain string concatenation and ends with '/'.
LOCAL_DIR = ".."
# run_lgbm pickles the fitted models into this directory.
MODELS_DIR = LOCAL_DIR + "/data/saved_models/"
PROCESSED_DATA_DIR = LOCAL_DIR + "/data/20/"
# NOTE(review): resolves to the same path as PROCESSED_DATA_DIR - confirm this is intended.
FEATURED_DATA_DIR = LOCAL_DIR + '/data/20/'
SUBMISSION_DIR = LOCAL_DIR + "/submission/"
OPT_DATA_DIR = LOCAL_DIR + "/data/13/"
RAW_DATA_DIR = LOCAL_DIR + "/data/00/"

# set global vars
# Names of dataset variants; not referenced in the visible part of this file.
data_list = ['df_wk_lag', 'df_wk_no_lag', 'df_wd_lag', 'df_wd_no_lag', 'df_all_lag']

# lag_col1 and lag_col2 are the column groups whose missing values
# na_to_zeroes replaces with 0.
lag_col1 = ['lag_scode_price', 'lag_scode_count', 'lag_mcode_price', 'lag_mcode_count', 'lag_bigcat_price',
            'lag_bigcat_count', 'lag_bigcat_price_day', 'lag_bigcat_count_day', 'lag_small_c_price',
            'lag_small_c_count', 'lag_all_price_show', 'lag_all_price_day']

lag_col2 = ['ts_pred', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_21',
            'rolling_mean_28', 'mean_sales_origin']

# Further column-name groups; not referenced in the visible part of this file.
# Presumably wd = weekday and wk = weekend, as in the dataset names - TODO confirm.
lag_wd = ['lag_sales_wd_1', 'lag_sales_wd_2', 'lag_sales_wd_3','lag_sales_wd_4', 'lag_sales_wd_5']
lag_wk = ['lag_sales_wk_1', 'lag_sales_wk_2']
full_lag_col = ['lag_sales_1', 'lag_sales_2', 'lag_sales_5', 'lag_sales_7']

# NOTE(review): 'pay' appears twice in this list - confirm whether the
# duplicate is intentional before using it to select columns.
cat_col = ['상품군', 'weekdays', 'show_id', 'small_c', 'middle_c', 'big_c',
           'pay', 'months', 'hours_inweek', 'weekends', 'japp', 'parttime',
           'min_start', 'primetime', 'prime_smallc',
           'freq', 'bpower', 'steady', 'men', 'pay', 'luxury',
           'spring', 'summer', 'fall', 'winter', 'rain']

encoded_cols = ['상품코드', '상품군', 'weekdays', 'parttime', 'show_id','small_c', 'middle_c',
                'big_c', 'original_c', 'pay', 'exposed_t']

# Presumably the columns of the raw input data - TODO confirm against the raw files.
base_cols = ['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액']

## Helper Functions

### For Data Preprocessing

In [None]:
def load_df(path):
    """
    :objective: load a pickled dataframe and move its index into a regular column
    :param path: str or path-like - location of the pickle file
    :return: pandas dataframe with a fresh 0..n-1 index (the previous index is kept
             as a column, named 'index' when the index was unnamed), or None when
             the file could not be loaded
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    try:
        df = pd.read_pickle(path)
        return df.reset_index()
    except Exception as err:
        # Best-effort loader: report what went wrong and hand back None.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long-running notebook can still be stopped.
        print(f"check file directory: could not load {path!r} ({err!r})")
        return None


def drop_useless(df):
    """
    :objective: remove the columns that are not used as model features.
    :param df: pandas dataframe - left unmodified
    :return: pandas dataframe - an independent copy without the unused columns
    """
    # columns that are never fed to the model
    unused = ('방송일시', '노출(분)', '마더코드', '상품명', 'exposed', 'ymd', 'volume',
              'years', 'days', 'hours', 'week_num', 'holidays', 'red', 'min_range', 'brand',
              'small_c_code', 'middle_c_code', 'big_c_code', 'sales_power')
    # only drop what is actually present so frames with fewer columns also work
    present = [name for name in df.columns if name in unused]
    return df.drop(columns=present).copy()

def check_na(df):
    """
    :objective: print the number of missing values in every column
    :param df: pandas dataframe
    :return: None - the per-column counts are only printed
    """
    na_counts = df.isnull().sum()
    print(na_counts)

def na_to_zeroes(df):
    """
    :objective: replace missing values with 0 in the lag columns (lag_col1, lag_col2)
                and in a fixed set of additional columns. Other columns are untouched.
    :param df: pandas dataframe - modified in place
    :return: pandas dataframe - the same object that was passed in
    """
    extra_cols = ['mid_click_r', 'age30_middle', 'age40_middle', 'age50_middle',
                  'age60above_middle', 'pc_middle', 'mobile_middle']
    candidates = lag_col1 + lag_col2 + extra_cols
    # fix the list of affected columns before touching the frame
    targets = [name for name in df.columns if name in candidates]
    for name in targets:
        df[name] = df[name].fillna(0)
    return df

def run_label_all(df):
    """
    :objective: label-encode every column of dtype object or category.
                A fresh LabelEncoder is fitted per column.
    :param df: pandas dataframe - modified in place
    :return: pandas dataframe - the same object that was passed in
    """
    label_targets = list(df.select_dtypes(include=['object', 'category']).columns)
    for name in label_targets:
        df[name] = LabelEncoder().fit_transform(df[name].values)
    return df

def run_preprocess(df):
    """
    :objective: full preprocessing chain, applied in this order:
                drop_useless -> na_to_zeroes -> run_label_all
    :param df: pandas dataframe
    :return: pandas dataframe - a copy of the preprocessed frame
    """
    processed = run_label_all(na_to_zeroes(drop_useless(df)))
    return processed.copy()

### For Train

In [None]:
# Seeder
def seed_everything(seed=127):
    """
    :objective: seed Python's ``random`` module and NumPy's global random state
    :param seed: int - seed value used for both generators
    :return: None
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# metrics
# negative mape (For Bayesian Optimization)
def neg_mape(y_true, y_pred):
    """
    :objective: mean absolute percentage error in percent, multiplied by -1
    :param y_true: array-like - observed values (used as the divisor)
    :param y_pred: array-like - predicted values
    :return: float - negated MAPE
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    pct_errors = np.abs((actual - predicted) / actual)
    return (-1) * (np.mean(pct_errors) * 100)

# MAPE
def get_mape(y_true, y_pred):
    """
    :objective: mean absolute percentage error, expressed in percent
    :param y_true: array-like - observed values (used as the divisor)
    :param y_pred: array-like - predicted values
    :return: float - MAPE
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    pct_errors = np.abs((actual - predicted) / actual)
    return np.mean(pct_errors) * 100

# RMSE
def get_rmse(y_true, y_pred):
    """
    :objective: root mean squared error
    :param y_true: array-like - observed values
    :param y_pred: array-like - predicted values
    :return: float - RMSE
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    squared_errors = (actual - predicted) ** 2
    return np.sqrt(np.mean(squared_errors))

# MAE
def get_mae(y_true, y_pred):
    """
    :objective: mean absolute error
    :param y_true: array-like - observed values
    :param y_pred: array-like - predicted values
    :return: float - MAE
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    abs_errors = np.abs(actual - predicted)
    return np.mean(abs_errors)


def cv_split(df, month, printprop=False):
    """
    :objective: find where to cut the data for cross validation - the largest index
                label among the rows whose 'months' value equals ``month``
    :param df: pandas dataframe - must contain a 'months' column
    :param month: int - from 1 to 12, month at which the data is split
    :param printprop: boolean - whether to print the split value divided by the row count
    :return: int - index label at which the full data is split
    """
    month_rows = df.loc[df['months'] == month]
    split = int(month_rows.index.values.max())
    if printprop:
        prop = str(split / df.shape[0])
        print(f'Proportion of train set is {prop}')
    return split


def divide_train_val(df_pp, month, drop):
    """
    :objective: divide full data into train and validation parts at the position
                returned by cv_split for ``month``
    :param df_pp: pandas dataframe, preprocessed - needs 'index', 'show_id' and TARGET columns
    :param month: int - from 1 to 12, month at which the data is split
    :param drop: list of str - additional columns to be dropped from the features
    :return: tuple (train_x, train_y, val_x, val_y) - features as pd.DataFrame,
             targets as pd.Series
    """
    # NOTE(review): cv_split returns an index label, used here as an exclusive
    # positional bound - on a 0..n-1 index the last row of ``month`` therefore
    # lands in the validation part. Confirm this is intended.
    split = cv_split(df=df_pp, month=month)
    non_features = ['index', 'show_id', TARGET] + drop  ## 'index' check!!

    train_part = df_pp.iloc[:split, :]
    val_part = df_pp.iloc[split:, :]

    train_x = train_part.drop(columns=non_features)
    train_y = train_part[TARGET]
    val_x = val_part.drop(columns=non_features)
    val_y = val_part[TARGET]
    return train_x, train_y, val_x, val_y


def divide_top(df, num_train, num_val):
    """
    :objective: rank the data by 'mean_sales_origin' (highest first) and cut train /
                validation sets from the top of that ranking
    :param df: pandas dataframe - needs 'mean_sales_origin', 'index', 'show_id' and TARGET
    :param num_train: int - number of top-ranked rows used for training
    :param num_val: int - number of rows right after the training rows used for validation
    :return: tuple (ranked dataframe, train_x, train_y, val_x, val_y)
    """
    top_df = df.sort_values('mean_sales_origin', ascending=False)
    non_features = ['index', 'show_id', TARGET]

    train_rows = top_df.iloc[:num_train, :]
    val_rows = top_df.iloc[num_train:(num_train + num_val), :]

    return (top_df,
            train_rows.drop(non_features, axis=1), train_rows[TARGET],
            val_rows.drop(non_features, axis=1), val_rows[TARGET])

# Load Data

In [None]:
# Load the four feature datasets (all with lag features):
#   - df_wd_lag  : weekday, training data
#   - df_wk_lag  : weekend, training data
#   - df_wd_test : weekday, test data
#   - df_wk_test : weekend, test data
# load_df returns None (after printing a message) when a file cannot be read,
# so a failed load only surfaces later in this cell.
# FEATURED_DATA_DIR already ends with '/', so the paths below contain '//'
# (presumably harmless on the target file system - TODO confirm).

df_wd_lag = load_df(FEATURED_DATA_DIR + '/train_fin_wd_lag.pkl')
df_wk_lag = load_df(FEATURED_DATA_DIR + '/train_fin_wk_lag.pkl')

df_wd_test = load_df(FEATURED_DATA_DIR + '/test_fin_wd_lag.pkl')
df_wk_test = load_df(FEATURED_DATA_DIR + '/test_fin_wk_lag.pkl')

# Stack train and test frames so that label encoding sees all of them together.
# The order [wd_lag, wk_lag, wd_test, wk_test] matters: the frames are later
# recovered from tmp_combined by row position. The 'index' column added by
# load_df is dropped here; the row labels themselves are not reset.
tmp_combined = pd.concat([df_wd_lag, df_wk_lag, df_wd_test, df_wk_test]).drop(columns=['index'])



In [None]:

# Preprocessed datasets
# Preprocess the stacked frame once (column removal, NA filling, label encoding),
# then cut it back into the four original frames.
tmp_combined = run_preprocess(tmp_combined)
# Each frame is recovered by (a) keeping only the columns that the original
# frame had and (b) slicing its rows by position, relying on the order in which
# the frames were concatenated: wd_lag, wk_lag, wd_test, wk_test.
# The two training frames get reset_index(), which turns the row labels into
# an 'index' column again (divide_train_val / divide_top drop that column).
df_wd_lag_PP = tmp_combined.loc[:, tmp_combined.columns.isin(df_wd_lag.columns)].iloc[:df_wd_lag.shape[0]].reset_index()
df_wk_lag_PP = tmp_combined.loc[:, tmp_combined.columns.isin(df_wk_lag.columns)]\
                .iloc[df_wd_lag.shape[0]:(df_wd_lag.shape[0]+df_wk_lag.shape[0])].reset_index()
# The test frames keep their row labels (no reset_index).
df_wd_test_PP = tmp_combined.loc[:, tmp_combined.columns.isin(df_wd_test.columns)]\
                .iloc[(df_wd_lag.shape[0]+df_wk_lag.shape[0]):(df_wd_lag.shape[0]+df_wk_lag.shape[0]+df_wd_test.shape[0])]
# NOTE(review): a negative start is used for the last frame; if df_wk_test were
# empty, iloc[-0:] would select every row instead of none.
df_wk_test_PP = tmp_combined.loc[:, tmp_combined.columns.isin(df_wk_test.columns)].iloc[-df_wk_test.shape[0]:]

# write pickle for test data
df_wd_test_PP.to_pickle(FEATURED_DATA_DIR + 'test_fin_wd_PP.pkl')
df_wk_test_PP.to_pickle(FEATURED_DATA_DIR + 'test_fin_wk_PP.pkl')
# Divide data
# For each of WD / WK: a split at month 8 for the all-observation model, and a
# ranking-based split (divide_top) for the top model.
# NOTE(review): the row counts passed to divide_top (4004/2013, 2206/999) are
# hard-coded; their derivation is not visible here - TODO confirm.
# WD
train_wd_lag_x, train_wd_lag_y, val_wd_lag_x, val_wd_lag_y = divide_train_val(df_wd_lag_PP, 8, drop=[])
top_wd_lag, top_tr_wd_lag_x, top_tr_wd_lag_y, top_v_wd_lag_x, top_v_wd_lag_y = divide_top(df_wd_lag_PP, 4004, 2013)
# WK
train_wk_lag_x, train_wk_lag_y, val_wk_lag_x, val_wk_lag_y = divide_train_val(df_wk_lag_PP, 8, drop=[])
top_wk_lag, top_tr_wk_lag_x, top_tr_wk_lag_y, top_v_wk_lag_x, top_v_wk_lag_y = divide_top(df_wk_lag_PP, 2206, 999)


In [None]:

def run_lgbm(params, train_x, train_y, val_x, val_y, df_type='wd_all'):
    """
    :objective: fit an LGBM regressor, plot and print its validation performance,
                and save the fitted model under MODELS_DIR
    :param params: dictionary - keyword arguments passed to LGBMRegressor
    :param train_x: pd.DataFrame - training features
    :param train_y: pd.Series - training target
    :param val_x: pd.DataFrame - validation features
    :param val_y: pd.Series - validation target
    :param df_type: str - 'wd_all', 'wk_all', 'wd_top', 'wk_top'; becomes part of the
                    saved file name, so a later run with the same value overwrites the file
    :return: LGBMRegressor, np.array - the fitted model and its predictions for val_x
    """

    seed_everything(seed=127)

    model_lg = LGBMRegressor(**params)
    model_lg.fit(train_x, train_y)
    lgbm_preds = model_lg.predict(val_x)

    # Plot LGBM: Predicted vs. True values
    plt.figure(figsize= (40,5))
    # these rcParams are global: they stay in effect for later figures too
    plt.rcParams["axes.grid.axis"] = "y"
    plt.rcParams["axes.grid"] = True
    x = range(0, len(lgbm_preds))
    plt.plot(x, val_y, label='true', marker='', color='grey', linewidth=2, alpha=0.8)
    plt.plot(x, lgbm_preds, label='predicted', marker='', color='tomato', linewidth=2)
    pop_b = mpatches.Patch(color='tomato', label='Predicted')
    pop_c = mpatches.Patch(color='grey', label='True')
    plt.legend(handles=[pop_b, pop_c], fontsize=27, loc=2)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlabel('Time', fontsize=25)
    plt.ylabel('Sales', fontsize=25)
    plt.show()

    # Get Scores
    print(f'MAPE of best iter is {get_mape(val_y, lgbm_preds)}')
    print(f'MAE of best iter is {get_mae(val_y, lgbm_preds)}')

    # Save the model; the context manager guarantees the file is flushed and
    # closed even if pickling fails (the handle was previously never closed).
    model_name = MODELS_DIR + 'lgbm_finalmodel_' + df_type + '.bin'
    with open(model_name, 'wb') as model_file:
        pickle.dump(model_lg, model_file)

    return model_lg, lgbm_preds

##################################################################
################# Step 1. For ALL observations ###################
##################################################################


# parameters for wd/wk model
# Keyword arguments for LGBMRegressor used by the all-observation models
# (passed through run_lgbm as LGBMRegressor(**params)).
# NOTE(review): both 'n_estimators' and 'num_iterations' are set. LightGBM
# documents these as aliases for the number of boosting rounds, so only one of
# them can take effect - confirm which value is actually used.
# NOTE(review): 'categorical_feature' holds positional column indices; they
# must match the column order of the frames passed to fit - TODO confirm.
params_all_wd = {'feature_fraction': 1,
                 'learning_rate': 0.001,
                 'min_data_in_leaf': 135,
                 'n_estimators': 3527,
                 'num_iterations': 2940,
                 'subsample': 1,
                 'boosting_type': 'dart',
                 'objective': 'regression',
                 'metric': 'mape',
                 'categorical_feature': [3, 9, 10, 11]  ## weekdays, small_c, middle_c, big_c
                 }

# Weekend counterpart of params_all_wd; differs in 'min_data_in_leaf',
# 'n_estimators' and 'num_iterations'.
params_all_wk = {'feature_fraction': 1,
                 'learning_rate': 0.001,
                 'min_data_in_leaf': 134,
                 'n_estimators': 3474,
                 'num_iterations': 2928,
                 'subsample': 1,
                 'boosting_type': 'dart',
                 'objective': 'regression',
                 'metric': 'mape',
                 'categorical_feature': [3, 9, 10, 11]}  ## weekdays, small_c, middle_c, big_c


###########################################################################
################### Step 2. For High-rank observations ###################
###########################################################################

# Keyword arguments for LGBMRegressor used by the top-ranked-observation models
# (weekday). Compared with the all-observation parameters these use a higher
# learning rate and a smaller 'min_data_in_leaf'.
# NOTE(review): 'n_estimators' and 'num_iterations' are both set; LightGBM
# documents them as aliases, so confirm which value is actually used.
params_top_wd = {'feature_fraction': 1,
                 'learning_rate': 0.0025,
                 'min_data_in_leaf': 70,
                 'n_estimators': 5000,
                 'num_iterations': 4000,
                 'subsample': 1,
                 'boosting_type': 'dart',
                 'objective': 'regression',
                 'metric': 'mape',
                 'categorical_feature': [3, 9, 10, 11]  ## weekdays, small_c, middle_c, big_c
                 }

# Weekend counterpart of params_top_wd; differs in 'min_data_in_leaf' and
# 'num_iterations'.
params_top_wk = {'feature_fraction': 1,
                 'learning_rate': 0.0025,
                 'min_data_in_leaf': 30,
                 'n_estimators': 5000,
                 'num_iterations': 3500,
                 'subsample': 1,
                 'boosting_type': 'dart',
                 'objective': 'regression',
                 'metric': 'mape',
                 'categorical_feature': [3, 9, 10, 11]  ## weekdays, small_c, middle_c, big_c
                 }

#####################################################################
############## Step 3. Mix results from step1 & step2 ###############
#####################################################################


def mixed_df(model_top, top_df, val_all_df_x, preds_all, num_top):
    """
    :objective: mix the two models - start from the all-observation predictions and
                replace them with the top model's predictions for every row whose
                index label belongs to the first ``num_top`` rows of ``top_df``
    :param model_top: fitted model with a ``predict`` method - model for top-ranked rows
    :param top_df: pandas dataframe - ranked data; its first ``num_top`` rows form the top group
    :param val_all_df_x: pandas dataframe - features to predict on (not modified)
    :param preds_all: array-like - all-observation predictions, one per row of ``val_all_df_x``
    :param num_top: int - number of leading rows of ``top_df`` treated as the top group
    :return: pandas dataframe - copy of ``val_all_df_x`` with the mixed predictions
             stored in column TARGET
    """
    top_idx = top_df.iloc[:num_top, :].index
    # rows of the validation data that also belong to the top group
    in_top = val_all_df_x.index.isin(top_idx)

    val_copy = val_all_df_x.copy()
    val_copy[TARGET] = preds_all

    # One masked .loc assignment instead of chained ``df[col].loc[...] = ...``:
    # the chained form writes to a temporary object under pandas Copy-on-Write
    # and would silently leave val_copy unchanged. Predicting all matching rows
    # in a single call also avoids one predict() call per row.
    if in_top.any():
        val_copy.loc[in_top, TARGET] = model_top.predict(val_all_df_x.loc[in_top])

    return val_copy


def mix_results(true_y, pred_y):
    """
    :objective: plot true against predicted values of the mixed model and print
                its MAPE, MAE and RMSE
    :param true_y: array-like - observed target values
    :param pred_y: array-like - mixed-model predictions, same length as ``true_y``
    :return: None - shows a figure and prints the scores
    """
    # Plot TOP: Predicted vs. True values
    plt.figure(figsize=(40, 5))
    # horizontal grid lines only (global matplotlib setting)
    plt.rcParams.update({"axes.grid.axis": "y", "axes.grid": True})

    positions = range(0, len(true_y))
    plt.plot(positions, true_y, label='true', marker='', color='grey', linewidth=2, alpha=0.8)
    plt.plot(positions, pred_y, label='predicted', marker='', color='tomato', linewidth=2)

    legend_handles = [
        mpatches.Patch(color='tomato', label='Predicted'),
        mpatches.Patch(color='grey', label='True'),
    ]
    plt.legend(handles=legend_handles, fontsize=27, loc=2)

    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlabel('Time', fontsize=20)
    plt.ylabel('Sales', fontsize=20)
    plt.show()

    # scores
    mape = get_mape(true_y, pred_y)
    print(f'MAPE of mixed model is {mape}')
    mae = get_mae(true_y, pred_y)
    print(f'MAE of mixed model is {mae}')
    rmse = get_rmse(true_y, pred_y)
    print(f'RMSE of mixed model is {rmse}')


def run_models():
    """
    :objective: train the all-observation and top models for weekday (wd) and
                weekend (wk) data, then mix their validation predictions and
                report the scores. Works on the module-level train/validation splits.
    :return: tuple (mixed_wd, mixed_wk) - validation features with the mixed
             predictions in column TARGET
    """
    # Step 1 - base models on all observations (only the predictions are needed)
    _, base_preds_wd = run_lgbm(params_all_wd, train_wd_lag_x, train_wd_lag_y,
                                val_wd_lag_x, val_wd_lag_y, df_type='wd_all')
    _, base_preds_wk = run_lgbm(params_all_wk, train_wk_lag_x, train_wk_lag_y,
                                val_wk_lag_x, val_wk_lag_y, df_type='wk_all')

    # Step 2 - models for the top-ranked observations (only the models are needed)
    top_model_wd, _ = run_lgbm(params_top_wd, top_tr_wd_lag_x, top_tr_wd_lag_y,
                               top_v_wd_lag_x, top_v_wd_lag_y, df_type='wd_top')
    top_model_wk, _ = run_lgbm(params_top_wk, top_tr_wk_lag_x, top_tr_wk_lag_y,
                               top_v_wk_lag_x, top_v_wk_lag_y, df_type='wk_top')

    # Step 3 - mix both predictions and evaluate
    mixed_wd = mixed_df(model_top=top_model_wd, top_df=top_wd_lag, val_all_df_x=val_wd_lag_x,
                        preds_all=base_preds_wd, num_top=6017)
    mix_results(val_wd_lag_y, mixed_wd[TARGET])

    mixed_wk = mixed_df(model_top=top_model_wk, top_df=top_wk_lag, val_all_df_x=val_wk_lag_x,
                        preds_all=base_preds_wk, num_top=3205)
    mix_results(val_wk_lag_y, mixed_wk[TARGET])

    return mixed_wd, mixed_wk



In [None]:

#####################################################################
######################## Cross Validation ###########################
#####################################################################

def _cv_base_models(tag, df_pp, params, df_type, cv_months):
    """Step-1 CV: fit and evaluate the all-observation model once per split month."""
    for month in cv_months:
        train_x, train_y, val_x, val_y = divide_train_val(df_pp, month, drop=[])
        print(f'{tag} - CV with month {month} is starting.')
        run_lgbm(params, train_x, train_y, val_x, val_y, df_type)


def _cv_mixed_models(tag, df_pp, params_all, params_top, type_all, type_top,
                     top_configs, cv_months):
    """Step-3 CV: evaluate the mixed model for every split month and top-group size."""
    for month in cv_months:
        print(f'{tag} - CV for Mixed model - month {month} is starting.')
        for num_train, num_val, top_pct in top_configs:
            print(f'{tag} - CV for Mixed model - top {top_pct}% is starting.')
            train_x_all, train_y_all, val_x_all, val_y_all = divide_train_val(df_pp, month, drop=[])
            top_cv, train_x_top, train_y_top, val_x_top, val_y_top = divide_top(df_pp, num_train, num_val)
            # only the predictions of the base model and the fitted top model are needed
            _, preds_all_cv = run_lgbm(params_all, train_x_all, train_y_all, val_x_all, val_y_all,
                                       type_all)
            model_top_cv, _ = run_lgbm(params_top, train_x_top, train_y_top, val_x_top, val_y_top,
                                       type_top)
            mixed = mixed_df(model_top_cv, top_cv, val_x_all, preds_all_cv,
                             num_top=(num_train + num_val))
            mix_results(val_y_all, mixed[TARGET])


def cross_validation(cv_months=(7, 8, 9)):
    """
    :objective: cross validate the step-1 (all-observation) and step-3 (mixed) models
                on the weekday (WD) and weekend (WK) data, splitting at each given month
    :param cv_months: iterable of int - months (1 to 12) at which train/validation are
                      split; it is traversed several times, so pass a list or tuple
    :return: None - plots and scores are produced by run_lgbm / mix_results
    """
    # for step 1 model
    _cv_base_models('WD', df_wd_lag_PP, params_all_wd, 'wd_all', cv_months)
    _cv_base_models('WK', df_wk_lag_PP, params_all_wk, 'wk_all', cv_months)

    # for step 3 model
    # each entry: (rows for top-model training, rows for top-model validation,
    #              percentage shown in the progress message)
    cv_wd = ((2952, 1052, 12), (4524, 2093, 40))
    _cv_mixed_models('WD', df_wd_lag_PP, params_all_wd, params_top_wd,
                     'wd_all', 'wd_top', cv_wd, cv_months)

    cv_wk = ((1205, 504, 16), (2856, 1359, 40))
    _cv_mixed_models('WK', df_wk_lag_PP, params_all_wk, params_top_wk,
                     'wk_all', 'wk_top', cv_wk, cv_months)



In [None]:
# #####################################################################
# #################### Robust Cross Validation ########################
# #####################################################################
# """
# Validate on December 2019 data with models trained on January-August 2019 data
# to check that our model stays robust over time.
# """

def robust_cross_validation():
    """
    :objective: evaluate the mixed model on the rows whose 'months' value is 12.
                The all-observation models are fitted on the part of the data that
                divide_train_val returns for a split at month 8; the top models are
                fitted on the ranking-based split from divide_top.
    :return: tuple (mixed_wd_rb, mixed_wk_rb) - month-12 features with the mixed
             predictions in column TARGET
    """
    DEC = 12
    non_features = ['index', 'show_id', TARGET]

    # Data preparation - weekday
    train_x_wd_rb, train_y_wd_rb, val_x_wd_rb, val_y_wd_rb = divide_train_val(df_wd_lag_PP, 8, drop=[])
    top_wd_rb, top_tr_x_wd_rb, top_tr_y_wd_rb, top_v_x_wd_rb, top_v_y_wd_rb = divide_top(df_wd_lag_PP, 4004, 2013)
    # Data preparation - weekend
    train_x_wk_rb, train_y_wk_rb, val_x_wk_rb, val_y_wk_rb = divide_train_val(df_wk_lag_PP, 8, drop=[])
    top_wk_rb, top_tr_x_wk_rb, top_tr_y_wk_rb, top_v_x_wk_rb, top_v_y_wk_rb = divide_top(df_wk_lag_PP, 2206, 999)

    # target rows - month 12
    wd_dec_rows = df_wd_lag_PP[df_wd_lag_PP['months'] == DEC]
    wd_dec_x = wd_dec_rows.drop(non_features, axis=1)
    wd_dec_y = wd_dec_rows[TARGET]
    wk_dec_rows = df_wk_lag_PP[df_wk_lag_PP['months'] == DEC]
    wk_dec_x = wk_dec_rows.drop(non_features, axis=1)
    wk_dec_y = wk_dec_rows[TARGET]

    # wd
    base_model_wd, _ = run_lgbm(params_all_wd, train_x_wd_rb, train_y_wd_rb,
                                val_x_wd_rb, val_y_wd_rb, 'wd_all')
    top_model_wd, _ = run_lgbm(params_top_wd, top_tr_x_wd_rb, top_tr_y_wd_rb,
                               top_v_x_wd_rb, top_v_y_wd_rb, 'wd_top')
    preds_wd_dec = base_model_wd.predict(wd_dec_x)
    mixed_wd_rb = mixed_df(top_model_wd, top_wd_rb, wd_dec_x, preds_wd_dec, num_top=6017)
    mix_results(wd_dec_y, mixed_wd_rb[TARGET])

    # wk
    base_model_wk, _ = run_lgbm(params_all_wk, train_x_wk_rb, train_y_wk_rb,
                                val_x_wk_rb, val_y_wk_rb, 'wk_all')
    top_model_wk, _ = run_lgbm(params_top_wk, top_tr_x_wk_rb, top_tr_y_wk_rb,
                               top_v_x_wk_rb, top_v_y_wk_rb, 'wk_top')
    preds_wk_dec = base_model_wk.predict(wk_dec_x)
    mixed_wk_rb = mixed_df(top_model_wk, top_wk_rb, wk_dec_x, preds_wk_dec, num_top=3205)
    mix_results(wk_dec_y, mixed_wk_rb[TARGET])

    return mixed_wd_rb, mixed_wk_rb



