In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import dask_xgboost as xgb
import dask.dataframe as dd
from sklearn import preprocessing, metrics
import gc
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. The frame is modified in
    place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be narrowed.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    The target dtype is chosen from the column's min/max only, so float columns
    can lose precision when they are narrowed to float16 or float32.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def read_data():
    """Read the four CSV inputs from /kaggle/input/m5-forecasting-accuracy.

    The calendar and sell-price tables are passed through ``reduce_mem_usage``
    right after loading; the sales table and the sample submission are returned
    as read. A short shape report is printed for the first three tables.

    Returns
    -------
    tuple
        ``(calendar, sell_prices, sales_train_validation, submission)``.
    """
    print('Reading files...')

    calendar = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv'))
    n_rows, n_cols = calendar.shape
    print('Calendar has {} rows and {} columns'.format(n_rows, n_cols))

    sell_prices = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv'))
    n_rows, n_cols = sell_prices.shape
    print('Sell prices has {} rows and {} columns'.format(n_rows, n_cols))

    sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
    n_rows, n_cols = sales_train_validation.shape
    print('Sales train validation has {} rows and {} columns'.format(n_rows, n_cols))

    submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')

    return calendar, sell_prices, sales_train_validation, submission


In [None]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 47168030, merge = False):
    """Build the long-format frame used for feature engineering and training.

    The wide sales table is melted to one row per (series, day); the sample
    submission is split into its validation and evaluation halves, relabelled
    with day columns, joined to the product attributes and melted the same way.
    The three parts are stacked (tagged through a ``part`` column), the leading
    ``nrows`` rows are skipped, and the evaluation part is dropped.

    Parameters
    ----------
    calendar : pandas.DataFrame
        Calendar table. It is not modified; a trimmed copy is used internally.
    sell_prices : pandas.DataFrame
        Price table, joined on ``store_id``, ``item_id`` and ``wm_yr_wk``.
    sales_train_validation : pandas.DataFrame
        Wide sales table (one row per series, one column per day).
    submission : pandas.DataFrame
        Sample submission; ids containing 'validation' / 'evaluation' select
        the two test parts.
    nrows : int, default 47168030
        Number of leading rows of the stacked frame to skip (positional), so
        only the most recent history is kept.
    merge : bool, default False
        When true, join the calendar and sell-price tables onto the result.

    Returns
    -------
    pandas.DataFrame
        The stacked 'train' and 'test1' rows, optionally enriched with calendar
        and price columns.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # get product table from the wide frame (one row per series) instead of
    # de-duplicating the much larger melted frame
    product = sales_train_validation[id_cols].drop_duplicates()

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # separate test dataframes; copy so the edits below never write into a
    # slice of the caller's `submission`
    test1 = submission[submission['id'].str.contains('validation', regex = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False)].copy()

    # change column names: validation rows cover d_1914..d_1941,
    # evaluation rows cover d_1942..d_1969
    test1.columns = ['id'] + ['d_{}'.format(d) for d in range(1914, 1942)]
    test2.columns = ['id'] + ['d_{}'.format(d) for d in range(1942, 1970)]

    # merge with product table; evaluation ids are temporarily renamed so they
    # match the '_validation' ids found in the product table
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # reshape the test parts to the same long layout as the training data
    test1 = pd.melt(test1, id_vars = id_cols, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_cols, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    # get only a sample for fast training; the stacked index contains repeated
    # labels, so the slice has to be positional rather than label based
    data = data.iloc[nrows:]

    # drop some calendar features (on a copy, so the caller's calendar is intact
    # and the cell can be re-run)
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # notebook crashes with the entire dataset (maybe use tensorflow, dask, pyspark)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['d', 'day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data

In [None]:
# Load the raw tables, then build the merged long-format frame.
calendar, sell_prices, sales_train_validation, submission = read_data()

NUM_ITEMS = len(sales_train_validation)  # 30490
DAYS_PRED = 28

# Rows handed to melt_and_merge as `nrows`: two 365-day years, one row per series per day.
n = NUM_ITEMS * 365 * 2
data = melt_and_merge(
    calendar,
    sell_prices,
    sales_train_validation,
    submission,
    nrows=n,
    merge=True,
)

In [None]:
# Use only the daily columns d_900 .. d_1913, because older data might be
# irrelevant, and total them per item_id.
sales_seq = ['d_{}'.format(day_number) for day_number in range(900, 1914)]
sales_train_sum = sales_train_validation.groupby('item_id')[sales_seq].sum()

In [None]:
from sklearn.decomposition import PCA
# Fit a 4-component PCA on the per-item daily totals. Only the `sales_seq`
# columns are used, so re-running this cell does not feed the PCA_* columns
# written below back into the fit. The seed makes the result repeatable when
# scikit-learn picks its randomized solver.
pca = PCA(n_components=4, random_state=236)
pca_results = pca.fit_transform(sales_train_sum[sales_seq])
print('PCA explained variance ratio:', pca.explained_variance_ratio_)

# One {item_id: score} lookup per component, consumed by add_pca_feats.
dics = []
for i in range(pca_results.shape[1]):
    new_col = 'PCA_{}'.format(i + 1)
    sales_train_sum[new_col] = pca_results[:, i]
    dics.append(sales_train_sum[new_col].to_dict())

def add_pca_feats(data):
    """Add columns PCA_1..PCA_4 to ``data`` by looking up each row's ``item_id``.

    The lookups come from the module-level ``dics`` list (one mapping per
    component). Values are stored as float16. ``data`` is modified in place and
    returned.
    """
    # NOTE(review): float16 tops out around 65504 - confirm the PCA scores stay
    # inside that range, otherwise they become inf here.
    for component in range(4):
        column = 'PCA_{}'.format(component + 1)
        data[column] = data['item_id'].map(dics[component]).astype('float16')
    return data

In [None]:
from tqdm import tqdm_notebook as tqdm_nb

def transform(data):
    """Fill missing event labels and label-encode the categorical columns.

    The four event columns get 'unknown' in place of missing values, then every
    column listed in ``cat`` is replaced by integer codes from a fresh
    ``LabelEncoder``. ``data`` is modified in place and returned.

    All columns in ``cat`` must already be present. 'season' and 'is_weekend'
    are created by ``simple_fe``, so that step has to run first; otherwise the
    column lookup raises ``KeyError``.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: a chained `inplace` fillna can end up
        # modifying a temporary copy and leave `data` unchanged.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2',
           'event_type_2', 'season', 'is_weekend']
    for feature in tqdm_nb(cat):
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    # No `del sales_train_sum` here: that name is a module-level variable, and
    # deleting it inside the function raised UnboundLocalError on every call.
    return data

In [None]:
def simple_fe(data):
    """Add lag, rolling, price and calendar features to ``data``.

    Expects the columns ``id``, ``item_id``, ``store_id``, ``demand``,
    ``sell_price`` and ``date``. Rows are taken in their existing order within
    each ``id``, so the frame must already be ordered by date per series.
    ``data`` is modified in place and returned.

    Added columns
    -------------
    * ``shift_t28/29/30``: demand lagged by 28-30 rows per id.
    * ``rolling_{std,sum,min,max,kurt,skew}_t{7,14,30}`` and
      ``rolling_mean_t{7,14,30,90,180}``: rolling statistics of the 28-row
      lagged demand.
    * ``lag_log_t56``, ``rolling_mean_log_t30``, ``cummean``.
    * ``price_change_t1``, ``price_change_t365``, ``rolling_price_std_t7/30``,
      ``sell_price_rel_diff``, ``sell_price_cumrel``.
    * ``year``, ``month``, ``week``, ``day``, ``dayofweek``, ``is_weekend``,
      ``dayofyear``, ``quarter``, ``semester``, ``season``; ``date`` is
      converted to datetime.
    """
    # Build each grouping once and reuse it; regrouping the full frame for every
    # feature repeats the same key factorisation.
    demand_by_id = data["demand"].groupby(data["id"])
    price_by_id = data["sell_price"].groupby(data["id"])
    price_by_store_item = data["sell_price"].groupby([data["store_id"], data["item_id"]])

    # rolling demand features.
    print("Starting rolling demand features")

    for shift in [28, 29, 30]:
        data[f"shift_t{shift}"] = demand_by_id.shift(shift)

    # Every rolling statistic is computed on demand lagged by 28 rows.
    rolling_plan = [
        ("std", [7, 14, 30]),
        ("sum", [7, 14, 30]),
        ("min", [7, 14, 30]),
        ("max", [7, 14, 30]),
        ("mean", [7, 14, 30, 90, 180]),
        ("kurt", [7, 14, 30]),
        ("skew", [7, 14, 30]),
    ]
    for stat, sizes in rolling_plan:
        for size in sizes:
            data[f"rolling_{stat}_t{size}"] = demand_by_id.transform(
                lambda x: getattr(x.shift(28).rolling(size), stat)()
            )
    print("Ending rolling demand features")

    # lag log feature creation
    print("Starting lag log feature creation")
    data['lag_log_t56'] = demand_by_id.transform(lambda x: np.log1p(x).shift(56))
    data['rolling_mean_log_t30'] = demand_by_id.transform(lambda x: np.log1p(x).shift(56).rolling(30, min_periods=1).mean())
    data['cummean'] = demand_by_id.transform(lambda x: x.shift(1).expanding().mean())
    print("Ending lag log feature creation")

    # price features
    print("Starting price features")
    data["shift_price_t1"] = price_by_id.shift(1)
    data["price_change_t1"] = (data["shift_price_t1"] - data["sell_price"]) / (data["shift_price_t1"])
    data["rolling_price_max_t365"] = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    data["price_change_t365"] = (data["rolling_price_max_t365"] - data["sell_price"]) / (data["rolling_price_max_t365"])
    data["rolling_price_std_t7"] = price_by_id.transform(lambda x: x.rolling(7).std())
    data["rolling_price_std_t30"] = price_by_id.transform(lambda x: x.rolling(30).std())
    # The two helper columns were only needed to derive the relative changes.
    data.drop(['rolling_price_max_t365', 'shift_price_t1'], inplace = True, axis = 1)

    data["sell_price_rel_diff"] = price_by_store_item.pct_change()
    sell_price_cummin = price_by_store_item.cummin()
    sell_price_cummax = price_by_store_item.cummax()
    # Position of the current price inside the range seen so far (0/0 -> NaN,
    # filled with 1 below).
    data["sell_price_cumrel"] = (data["sell_price"] - sell_price_cummin) / (sell_price_cummax - sell_price_cummin)
    data.fillna({"sell_price_rel_diff": 0, "sell_price_cumrel": 1}, inplace=True)
    del sell_price_cummin, sell_price_cummax
    print("Ending price features")

    # time features
    print("Starting time features")
    data['date'] = pd.to_datetime(data['date'])
    date_parts = data['date'].dt
    data['year'] = date_parts.year
    data['month'] = date_parts.month
    if hasattr(date_parts, 'isocalendar'):
        # `Series.dt.week` is gone from current pandas; isocalendar() gives the
        # same ISO week number but as a nullable UInt32, so cast to plain numpy.
        iso_week = date_parts.isocalendar().week
        data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # Older pandas without isocalendar().
        data['week'] = date_parts.week
    data['day'] = date_parts.day
    data['dayofweek'] = date_parts.dayofweek
    data["is_weekend"] = data["dayofweek"].isin([5, 6])
    data['dayofyear'] = date_parts.dayofyear
    data['quarter'] = date_parts.quarter
    data['semester'] = np.where(data.quarter.isin([1,2]),1,2)
    data['season'] = np.where(data.month.isin([3,4,5]),'spring',np.where(data.month.isin([6,7,8]),'summer',np.where(data.month.isin([9,10,11]),'fall','winter')))
    print("Ending time features")

    return data


In [None]:
def run_lgb(data):
    """Train a LightGBM regressor on ``data`` and predict the test period.

    Rows are split by ``date``: up to 2016-03-27 for training, the following 28
    days (through 2016-04-24) for validation, and everything later as the test
    set. The model is fitted on the columns named in the module-level
    ``features`` list with ``demand`` as target, using early stopping on the
    validation set. The feature-importance plot and the validation RMSE are
    shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing ``date``, ``demand`` and every column listed
        in ``features``.

    Returns
    -------
    pandas.DataFrame
        A copy of the test rows with ``demand`` replaced by the predictions.
    """
    # going to evaluate with the last 28 days
    x_train = data[data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = data[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    # Copy so the predictions can be written without assigning into a slice.
    test = data[(data['date'] > '2016-04-24')].copy()
    del data
    gc.collect()

    # define random hyperparameters
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10, 
        'colsample_bytree': 0.75}

    train_set = lgb.Dataset(x_train[features], y_train)
    # `reference` makes the validation data reuse the training set's binning.
    val_set = lgb.Dataset(x_val[features], y_val, reference = train_set)

    # Only local names are deleted here; deleting the module-level
    # `sales_train_validation` from inside the function raised UnboundLocalError.
    del x_train, y_train

    train_kwargs = {'num_boost_round': 2500, 'valid_sets': [train_set, val_set]}
    if hasattr(lgb, 'log_evaluation'):
        # Newer LightGBM: early stopping and logging are configured through
        # callbacks (the keyword forms were removed in 4.0).
        train_kwargs['callbacks'] = [lgb.early_stopping(50), lgb.log_evaluation(100)]
    else:
        # Older LightGBM without the log_evaluation callback.
        train_kwargs['early_stopping_rounds'] = 50
        train_kwargs['verbose_eval'] = 100
    model = lgb.train(params, train_set, **train_kwargs)

    lgb.plot_importance(model, figsize=(12, 6))
    plt.show()
    val_pred = model.predict(x_val[features])
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Our val rmse score is {val_score}')
    y_pred = model.predict(test[features])
    test['demand'] = y_pred
    return test

In [None]:
def predict(test, submission):
    """Write ``submission.csv`` from the long-format predictions in ``test``.

    ``test`` is pivoted to one row per ``id`` with one column per ``date``; the
    date columns are renamed F1..F28. Those rows are matched to ``submission``
    by ``id``, and the submission rows whose id contains 'evaluation' are
    appended unchanged. Nothing is returned.
    """
    horizon_cols = ['F' + str(step + 1) for step in range(28)]

    wide = (
        test[['id', 'date', 'demand']]
        .pivot(index = 'id', columns = 'date', values = 'demand')
        .reset_index()
    )
    wide.columns = ['id'] + horizon_cols

    # Evaluation rows are passed through as they appear in the sample submission.
    is_evaluation = submission['id'].map(lambda row_id: 'evaluation' in row_id).astype(bool)
    evaluation = submission[is_evaluation]

    # The inner join keeps the submission's row order for the predicted ids.
    validation = submission[['id']].merge(wide, on = 'id')

    pd.concat([validation, evaluation]).to_csv('submission.csv', index = False)

In [None]:
# define list of features
# The 28/29/30-day demand lags are listed under the names simple_fe actually
# creates (shift_t28, shift_t29, shift_t30); the former lag_t* names do not
# exist in the frame and made the column selection fail with KeyError.
features = [
    # identifiers
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    # calendar
    'year', 'month', 'week', 'day', 'dayofweek',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    # price and demand history
    'sell_price', 'shift_t28', 'shift_t29', 'shift_t30',
    'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90',
    'rolling_mean_t180', 'rolling_std_t30',
    'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30',
    'rolling_skew_t30', 'rolling_kurt_t30',
    # derived calendar flags
    'is_weekend', 'dayofyear', 'quarter', 'semester', 'season',
    # per-item PCA scores
    'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4',
]


In [None]:
def transform_train_and_eval(data):
    """Run the whole pipeline on ``data``: features, encoding, training, export.

    Steps, with ``reduce_mem_usage`` applied between them:

    1. ``add_pca_feats``: per-item PCA scores (needs the raw ``item_id``).
    2. ``simple_fe``: lag, rolling, price and calendar features.
    3. ``transform``: fill event labels and label-encode the categoricals.
    4. ``run_lgb``: train and predict the test period.
    5. ``predict``: write ``submission.csv`` using the module-level ``submission``.

    ``simple_fe`` runs before ``transform`` because ``transform`` encodes the
    'season' and 'is_weekend' columns that only ``simple_fe`` creates.

    Returns
    -------
    None
    """
    data = reduce_mem_usage(data)
    print("Adding PCA features")
    data = add_pca_feats(data)

    data = reduce_mem_usage(data)
    print("Feature creation starts")
    data = simple_fe(data)
    print(data.columns)
    data = reduce_mem_usage(data)
    print("Feature creation ends and Transformation Starts")
    data = transform(data)
    print(data.columns)

    # reduce memory for new features so we can train
    print("Transformation Ends and Reducing Memory")
    data = reduce_mem_usage(data)
    print("Model Run starts")
    test = run_lgb(data)
    print("Model Run ends and Prediction Starts")
    predict(test, submission)
    
# Run the full pipeline on the merged frame built earlier in the notebook.
transform_train_and_eval(data)