# 1. Data Preprocessing

Please download the raw dataset from https://www.kaggle.com/c/m5-forecasting-accuracy/data into the `data` folder (the code below reads it from `../data/`).

In [1]:
import os
import pickle

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

## Read raw data set

Following the approach in https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm, we downcast numeric columns to reduce the in-memory size of the data.

In [8]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its numeric columns downcast.

    Notes
    -----
    Float columns are chosen by value range only, so float16 may be selected
    even though it keeps roughly three significant decimal digits. Values that
    need more precision are rounded by this function.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # An empty column yields NaN for min/max, fails every comparison and
            # therefore keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # All-NaN or infinite columns fail both range checks and end up float64.
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [9]:
def read_data(data_dir='../data'):
    """Load the raw M5 CSV files and downcast their numeric columns.

    Parameters
    ----------
    data_dir : str, default '../data'
        Directory containing ``calendar.csv``, ``sell_prices.csv``,
        ``sales_train_validation.csv``, ``sales_train_evaluation.csv`` and
        ``sample_submission.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, sell_prices, sales_train_val, submission, sales_test_val)``.
        Every frame except ``submission`` has been passed through
        ``reduce_mem_usage``.
    """
    print('Reading files...')
    calendar = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
    calendar = reduce_mem_usage(calendar)
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))

    sell_prices = pd.read_csv(os.path.join(data_dir, 'sell_prices.csv'))
    sell_prices = reduce_mem_usage(sell_prices)
    print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))

    # This read had been commented out while the frame was still returned,
    # which made every call fail with NameError.
    sales_train_val = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
    sales_train_val = reduce_mem_usage(sales_train_val)
    print('Sales train validation has {} rows and {} columns'.format(sales_train_val.shape[0], sales_train_val.shape[1]))

    sales_test_val = pd.read_csv(os.path.join(data_dir, 'sales_train_evaluation.csv'))
    sales_test_val = reduce_mem_usage(sales_test_val)
    print('Sales evaluation has {} rows and {} columns'.format(sales_test_val.shape[0], sales_test_val.shape[1]))

    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    return calendar, sell_prices, sales_train_val, submission, sales_test_val

In [10]:
 # Load every raw table in one call; the names follow read_data's return order.
 (
     calendar,
     sell_prices,
     sales_train_val,
     submission,
     sales_test_val,
 ) = read_data()

Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns


In [11]:
# Persist the downcast frames for later use.
# to_pickle does not create missing directories, so make sure the output folder exists first.
os.makedirs('dataset', exist_ok=True)
calendar.to_pickle('dataset/calendar.pkl')
submission.to_pickle('dataset/submission.pkl')
sell_prices.to_pickle('dataset/sell_price.pkl')

### Category variables

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def encode_categorical(df, cols):
    """Integer-encode the listed columns of ``df``, one fresh LabelEncoder per column.

    Missing values are not preserved: they are replaced by the string 'nan'
    before encoding, so they receive an integer code like any other label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is modified in place.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by its codes.
    """
    for column in cols:
        filled = df[column].fillna('nan')
        codes = LabelEncoder().fit_transform(filled)
        # Re-wrap with the frame's own index so the assignment aligns row by row.
        df[column] = pd.Series(codes, index=df.index)
    return df


# Encode the calendar's event columns, then downcast the frame again.
calendar = reduce_mem_usage(
    encode_categorical(
        calendar,
        ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],
    )
)

# Same treatment for the identifier columns of the sales frame.
sales_test_val = reduce_mem_usage(
    encode_categorical(
        sales_test_val,
        ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
    )
)

# And for the two key columns of the price table.
sell_prices = reduce_mem_usage(
    encode_categorical(sell_prices, ["item_id", "store_id"])
)

# Keep one row per distinct combination of id and its component columns
# (item, department, category, store, state), taken from sales_test_val.
product = sales_test_val.loc[
    :, ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
].drop_duplicates()

In [None]:
# Persist the id lookup table, then preview it.
# to_pickle does not create missing directories, so make sure the output folder exists first.
os.makedirs('dataset', exist_ok=True)
product.to_pickle('dataset/product.pkl')
product.head()

In [None]:
# No visible cell in this notebook writes dataset/sales_test_val.pkl, so on a fresh
# checkout the reload would fail. When the file is missing, persist the in-memory
# encoded frame first; an existing file is loaded as before.
# NOTE(review): an existing pickle may be stale relative to the frame built above — confirm intent.
if not os.path.exists('dataset/sales_test_val.pkl'):
    os.makedirs('dataset', exist_ok=True)
    sales_test_val.to_pickle('dataset/sales_test_val.pkl')
sales_test_val = pd.read_pickle('dataset/sales_test_val.pkl')

In [None]:
# EOF