# Data Preprocessing

### Because the dataset is very large, we use a small subset of the data for the demonstration on the local server to save time, and the entire dataset for prediction on the Kaggle platform.

* ## Load the data and analysis libraries
> ### Load libraries

In [1]:
from  datetime import datetime, timedelta
# import gc
import numpy as np, pandas as pd
import time	


> ### There are some configuration parameters. Use the control parameter IS_LOCAL_DEMO below to switch between the Kaggle platform and a local test. On the Kaggle platform, we use day 1750 as the first day and load all rows of sales_train_validation for training. For the local test demo, we load only 10,000 rows and start from day 1850.

In [35]:
# Master switch for this notebook: True selects the quick local demo settings
# (later first day, capped row count, gzip output); False selects the full
# configuration intended for the Kaggle platform.
IS_LOCAL_DEMO = True
# IS_LOCAL_DEMO = False

In [36]:
# Index of the last day used for training.
last_day = 1913
# Calendar date of the first day to predict.
fday = datetime(2016, 4, 25)

# The local demo uses a later first day and caps the number of sales rows read;
# the full run starts earlier and reads every row (nrows=None means "all rows").
if IS_LOCAL_DEMO:
    first_day = 1850
    demo_rows = 10000
else:
    first_day = 1750
    demo_rows = None
print(demo_rows)


10000


> ### Because the dataset is so large, we load the columns with smaller data types to reduce memory pressure.

In [37]:
# Column dtypes requested when reading the price table.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}
prices = pd.read_csv("../datasets/sell_prices.csv", dtype=PRICE_DTYPES)

# Replace every categorical column with its int16 category code, shifted so
# that the smallest code present becomes 0.
# NOTE(review): store_id / item_id are later used as merge keys against the
# sales frame, whose codes are computed separately. The two encodings only
# line up if both frames produce the same category-to-code mapping -- confirm,
# especially when the sales file is read with a row cap.
price_cat_cols = [name for name, kind in PRICE_DTYPES.items() if kind == "category"]
for name in price_cat_cols:
    codes = prices[name].cat.codes.astype("int16")
    prices[name] = codes - codes.min()

In [38]:
# Column dtypes requested when reading the calendar table.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}
cal = pd.read_csv("../datasets/calendar.csv", dtype=CAL_DTYPES)
cal["date"] = pd.to_datetime(cal["date"])

# Replace every categorical column with its int16 category code, shifted so
# that the smallest code present becomes 0. pandas assigns code -1 to missing
# values, so any missing entry ends up as 0 after the shift.
cal_cat_cols = [name for name, kind in CAL_DTYPES.items() if kind == "category"]
for name in cal_cat_cols:
    codes = cal[name].cat.codes.astype("int16")
    cal[name] = codes - codes.min()

> ### Selecting only the columns and dates we need for the prediction can significantly reduce the computational pressure.

In [39]:
# Identifier columns, and the daily sales columns d_<first_day> .. d_<last_day>.
catcols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
numcols = [f"d_{day}" for day in range(first_day, last_day + 1)]

# Sales columns are read as float32; every identifier except "id" is read as
# a pandas category.
dtype = {
    **{day_col: "float32" for day_col in numcols},
    **{key_col: "category" for key_col in catcols if key_col != "id"},
}

# nrows=None (the non-demo setting) makes read_csv load every row.
train_data = pd.read_csv(
    "../datasets/sales_train_validation.csv",
    usecols=catcols + numcols,
    dtype=dtype,
    nrows=demo_rows,
)
print(train_data.shape)
train_data.head(3)

(10000, 70)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1850,d_1851,d_1852,d_1853,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0,4.0,0.0,1.0,...,1.0,3.0,0.0,1.0,1.0,1.0,3.0,0.0,1.0,1.0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1.0,0.0,0.0,0.0,...,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


> ### Using OneHotEncoder would probably give a better result. However, it would make the data frame several times larger. As a compromise, we encode the categorical feature values as arbitrary integer codes.

In [40]:
# Turn each categorical identifier column (everything except "id") into its
# int16 category code, shifted so the smallest code present is 0.
for key_col in (name for name in catcols if name != "id"):
    codes = train_data[key_col].cat.codes.astype("int16")
    train_data[key_col] = codes - codes.min()

# Append one all-NaN column for each of the 28 days that follow last_day.
future_cols = [f"d_{day}" for day in range(last_day + 1, last_day + 28 + 1)]
for future_col in future_cols:
    train_data[future_col] = np.nan

> ### Change the data frame structure and merge them.

In [41]:
# Reshape from one column per day to one row per (series, day): the day label
# goes into "d" and the value into "sales".
day_cols = [name for name in train_data.columns if name.startswith("d_")]
train_data = train_data.melt(
    id_vars=catcols,
    value_vars=day_cols,
    var_name="d",
    value_name="sales",
)

# Attach calendar attributes by day label, then prices by store / item / week.
# Both merges use the default inner join, so rows without a matching calendar
# entry or price record are dropped.
train_data = pd.merge(train_data, cal, on="d", copy=False)
train_data = pd.merge(
    train_data,
    prices,
    on=["store_id", "item_id", "wm_yr_wk"],
    copy=False,
)

In [42]:
def _day_number(label):
    """Return the integer that follows the first underscore, e.g. "d_1" -> 1."""
    return int(label.split("_")[1])


# Replace the textual day labels in "d" with their integer day numbers.
train_data["d"] = train_data["d"].map(_day_number)


* ## Dataset Persistence
>### Saving the data frame to the hard drive and then shutting down this notebook releases the memory it occupied.
>### We save the data frame as "train_data.csv.gz" for the local demo and "train_data.csv" for the Kaggle platform.

In [None]:
# Local demo: gzip-compressed CSV. Otherwise (Kaggle): plain CSV.
# The index is not written in either case.
if IS_LOCAL_DEMO:
    out_path, out_compression = "train_data.csv.gz", "gzip"
else:
    out_path, out_compression = "train_data.csv", None
train_data.to_csv(out_path, compression=out_compression, index=False)


