 # Feature Engineering

 ## Merge Sales and Prices and Calendar Data

In [1]:
import pandas as pd
import dask.dataframe as dd

In [2]:
# Read datasets
df_cal = pd.read_csv("../datasets/calendar.csv")
df_cal = dd.from_pandas(df_cal, npartitions=30)

df_prices = pd.read_csv("../datasets/prices.csv")
df_prices = dd.from_pandas(df_prices, npartitions=30)

df_train = pd.read_csv("../datasets/train.csv")

# Merge prices and calendar date datasets
df_prices_cal = dd.merge(df_cal, df_prices, on = "wm_yr_wk", how = "left")

In [3]:
df_prices_cal["id"] = df_prices_cal["item_id"].add("_").add(df_prices_cal["store_id"])
df_prices_cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,store_id,item_id,sell_price,id
0,2014-03-29,11409,Saturday,1,3,2014,d_1156,East_2,Food_2_229,2.18,Food_2_229_East_2
1,2014-03-29,11409,Saturday,1,3,2014,d_1156,East_2,Food_2_231,2.5,Food_2_231_East_2
2,2014-03-29,11409,Saturday,1,3,2014,d_1156,East_2,Food_2_233,4.48,Food_2_233_East_2
3,2014-03-29,11409,Saturday,1,3,2014,d_1156,East_2,Food_2_234,2.88,Food_2_234_East_2
4,2014-03-29,11409,Saturday,1,3,2014,d_1156,East_2,Food_2_236,2.98,Food_2_236_East_2


 Merge prices and calendar dates with train datasets:

In [4]:
df_prices_cal.dtypes

date           object
wm_yr_wk        int64
weekday        object
wday            int64
month           int64
year            int64
d              object
store_id       object
item_id        object
sell_price    float64
id             object
dtype: object

In [5]:
df_prices_cal["date"] = dd.to_datetime(df_prices_cal["date"])
df_prices_cal["weekday"] = df_prices_cal["weekday"].astype("string")
df_prices_cal["d"] = df_prices_cal["d"].astype("string")
df_prices_cal["store_id"] = df_prices_cal["store_id"].astype("string")
df_prices_cal["item_id"] = df_prices_cal["item_id"].astype("string")
df_prices_cal["id"] = df_prices_cal["id"].astype("string")

In [6]:
# Pivot df_train longer to match df_prices_cal
df_long_train = pd.wide_to_long(df_train, stubnames = "d_", j = "day_number", i = "id")
df_long_train = df_long_train.reset_index()
df_long_train = dd.from_pandas(df_long_train, npartitions=30)

# Rename the units sold column appropriately
df_long_train['units_sold'] = df_long_train['d_']
del df_long_train['d_']

In [7]:
df_long_train.head()

Unnamed: 0,id,day_number,category_id,store_id,item_id,subcat_id,region_id,units_sold
0,Beauty_1_001_East_1,1,Beauty,East_1,Beauty_1_001,Beauty_1,East,0
1,Beauty_1_002_East_1,1,Beauty,East_1,Beauty_1_002,Beauty_1,East,0
2,Beauty_1_003_East_1,1,Beauty,East_1,Beauty_1_003,Beauty_1,East,0
3,Beauty_1_004_East_1,1,Beauty,East_1,Beauty_1_004,Beauty_1,East,0
4,Beauty_1_005_East_1,1,Beauty,East_1,Beauty_1_005,Beauty_1,East,0


In [8]:
# Rename `d_` in `df_prices_cal` to `day_number`
df_prices_cal['day_number'] = df_prices_cal['d']
del df_prices_cal['d']

# Remove the `d_` from `day_number`
df_prices_cal['day_number'] = df_prices_cal['day_number'].str.replace("d_", "")
df_prices_cal["day_number"] = df_prices_cal["day_number"].astype(int)

In [9]:
df_prices_cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,store_id,item_id,sell_price,id,day_number
0,2014-03-29,11409,Saturday,1,3,2014,East_1,Cleaning_1_455,2.97,Cleaning_1_455_East_1,1156
1,2014-03-29,11409,Saturday,1,3,2014,East_1,Cleaning_1_456,6.97,Cleaning_1_456_East_1,1156
2,2014-03-29,11409,Saturday,1,3,2014,East_1,Cleaning_1_457,9.97,Cleaning_1_457_East_1,1156
3,2014-03-29,11409,Saturday,1,3,2014,East_1,Cleaning_1_458,2.48,Cleaning_1_458_East_1,1156
4,2014-03-29,11409,Saturday,1,3,2014,East_1,Cleaning_1_459,0.98,Cleaning_1_459_East_1,1156


In [10]:
df_prices_cal = df_prices_cal.drop(['item_id', 'store_id'], axis=1)
df_prices_cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,sell_price,id,day_number
0,2014-03-29,11409,Saturday,1,3,2014,8.97,Cleaning_2_076_East_3,1156
1,2014-03-29,11409,Saturday,1,3,2014,5.34,Cleaning_2_078_East_3,1156
2,2014-03-29,11409,Saturday,1,3,2014,4.47,Cleaning_2_079_East_3,1156
3,2014-03-29,11409,Saturday,1,3,2014,8.98,Cleaning_2_080_East_3,1156
4,2014-03-29,11409,Saturday,1,3,2014,4.47,Cleaning_2_082_East_3,1156


In [11]:
# Merge training data with calendar and price data
df_long_train = df_long_train.merge(df_prices_cal, on = ['day_number', 'id'], how = 'left')
df_long_train.head()

+------------------------------+------------+-------------+
| Merge columns                | left dtype | right dtype |
+------------------------------+------------+-------------+
| ('day_number', 'day_number') | int64      | int32       |
| ('id', 'id')                 | object     | string      |
+------------------------------+------------+-------------+
Cast dtypes explicitly to avoid unexpected results.
  ).format(col_tb)


Unnamed: 0,id,day_number,category_id,store_id,item_id,subcat_id,region_id,units_sold,date,wm_yr_wk,weekday,wday,month,year,sell_price
0,Food_2_219_Central_1,1024,Food,Central_1,Food_2_219,Food_2,Central,0,2013-11-17,11343.0,Sunday,2.0,11.0,2013.0,2.68
1,Food_2_223_Central_1,1024,Food,Central_1,Food_2_223,Food_2,Central,0,2013-11-17,11343.0,Sunday,2.0,11.0,2013.0,2.18
2,Food_2_245_Central_1,1024,Food,Central_1,Food_2_245,Food_2,Central,1,2013-11-17,11343.0,Sunday,2.0,11.0,2013.0,2.98
3,Food_2_308_Central_1,1024,Food,Central_1,Food_2_308,Food_2,Central,0,2013-11-17,11343.0,Sunday,2.0,11.0,2013.0,2.42
4,Food_2_319_Central_1,1024,Food,Central_1,Food_2_319,Food_2,Central,1,2013-11-17,11343.0,Sunday,2.0,11.0,2013.0,2.42


In [12]:
df_long_train.dtypes

id                     object
day_number              int64
category_id            object
store_id               object
item_id                object
subcat_id              object
region_id              object
units_sold              int64
date           datetime64[ns]
wm_yr_wk              float64
weekday                string
wday                  float64
month                 float64
year                  float64
sell_price            float64
dtype: object

In [13]:
# Reassign dtypes to save space
# df_long_train["id"] = df_long_train["id"].astype("string")
# df_long_train["region_id"] = df_long_train["region_id"].astype("string")
# df_long_train["category_id"] = df_long_train["category_id"].astype("string")
# df_long_train["subcat_id"] = df_long_train["subcat_id"].astype("string")
# df_long_train["store_id"] = df_long_train["store_id"].astype("string")
# df_long_train["item_id"] = df_long_train["item_id"].astype("string")
# df_long_train["wm_yr_wk"] = df_long_train["wm_yr_wk"].astype(int)
# df_long_train["wday"] = df_long_train["wday"].astype(int)
# df_long_train["month"] = df_long_train["month"].astype(int)
# df_long_train["year"] = df_long_train["year"].astype(int)

In [13]:
df_long_train_repart = df_long_train.repartition(npartitions = 100)

In [14]:
# Save merged dataset:
# df_long_train_repart.to_parquet("../datasets/long_train_parquet", write_index = False)

(None,)

### Start from here to resample dataset for training ML models

In [None]:
df_long_train_repart = dd.read_parquet("../datasets/long_train_parquet/")

 Subset randomly on whole dataset

In [None]:
# Drop NAs
sampled_df = df_long_train_repart.dropna()

In [None]:
# Random sample 1% of entire dataframe
# NOTE: Change frac to get bigger subsets of the data
sampled_df_subset = sampled_df.sample(frac = 0.01, random_state = 42)
sampled_df_subset = sampled_df_subset.compute()

sampled_df_subset.head()

Unnamed: 0,id,day_number,category_id,store_id,item_id,subcat_id,region_id,units_sold,date,wm_yr_wk,weekday,wday,month,year,sell_price
160774,Food_2_374_East_4,1119,Food,East_4,Food_2_374,Food_2,East,1,2014-02-20,11403.0,Thursday,6.0,2.0,2014.0,2.0
566575,Food_3_310_East_1,1583,Food,East_1,Food_3_310,Food_3,East,2,2015-05-30,11518.0,Saturday,1.0,5.0,2015.0,1.0
150132,Beauty_2_030_West_2,1108,Beauty,West_2,Beauty_2_030,Beauty_2,West,0,2014-02-09,11402.0,Sunday,2.0,2.0,2014.0,0.97
124020,Food_3_067_West_2,1210,Food,West_2,Food_3_067,Food_3,West,0,2014-05-22,11416.0,Thursday,6.0,5.0,2014.0,3.98
268828,Food_2_155_East_1,1290,Food,East_1,Food_2_155,Food_2,East,0,2014-08-10,11428.0,Sunday,2.0,8.0,2014.0,2.5


In [None]:
# Export to csv
# sampled_df_subset.to_csv("../datasets/long_train_subset.csv", index = False)