In [1]:
import gc
import pathlib
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Project layout, relative to the directory the notebook runs from.
root_dir_path = pathlib.Path("..")
src_dir_path = root_dir_path / "src"
data_dir_path = root_dir_path / "data"

# Raw input CSV files.
raw_dir_path = data_dir_path / "raw"
calendar_path, sales_train_validation_path, sell_prices_path, sample_submission_path = (
    raw_dir_path / f"{stem}.csv"
    for stem in ("calendar", "sales_train_validation", "sell_prices", "sample_submission")
)

# Processed parquet outputs.
processed_dir_path = data_dir_path / "processed"
train_path, test_path = (
    processed_dir_path / f"{split}.parquet" for split in ("train", "test")
)

In [4]:
# Put ../src on the import path so modules there (presumably `utils`) can be imported.
sys.path.append(str(src_dir_path))

In [5]:
from utils import *

In [6]:
# train_days: index of the last observed day (the sales table's day columns end at d_1913).
# test_days: length of one forecast window; also the offset applied between the
# validation and evaluation day labels when the test frame is built.
train_days, test_days = 1913, 28

## calendar

In [7]:
# Load the calendar table, parsing the `date` column as datetimes.
calendar = pd.read_csv(calendar_path, parse_dates=["date"])

In [8]:
# Discard calendar fields that are not carried into the processed data
# (presumably all derivable from `date` — confirm before relying on that).
calendar = calendar.drop(columns=["weekday", "wday", "month", "year"])

In [9]:
# Replace each event column with integer codes assigned in sorted label order.
# pandas.factorize encodes missing values as -1.
for event_col in ("event_name_1", "event_type_1", "event_name_2", "event_type_2"):
    event_codes = pd.factorize(calendar[event_col], sort=True)[0]
    calendar[event_col] = event_codes

In [10]:
# `reduce_memory_usage` arrives via the star-import from `utils`. The dtypes reported by
# calendar.info() afterwards suggest it downcasts numeric columns in place — TODO confirm.
reduce_memory_usage(calendar)

In [11]:
# Inspect the resulting column dtypes and memory footprint.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int16         
 2   d             1969 non-null   object        
 3   event_name_1  1969 non-null   int8          
 4   event_type_1  1969 non-null   int8          
 5   event_name_2  1969 non-null   int8          
 6   event_type_2  1969 non-null   int8          
 7   snap_CA       1969 non-null   int8          
 8   snap_TX       1969 non-null   int8          
 9   snap_WI       1969 non-null   int8          
dtypes: datetime64[ns](1), int16(1), int8(7), object(1)
memory usage: 48.2+ KB


## sales_train_validation

In [12]:
# Load the wide sales table: identifier columns followed by per-day columns ending at d_1913.
sales_train_validation = pd.read_csv(sales_train_validation_path)

In [13]:
# Replace the hierarchy identifiers with integer codes assigned in sorted label order.
# NOTE(review): store_id / item_id codes are later used as join keys against sell_prices,
# which is factorized independently; the codes only line up while both frames contain
# exactly the same label sets — confirm for the data in use.
for id_col in ("store_id", "item_id", "dept_id", "cat_id", "state_id"):
    id_codes = pd.factorize(sales_train_validation[id_col], sort=True)[0]
    sales_train_validation[id_col] = id_codes

In [14]:
# Helper from `utils`; appears to downcast numeric columns in place (see the int8/int16
# dtypes reported by info() afterwards) — TODO confirm against the helper's source.
reduce_memory_usage(sales_train_validation)

In [15]:
# Inspect the resulting dtypes and memory footprint of the wide sales table.
sales_train_validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int16(1289), int8(629), object(1)
memory usage: 93.5+ MB


In [16]:
# Sanity check: count rows whose (item_id, store_id) pair already appeared in an earlier row.
# A result of 0 means each row is a distinct item/store combination.
sales_train_validation.duplicated(subset=["item_id", "store_id"]).sum()

0

In [17]:
# Count ids that carry the "_validation" suffix; expected to equal the number of rows.
sales_train_validation["id"].str.endswith("_validation").sum()

30490

In [18]:
# Count ids that carry the "_evaluation" suffix; none are expected in this file.
sales_train_validation["id"].str.endswith("_evaluation").sum()

0

## sell_prices

In [19]:
# Load the price table: one sell_price per (store_id, item_id, wm_yr_wk).
sell_prices = pd.read_csv(sell_prices_path)

In [20]:
# Replace the join identifiers with integer codes assigned in sorted label order.
# NOTE(review): these codes are matched against the codes produced separately for the
# sales and submission frames; they agree only if every frame holds the same set of
# store_id / item_id labels — confirm for the data in use.
for key_col in ("store_id", "item_id"):
    key_codes = pd.factorize(sell_prices[key_col], sort=True)[0]
    sell_prices[key_col] = key_codes

In [21]:
# Helper from `utils`; appears to downcast numeric columns in place (info() afterwards
# reports int8/int16/float32) — TODO confirm against the helper's source.
reduce_memory_usage(sell_prices)

In [22]:
# Inspect the resulting dtypes and memory footprint of the price table.
sell_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    int8   
 1   item_id     int16  
 2   wm_yr_wk    int16  
 3   sell_price  float32
dtypes: float32(1), int16(2), int8(1)
memory usage: 58.7 MB


## sample_submission

In [23]:
# Load the submission template: an `id` column followed by forecast columns F1, F2, ...
sample_submission = pd.read_csv(sample_submission_path)

In [24]:
# Recover the hierarchy columns from the submission ids, which the patterns below treat as
# "<item_id>_<store_id>_<suffix>" with item_id shaped "<dept_id>_<digits>",
# dept_id shaped "<cat_id>_<digits>" and store_id shaped "<state_id>_<digits>".
sample_submission[["item_id", "store_id"]] = sample_submission["id"].str.extract(
    r"(\w+_\d+_\d+)_(\w+_\d+)_\w+"
)
# Single-group extracts: expand=False yields a Series, so the column assignment does not
# depend on pandas coercing a one-column DataFrame.
sample_submission["dept_id"] = sample_submission["item_id"].str.extract(
    r"(\w+_\d+)_\d+", expand=False
)
sample_submission["cat_id"] = sample_submission["dept_id"].str.extract(
    r"(\w+)_\d+", expand=False
)
sample_submission["state_id"] = sample_submission["store_id"].str.extract(
    r"(\w+)_\d+", expand=False
)

# Fail fast on ids that did not match: an unmatched id yields NaN here, which the
# factorize step that follows would silently turn into the code -1.
assert (
    sample_submission[["item_id", "store_id", "dept_id", "cat_id", "state_id"]]
    .notna()
    .all()
    .all()
), "some submission ids did not match the expected id pattern"

In [25]:
# Replace the extracted hierarchy labels with integer codes assigned in sorted label order.
# NOTE(review): store_id / item_id codes are later joined against the independently
# factorized sell_prices codes; they agree only while both frames contain the same
# label sets — confirm for the data in use.
for id_col in ("store_id", "item_id", "dept_id", "cat_id", "state_id"):
    id_codes = pd.factorize(sample_submission[id_col], sort=True)[0]
    sample_submission[id_col] = id_codes

In [26]:
# Helper from `utils`; appears to downcast numeric columns in place (the F columns are
# reported as int8 afterwards) — TODO confirm against the helper's source.
reduce_memory_usage(sample_submission)

In [27]:
# Inspect the resulting dtypes and memory footprint of the submission template.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60980 entries, 0 to 60979
Data columns (total 34 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        60980 non-null  object
 1   F1        60980 non-null  int8  
 2   F2        60980 non-null  int8  
 3   F3        60980 non-null  int8  
 4   F4        60980 non-null  int8  
 5   F5        60980 non-null  int8  
 6   F6        60980 non-null  int8  
 7   F7        60980 non-null  int8  
 8   F8        60980 non-null  int8  
 9   F9        60980 non-null  int8  
 10  F10       60980 non-null  int8  
 11  F11       60980 non-null  int8  
 12  F12       60980 non-null  int8  
 13  F13       60980 non-null  int8  
 14  F14       60980 non-null  int8  
 15  F15       60980 non-null  int8  
 16  F16       60980 non-null  int8  
 17  F17       60980 non-null  int8  
 18  F18       60980 non-null  int8  
 19  F19       60980 non-null  int8  
 20  F20       60980 non-null  int8  
 21  F21       60

## train

In [28]:
# Reshape wide -> long: one row per (series, day). The day column name goes into `d`
# and the cell value into `demand`.
train = sales_train_validation.melt(
    id_vars=["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"],
    var_name="d",
    value_name="demand",
)

In [29]:
# The wide frame is not used after the melt; drop the reference so its memory can be reclaimed.
del sales_train_validation

In [30]:
# Run a garbage-collection pass now; the displayed value is the number of objects collected.
gc.collect()

76

In [31]:
# Attach calendar attributes by day label, then prices by (store_id, item_id, wm_yr_wk).
# Both are many-to-one lookups. `validate` makes pandas raise MergeError if the right-hand
# keys are ever duplicated, instead of silently multiplying the ~58M training rows.
# NOTE(review): the price join matches integer codes that were factorized independently
# per frame, so it is only correct while both frames contain the same label sets.
train = train.merge(calendar, copy=False, how="left", on="d", validate="many_to_one")
train = train.merge(
    sell_prices,
    copy=False,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    validate="many_to_one",
)

In [32]:
# The week key was only needed for the price join; remove it in place.
del train["wm_yr_wk"]

In [33]:
# Order rows by date, then store, then item, and renumber the index from 0.
train.sort_values(["date", "store_id", "item_id"], ignore_index=True, inplace=True)

In [34]:
# Inspect the final training schema and memory footprint before writing it out.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 17 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   store_id      int8          
 3   dept_id       int8          
 4   cat_id        int8          
 5   state_id      int8          
 6   d             object        
 7   demand        int16         
 8   date          datetime64[ns]
 9   event_name_1  int8          
 10  event_type_1  int8          
 11  event_name_2  int8          
 12  event_type_2  int8          
 13  snap_CA       int8          
 14  snap_TX       int8          
 15  snap_WI       int8          
 16  sell_price    float32       
dtypes: datetime64[ns](1), float32(1), int16(2), int8(11), object(2)
memory usage: 2.3+ GB


In [35]:
# Make sure the output directory exists (e.g. on a fresh checkout); to_parquet does not
# create missing parent directories.
train_path.parent.mkdir(parents=True, exist_ok=True)
train.to_parquet(train_path)

## test

In [36]:
# Reshape the submission template wide -> long: one row per (id, forecast column).
# The forecast column name goes into `d` and the placeholder value into `demand`.
test = sample_submission.melt(
    id_vars=["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"],
    var_name="d",
    value_name="demand",
)

In [37]:
# The wide template is not used after the melt; drop the reference so its memory can be reclaimed.
del sample_submission

In [38]:
# Run a garbage-collection pass now; the displayed value is the number of objects collected.
gc.collect()

60

In [39]:
# The template's placeholder values are not part of the test features; remove them in place.
del test["demand"]

In [40]:
# Translate forecast column names ("F<k>") into calendar day labels ("d_<n>"):
#   validation rows: n = train_days + k
#   evaluation rows: n = train_days + test_days + k
is_evaluation = test["id"].str.endswith("_evaluation")
# Strip the leading "F", read the horizon as an integer and shift it past the training days.
intermediate = test["d"].str[1:].astype("int32") + train_days
# Evaluation rows cover the window that follows the validation window.
intermediate.loc[is_evaluation] += test_days
test["d"] = "d_" + intermediate.astype(str)

In [41]:
# Release the temporaries used to rebuild the day labels.
del is_evaluation, intermediate

In [42]:
# Run a garbage-collection pass now; the displayed value is the number of objects collected.
gc.collect()

89

In [43]:
# Attach calendar attributes by day label, then prices by (store_id, item_id, wm_yr_wk).
# Both are many-to-one lookups. `validate` makes pandas raise MergeError if the right-hand
# keys are ever duplicated, instead of silently multiplying the test rows.
# NOTE(review): the price join matches integer codes that were factorized independently
# per frame, so it is only correct while both frames contain the same label sets.
test = test.merge(calendar, copy=False, how="left", on="d", validate="many_to_one")
test = test.merge(
    sell_prices,
    copy=False,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    validate="many_to_one",
)

In [44]:
# The week key was only needed for the price join; remove it in place.
del test["wm_yr_wk"]

In [45]:
# Order rows by date, then store, then item, and renumber the index from 0.
test.sort_values(["date", "store_id", "item_id"], ignore_index=True, inplace=True)

In [46]:
# Inspect the final test schema and memory footprint before writing it out.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1707440 entries, 0 to 1707439
Data columns (total 16 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   store_id      int8          
 3   dept_id       int8          
 4   cat_id        int8          
 5   state_id      int8          
 6   d             object        
 7   date          datetime64[ns]
 8   event_name_1  int8          
 9   event_type_1  int8          
 10  event_name_2  int8          
 11  event_type_2  int8          
 12  snap_CA       int8          
 13  snap_TX       int8          
 14  snap_WI       int8          
 15  sell_price    float32       
dtypes: datetime64[ns](1), float32(1), int16(1), int8(11), object(2)
memory usage: 66.8+ MB


In [47]:
# Make sure the output directory exists (e.g. on a fresh checkout); to_parquet does not
# create missing parent directories.
test_path.parent.mkdir(parents=True, exist_ok=True)
test.to_parquet(test_path)