In [1]:
import os
import gc
import warnings

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from deepseries.models import WaveNet

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
calendar = pd.read_csv("./data/calendar.csv").pipe(reduce_mem_usage)
price = pd.read_csv("./data/sell_prices.csv").pipe(reduce_mem_usage)
sale = pd.read_csv("./data/sales_train_validation.csv").pipe(reduce_mem_usage)
submission = pd.read_csv("./data/sample_submission.csv").pipe(reduce_mem_usage)

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Mem. usage decreased to  2.09 Mb (84.5% reduction)


In [5]:
NUM_ITEMS = sale.shape[0]  # 30490
DAYS_PRED = submission.shape[1] - 1  # 28

In [14]:
id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
product = train[id_columns]

vals = submission[submission["id"].str.endswith("validation")]
evals = submission[submission["id"].str.endswith("evaluation")]

vals.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)]
evals.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)]
vals = vals.merge(product, how="left", on="id")
evals = evals.merge(product, how="left", on="id")

In [28]:
days_train = [f"d_{d}" for d in range(1, 1914)]
days_val = [f"d_{d}" for d in range(1914, 1914+DAYS_PRED)]
days_eval = [f"d_{d}" for d in range(1942, 1942+DAYS_PRED)]

In [21]:
for d in range(1914, 1942+DAYS_PRED):
    sale[f"d_{d}"] = np.nan

In [32]:
id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sale = sale.melt(id_vars=id_cols, var_name="d", value_name="demand")
sale.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0


In [36]:
calendar = calendar.rename({"snap_CA": "CA", "snap_TX": "TX", "snap_WI": "WI"}, axis=1)
calendar = calendar.melt(id_vars=calendar.columns[:-3], var_name="state_id", value_name="snap")

In [37]:
for col in ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]:
    calendar[col] = LabelEncoder().fit_transform(calendar[col].fillna("None"))
calendar["date"] = pd.to_datetime(calendar.date)
calendar["week"] = calendar.date.dt.week

In [38]:
merge = sale.merge(calendar.drop(["weekday"], axis=1), on=["d", "state_id"], how="left")
merge = merge.merge(price, on=["store_id", "item_id", "wm_yr_wk"], how="left")

In [42]:
merge.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,date,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,week,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,


In [46]:
merge[merge.id.str.contains("HOBBIES_1_001_CA_1")].id.unique()

array(['HOBBIES_1_001_CA_1_validation'], dtype=object)

In [50]:
for col in id_cols:
    merge[col] = LabelEncoder().fit_transform(merge[col])

In [52]:
merge = reduce_mem_usage(merge)

Mem. usage decreased to 2748.18 Mb (62.8% reduction)


In [56]:
merge.sell_price.describe()

count    4.773540e+07
mean              NaN
std      0.000000e+00
min      1.000214e-02
25%      2.179688e+00
50%      3.470703e+00
75%      5.839844e+00
max      1.073125e+02
Name: sell_price, dtype: float64

In [55]:
merge.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,date,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,week,sell_price
0,14370,1437,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
1,14380,1438,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
2,14390,1439,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
3,14400,1440,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
4,14410,1441,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,


In [58]:
merge

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,date,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,week,sell_price
0,14370,1437,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
1,14380,1438,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
2,14390,1439,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
3,14400,1440,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
4,14410,1441,3,1,0,0,d_1,0.0,2011-01-29,11101,1,1,2011,19,2,3,1,0,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60034805,14329,1432,2,0,9,2,d_1969,,2016-06-19,11621,2,6,2016,16,4,2,0,0,24,2.980469
60034806,14339,1433,2,0,9,2,d_1969,,2016-06-19,11621,2,6,2016,16,4,2,0,0,24,2.480469
60034807,14349,1434,2,0,9,2,d_1969,,2016-06-19,11621,2,6,2016,16,4,2,0,0,24,3.980469
60034808,14359,1435,2,0,9,2,d_1969,,2016-06-19,11621,2,6,2016,16,4,2,0,0,24,1.280273


In [57]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class SeriesData(Dataset):
    
    def __init__(self,)