In [1]:
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the raw training extract and discard two columns that are not used
# downstream: "id" and "Unnamed: 0" (presumably an index written out by an
# earlier to_csv call -- verify against the producing notebook).
dataset_train = (
    pd.read_csv("../../data/intermediante_files/raw_dataset_train.csv")
    .drop(columns=["id", "Unnamed: 0"])
)

In [3]:
# Display the (rows, columns) shape of the freshly loaded training frame.
dataset_train.shape

(61352740, 12)

# Reindex dates

#### Train

In [4]:
def reindex(df, date_column, sales_column, fill_dict):
    """Expand ``df`` to one row per calendar day and fill the new gaps.

    The frame is indexed by ``date_column`` (parsed to datetimes), reindexed
    onto a daily range running from its earliest to its latest date, and the
    rows created for missing days are filled according to ``fill_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; ``date_column`` must hold unique, parseable dates.
    date_column : str
        Name of the column holding the dates.
    sales_column : str
        Unused; kept so existing callers do not break.
    fill_dict : dict[str, str]
        Maps a column name to a rule of the form ``"kind:arg"``:

        * ``"value:<v>"``  -- fill gaps with the constant ``<v>``. When the
          column is numeric, ``<v>`` is converted to a number so the column
          keeps its numeric dtype; otherwise it is used as the given string.
        * ``"method:<m>"`` -- propagate neighbouring values, where ``<m>`` is
          ``ffill``/``pad`` or ``bfill``/``backfill``.

    Returns
    -------
    pandas.DataFrame
        The daily frame with a fresh integer index. The dates end up in a
        column named ``"index"`` because the daily range carries no name.

    Raises
    ------
    ValueError
        If a rule is not of the form ``"kind:arg"``, or names an unknown
        kind or fill method.
    """
    new_df = df.set_index(date_column)
    new_df.index = pd.to_datetime(new_df.index)

    # Take the bounds from the parsed index so they are chronological even
    # when the raw column holds strings (string min/max is lexicographic).
    periods = pd.date_range(
        start=new_df.index.min(), end=new_df.index.max(), freq="D"
    )
    new_df = new_df.reindex(periods)

    for col, rule in fill_dict.items():
        # partition() splits on the first colon only, so a fill value may
        # itself contain colons.
        kind, sep, arg = rule.partition(":")
        if not sep:
            raise ValueError(
                f"fill rule for {col!r} must look like 'kind:arg', got {rule!r}"
            )

        if kind == "value":
            fill_value = arg
            if pd.api.types.is_numeric_dtype(new_df[col]):
                # The rule is text; filling a numeric column with the string
                # "0" would silently turn it into a mixed object column.
                try:
                    fill_value = pd.to_numeric(arg)
                except (ValueError, TypeError):
                    # Not a number after all: fall back to the raw string.
                    fill_value = arg
            new_df[col] = new_df[col].fillna(fill_value)
        elif kind == "method":
            # fillna(method=...) is deprecated in recent pandas; the
            # dedicated methods behave the same way.
            if arg in ("ffill", "pad"):
                new_df[col] = new_df[col].ffill()
            elif arg in ("bfill", "backfill"):
                new_df[col] = new_df[col].bfill()
            else:
                raise ValueError(
                    f"unknown fill method {arg!r} for column {col!r}"
                )
        else:
            raise ValueError(
                f"unknown fill kind {kind!r} for column {col!r}; "
                "expected 'value' or 'method'"
            )
    return new_df.reset_index()
            

# Per-column gap-filling rules in the "kind:arg" form that `reindex` parses:
# "method:ffill" carries the previous row's value forward, "value:0" fills
# the gap with a constant.
fill_dict = {
    column: rule
    for column, rule in (
        ("store_nbr", "method:ffill"),
        ("item_nbr", "method:ffill"),
        ("unit_sales", "value:0"),
        ("onpromotion", "value:0"),
        ("family", "method:ffill"),
        ("class", "method:ffill"),
        ("perishable", "method:ffill"),
        ("city", "method:ffill"),
        ("state", "method:ffill"),
        ("type", "method:ffill"),
        ("cluster", "method:ffill"),
    )
}

# Reindex every (store, item) series independently so that each one gets its
# own daily date range.
dataset_train = (
    dataset_train
    .groupby(["store_nbr", "item_nbr"], as_index=False)
    .apply(
        lambda group: reindex(
            group,
            date_column="date",
            sales_column="unit_sales",
            fill_dict=fill_dict,
        )
    )
)

In [8]:
# `reindex` finishes with reset_index(), which leaves the dates in a column
# called "index"; restore the original column name.
dataset_train = dataset_train.rename({"index": "date"}, axis="columns")

In [9]:
# Persist the reindexed frame; index=False keeps the row index out of the file.
dataset_train.to_csv(
    path_or_buf="../../data/intermediante_files/raw_dataset_reindexed.csv",
    index=False,
)