# Read in Dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, tqdm_notebook

import warnings 
warnings.simplefilter('ignore')

np.set_printoptions(suppress=True) # supress scientific notation

In [None]:
def date_parser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime`` (raises ValueError otherwise)."""
    iso_day_format = '%Y-%m-%d'
    parsed = datetime.strptime(x, iso_day_format)
    return parsed

# Load the raw training CSV; the 'Date' column is parsed with date_parser (strict YYYY-MM-DD).
# NOTE(review): `date_parser=` is deprecated in pandas >= 2.0 in favour of `date_format=`
# -- confirm the pandas version this notebook targets before changing it.
data = pd.read_csv('../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/train.csv',
            header=0,
            date_parser=date_parser,
            parse_dates=['Date'])

# Rename the five columns positionally to the snake_case names used in the rest of the notebook.
# NOTE(review): this relies on the column order of the CSV -- verify against the file.
data.columns = ['store','dept','date','weekly_sales','is_holiday']

In [2]:
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    return display(rendered)

In [None]:
# Show a rendered heading followed by the first rows of the raw data.
printmd('**Preview of the Raw Dataset.**')
display(data.head())

In [None]:
# Join store and dept into a single column -> unique identifier per (store, dept) pair.
# Vectorised string concatenation avoids one Python-level call per row.
data['store_dept'] = data['store'].astype(str) + '_' + data['dept'].astype(str)

In [None]:
# original data
pd.to_pickle(data, '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data.pickle')

In [3]:
data = pd.read_pickle('../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data.pickle')

# Data Visualization

## Sales Aggregated on Store

Total sales differ from store to store: some stores have consistently higher sales while others have consistently lower sales, which probably relates to store size. Nonetheless, looking at the sales trend, all stores seem to follow a quite consistent/similar pattern.

In [None]:
# Preprocess data: long to wide on 'store'.
# Weekly sales are summed over all departments per (store, date); unstacking
# 'store' then gives one row per date and one column per store.
wide_store = data.groupby(['store','date'])['weekly_sales'].sum().unstack(['store'])

printmd('**Preview of Wide Store:**')
display(wide_store.head(n=2))

In [None]:
pd.to_pickle(wide_store, 
             '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_store.pickle')

In [None]:
wide_store = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_store.pickle')

In [None]:
from TimeSeriesUtils import plot_yaxis_with_break
plot_yaxis_with_break(wide_store)

## Sales Aggregated on Dept

Contrary to what we observe in store sales, the sales pattern of each department seems quite different, which warrants separating the departments and investigating them individually.

In [None]:
# Preprocess data: long to wide on 'dept'.
# Weekly sales are summed over all stores per (dept, date); unstacking
# 'dept' then gives one row per date and one column per department.
wide_dept = data.groupby(['dept','date'])['weekly_sales'].sum().unstack(['dept'])

printmd('**Preview of Wide Dept:**')
display(wide_dept.head(n=2))

In [None]:
pd.to_pickle(wide_dept, 
             '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_dept.pickle')

In [None]:
wide_dept = pd.read_pickle('../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_dept.pickle')

In [None]:
from TimeSeriesUtils import plot_yaxis_with_break
plot_yaxis_with_break(wide_dept, 
                      lower_ylim=(0, 4000000), 
                      upper_ylim=(9000000, 11000000),
                      title='Weekly Sales over Dept')

## Change Rate of Sales

### Data Validation on Store

From the test below, we can be sure there are no non-consecutive weeks: no weekly sales record is missing in the store data. Only the first row is flagged, which is expected because shifting the dates leaves it without a previous week to compare against.

In [None]:
# Check whether any two consecutive rows of the store data are not exactly one week apart.
week_date = pd.Series(wide_store.index.values)
# shift(1) moves values down one row, so each row holds the date of the PREVIOUS row
# (the first row becomes NaT).
week_date_nex = week_date.shift(1)

seven_days = timedelta(days=7) # expected gap between consecutive rows
week_date_test = week_date - week_date_nex

printmd('**Row Index for Time Interval not Equal to 7 Days:**')
# The first row always shows up here because its gap is NaT.
week_date_test[week_date_test != seven_days].index

### Data Validation on Dept

From the test below, we can be sure there are no non-consecutive weeks: no weekly sales record is missing in the department data.

In [None]:
# Check whether any two consecutive rows of the dept data are not exactly one week apart.
week_date = pd.Series(wide_dept.index.values)
# shift(1) moves values down one row, so each row holds the date of the PREVIOUS row
# (the first row becomes NaT).
week_date_nex = week_date.shift(1)

seven_days = timedelta(days=7) # expected gap between consecutive rows
week_date_test = week_date - week_date_nex

printmd('**Row Index for Time Interval not Equal to 7 Days:**')
# The first row always shows up here because its gap is NaT.
week_date_test[week_date_test != seven_days].index

### Change Rate of Sales on Store

The change rate of sales on stores is quite similar among all stores.

In [None]:
# Create dataset of change rate on store each week.
# rate = (current - previous) / (previous + 1), with missing sales treated as 0.
# The +1 in the denominator avoids a division by zero when the previous week is 0.
# NOTE(review): it also slightly biases every rate and still divides by zero if the
# previous value is exactly -1 -- confirm this smoothing is intended.
wide_store_shift = wide_store.shift(1)
wide_store_rate = (wide_store.fillna(0) - wide_store_shift.fillna(0))/(wide_store_shift.fillna(0)+1)
wide_store_rate = wide_store_rate.iloc[1:,:] # drop off first row (it has no previous week)
printmd('**Preview of Change Rate of Sales on Store:**')
display(wide_store_rate.head(n=2))

In [None]:
pd.to_pickle(wide_store_rate, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_store_rate.pickle')

In [None]:
wide_store_rate = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_store_rate.pickle')

In [None]:
from TimeSeriesUtils import plot_zoom
plot_zoom(wide_store_rate, zoom=(-0.3,0.3))

### Change Rate of Sales on Dept

From the change rate of sales on departments, we can tell there are some departments having different trend and pattern to others.

In [None]:
# Create dataset of change rate on dept each week.
# rate = (current - previous) / (previous + 1), with missing sales treated as 0.
# The +1 in the denominator avoids a division by zero when the previous week is 0.
# NOTE(review): it also slightly biases every rate and still divides by zero if the
# previous value is exactly -1 -- confirm this smoothing is intended.
wide_dept_shift = wide_dept.shift(1)
wide_dept_rate = (wide_dept.fillna(0) - wide_dept_shift.fillna(0))/(wide_dept_shift.fillna(0)+1)
wide_dept_rate = wide_dept_rate.iloc[1:,:] # drop off first row (it has no previous week)
printmd('**Preview of Change Rate of Sales on Dept:**')
display(wide_dept_rate.head(n=2))

In [None]:
pd.to_pickle(wide_dept_rate, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_dept_rate.pickle')

In [None]:
wide_dept_rate = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/wide_dept_rate.pickle')

In [None]:
from TimeSeriesUtils import plot_zoom
plot_zoom(wide_dept_rate, zoom=(-5,5))

# Split Train and Test Dataset

Here I will use date `2011-12-31` as splitting point for training and testing dataset. The implementation of time series clustering is based on training dataset only.

In [None]:
# Split train and test dataset on store.
store_rate_train = wide_store_rate[wide_store_rate.index <= '2011-12-31']
store_rate_test = wide_store_rate[wide_store_rate.index > '2011-12-31']

In [None]:
pd.to_pickle(store_rate_train, 
        '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_rate_train.pickle')

pd.to_pickle(store_rate_test, 
        '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_rate_test.pickle')

In [4]:
store_rate_train = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_rate_train.pickle')
store_rate_test = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_rate_test.pickle')

In [None]:
# Split train and test dataset on dept.
dept_rate_train = wide_dept_rate[wide_dept_rate.index <= '2011-12-31']
dept_rate_test = wide_dept_rate[wide_dept_rate.index > '2011-12-31']

In [None]:
pd.to_pickle(dept_rate_train, 
        '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_rate_train.pickle')

pd.to_pickle(dept_rate_test, 
        '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_rate_test.pickle')

In [5]:
# Reload the department train/test splits from their pickle caches.
dept_rate_train = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_rate_train.pickle')
dept_rate_test = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_rate_test.pickle')
# Backward-compatible alias for the previously misspelled variable name.
detp_rate_test = dept_rate_test

# Clustering on Time Series - DTW Method

In [6]:
# Compute Dynamic Time Warping distance -> the smaller the value, the more similar the two time series are.

def array_scaler(x):
    """Scales array to 0-1 (min-max scaling)

    NaN entries are ignored when looking for the minimum/maximum and stay NaN
    in the result. A constant (or empty / all-NaN) input has no range to scale
    by, so it is returned shifted to zero instead of dividing by zero.

    Dependencies:
        import numpy as np
    Args:
        x: iterable of numbers (list, tuple, ndarray, ...); it is not modified
    returns:
        numpy float array of the same length as x, scaled to [0, 1]
    """
    arr = np.asarray(x, dtype=float)
    valid = arr[~np.isnan(arr)]
    if valid.size == 0:
        # nothing to scale: empty input or only NaN values
        return arr.copy()

    low = valid.min()
    shifted = arr - low
    span = valid.max() - low
    if span == 0:
        # constant input: every valid value equals the minimum -> all zeros
        return shifted
    return shifted / span


def dtw_distance(x, y, d=lambda x,y: abs(x-y), scaled=False, fill=False):
    """Dynamic-time-warping distance between two sequences.
    source: https://en.wikipedia.org/wiki/Dynamic_time_warping

    Dependencies:
        import numpy as np
    Args:
        x, y: arrays (indexable sequences of numbers)
        d: point-wise distance function, default is absolute difference
        scaled: boolean, pass both arrays through array_scaler first
        fill: boolean, replace NA values with 0 (np.nan_to_num) first
    returns:
        distance as float; 0.0 means the series align perfectly, there is no upper bound
    """
    if fill:
        x, y = np.nan_to_num(x), np.nan_to_num(y)
    if scaled:
        x, y = array_scaler(x), array_scaler(y)

    rows, cols = len(x) + 1, len(y) + 1

    # Accumulated-cost table. Row 0 and column 0 form an infinite border so a
    # warping path can only start from the origin cell, which costs nothing.
    acc = np.full((rows, cols), float('Inf'))
    acc[0, 0] = 0

    for r in range(1, rows):
        for c in range(1, cols):
            step = d(x[r-1], y[c-1])
            # cheapest way to reach (r, c): from above, from the left, or diagonally
            cheapest_prev = min(acc[r-1, c], acc[r, c-1], acc[r-1, c-1])
            acc[r, c] = step + cheapest_prev

    return acc[rows-1, cols-1]

## On Store Sales

In [None]:
store_rt_list = store_rate_train.to_dict('list')
store_keys = pd.Series(list(store_rt_list.keys()))

In [None]:
# Pairwise DTW distance between every two store series (NaN filled with 0, no scaling).
dtw_matrix = pd.DataFrame([
    [dtw_distance(store_rt_list[row_key], store_rt_list[col_key], scaled=False, fill=True)
     for col_key in store_keys]
    for row_key in store_keys])

# Label both axes with the store identifiers.
dtw_matrix.columns, dtw_matrix.index = store_keys, store_keys

In [None]:
# Hierarchical clustering on time series.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

In [None]:
dists_ = squareform(dtw_matrix)
dists_

# The first argument of linkage should not be the square distance matrix. 
# It must be the condensed distance matrix.
linkage_matrix = linkage(dists_, "complete")

In [None]:
plt.figure(figsize=(20,5))
dendrogram(linkage_matrix)
plt.title("Time Series Clustering")
plt.tick_params(labelsize='xx-large')
plt.show()

In [None]:
# Use built-in sklearn hierarchical clustering to make the cluster prediction.
# The DTW matrix is passed as a precomputed distance matrix.
# NOTE(review): `affinity=` was renamed to `metric=` in scikit-learn 1.2 and removed
# in 1.4 -- confirm the scikit-learn version this notebook targets.
agg_model = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage='complete')
labels = pd.Series(agg_model.fit_predict(dtw_matrix))

In [None]:
# Store the cluster label as dataframe.
store_labels = pd.DataFrame(
    {'store_label':labels,
     'store':dtw_matrix.index.values})

In [None]:
pd.to_pickle(store_labels, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_labels.pickle')

In [8]:
store_labels = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/store_labels.pickle')

## On Department Sales

In [None]:
dept_rt_list = dept_rate_train.to_dict('list')
dept_keys = pd.Series(list(dept_rt_list.keys()))

In [None]:
# Pairwise DTW distance between every two department series (NaN filled with 0, no scaling).
dtw_matrix = pd.DataFrame([
    [dtw_distance(dept_rt_list[row_key], dept_rt_list[col_key], scaled=False, fill=True)
     for col_key in dept_keys]
    for row_key in dept_keys])

# Label both axes with the department identifiers.
dtw_matrix.columns, dtw_matrix.index = dept_keys, dept_keys

In [None]:
# Find out the abnormal department.
# printmd('**Department that is so Dissimilar to other Departments:**')
# display(np.argmax(dtw_matrix.iloc[0,:], axis=1))

# Remove that department.
# dtw_matrix_sub = dtw_matrix.loc[dtw_matrix.index != 77,:]
# dtw_matrix_sub = dtw_matrix_sub.loc[:, dtw_matrix_sub.columns != 77]

In [None]:
# Hierarchical clustering on time series.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

In [None]:
dists_ = squareform(dtw_matrix)
dists_

# The first argument of linkage should not be the square distance matrix. 
# It must be the condensed distance matrix.
linkage_matrix = linkage(dists_, "complete")

In [None]:
dendrogram(linkage_matrix)
plt.title("Time Series Clustering")
plt.show()

## Identify Outliers

From the dendrogram above, we can't clearly identify the sales patterns of the major departments because of some outliers. Here, I first exclude these outliers and try to find patterns in the remaining time series. Then I cluster the outlier series on their own, so that we can better observe the patterns in both the normal and the outlier time series.

In [None]:
df = dept_rate_train.T
printmd('**Preview of df:**')
display(df.head(n=2))

In [None]:
# z-score method
from scipy import stats

def identify_outlier(df, zthreshold=3):
    """
    Flag the rows whose z-score exceeds ``zthreshold`` in at least one column.

    The z-score is computed column by column (i.e. across rows for each
    datetime). Only values ABOVE the threshold are flagged; strongly negative
    z-scores are not.

    @param
        df: rows are the individual time series, columns are the datetimes
        zthreshold: z-score cut-off, default 3
    @return
        sorted list with the index labels of the flagged rows
    """
    flagged = set()
    row_labels = df.index.values

    for col in df.columns:
        col_z = stats.zscore(df.loc[:,col])
        hit_positions = np.where(col_z > zthreshold)[0] # row numbers above the cut-off
        flagged.update(row_labels[hit_positions])

    final_set = sorted(flagged)

    print('Outliers Being Detected: {}'.format(final_set))
    print('Number of Outliers: {}'.format(len(final_set)))
    print('Number of Total Samples: {}'.format(len(df.index)))

    return final_set

In [None]:
# Get indexes of outlier rows.
final_set = identify_outlier(df)

In [None]:
# Separate good data points and outlier data points.
good_df = df.drop(final_set)
good_dept_df = good_df.T
outlier_df = df.loc[final_set,:]
outlier_dept_df = outlier_df.T

In [None]:
pd.to_pickle(good_dept_df,
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/good_dept_df.pickle')

pd.to_pickle(outlier_dept_df,
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/outlier_dept_df.pickle')

In [None]:
good_dept_df = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/good_dept_df.pickle')
outlier_dept_df = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/outlier_dept_df.pickle')

## On Dept - Outlier Excluded

In [None]:
dept_rt_list = good_dept_df.to_dict('list')
dept_keys = pd.Series(list(dept_rt_list.keys()))

In [None]:
# Pairwise DTW distance between every two non-outlier department series
# (NaN filled with 0, no scaling).
dtw_matrix = pd.DataFrame([
    [dtw_distance(dept_rt_list[row_key], dept_rt_list[col_key], scaled=False, fill=True)
     for col_key in dept_keys]
    for row_key in dept_keys])

# Label both axes with the department identifiers.
dtw_matrix.columns, dtw_matrix.index = dept_keys, dept_keys

In [None]:
# Hierarchical clustering on time series.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

In [None]:
dists_ = squareform(dtw_matrix)
dists_

# The first argument of linkage should not be the square distance matrix. 
# It must be the condensed distance matrix.
linkage_matrix = linkage(dists_, "complete")

In [None]:
plt.figure(figsize=(20,5))
dendrogram(linkage_matrix)
plt.title("Time Series Clustering")
plt.tick_params(labelsize='xx-large')
plt.show()

In [None]:
# Use built-in sklearn hierarchical clustering to make the cluster prediction.
# The DTW matrix is passed as a precomputed distance matrix.
# NOTE(review): `affinity=` was renamed to `metric=` in scikit-learn 1.2 and removed
# in 1.4 -- confirm the scikit-learn version this notebook targets.
agg_model = AgglomerativeClustering(n_clusters=4, affinity='precomputed', linkage='complete')
labels = pd.Series(agg_model.fit_predict(dtw_matrix))

In [None]:
# Store the cluster label as dataframe.
normal_dept_labels = pd.DataFrame(
    {'dept_label': labels,
     'dept':dtw_matrix.index.values})
display(normal_dept_labels.head())

In [None]:
pd.to_pickle(normal_dept_labels, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/normal_dept_labels.pickle')

In [None]:
normal_dept_labels = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/normal_dept_labels.pickle')

##  On Dept - Focus on Outlier

In [None]:
dept_rt_list = outlier_dept_df.to_dict('list')
dept_keys = pd.Series(list(dept_rt_list.keys()))

In [None]:
# Pairwise DTW distance between every two outlier department series
# (NaN filled with 0, no scaling).
dtw_matrix = pd.DataFrame([
    [dtw_distance(dept_rt_list[row_key], dept_rt_list[col_key], scaled=False, fill=True)
     for col_key in dept_keys]
    for row_key in dept_keys])

# Label both axes with the department identifiers.
dtw_matrix.columns, dtw_matrix.index = dept_keys, dept_keys

In [None]:
# Hierarchical clustering on time series.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

In [None]:
# dtw_matrix_sub = dtw_matrix.drop(6) # drop row of index 6
# dtw_matrix_sub = dtw_matrix_sub.drop(6,axis=1) # drop column of index 6

dists_ = squareform(dtw_matrix)
dists_

# The first argument of linkage should not be the square distance matrix. 
# It must be the condensed distance matrix.
linkage_matrix = linkage(dists_, "complete")

In [None]:
plt.figure(figsize=(20,5))
dendrogram(linkage_matrix)
plt.title("Time Series Clustering")
plt.tick_params(labelsize='xx-large')
plt.show()

In [None]:
# Use built-in sklearn hierarchical clustering to make the cluster prediction.
# The DTW matrix is passed as a precomputed distance matrix.
# NOTE(review): `affinity=` was renamed to `metric=` in scikit-learn 1.2 and removed
# in 1.4 -- confirm the scikit-learn version this notebook targets.
agg_model = AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='complete')
labels = pd.Series(agg_model.fit_predict(dtw_matrix))

In [None]:
# Store the cluster label as dataframe.
outlier_dept_labels = pd.DataFrame(
    {'dept_label':labels + 5,
     'dept':dtw_matrix.index.values})
display(outlier_dept_labels.head())

In [None]:
pd.to_pickle(outlier_dept_labels, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/outlier_dept_labels.pickle')

In [None]:
outlier_dept_labels = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/outlier_dept_labels.pickle')

In [None]:
# Append dept labels of both normal dataset and outlier dataset.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# ignore_index=True renumbers the rows 0..n-1 like the former reset_index(drop=True).
dept_labels = pd.concat([normal_dept_labels, outlier_dept_labels], ignore_index=True)

printmd('**Preview of Dept Labels:**')
display(dept_labels.head())

In [None]:
pd.to_pickle(dept_labels, 
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_labels.pickle')

In [None]:
dept_labels = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/dept_labels.pickle')

## Visualization on Clustering Result

In [None]:
def plot_cluster_result(df, class_df, label_col, seg_col):
    """Plot the time series of every cluster in its own subplot, stacked vertically.

    @param
        df: wide dataframe, one column per segment (e.g. store or dept), indexed by date
        class_df: dataframe mapping each segment (``seg_col``) to a cluster label (``label_col``)
        label_col: name of the cluster-label column in ``class_df``
        seg_col: name of the segment column in ``class_df``; its values are matched
            against the column names of ``df``
    """
    class_values = class_df[label_col].unique()
    num_chart = len(class_values)

    sns.set_style('whitegrid')
    # squeeze=False always returns a 2-D array of axes, so a single cluster
    # (num_chart == 1) no longer breaks the ravel() call below.
    fig, ax = plt.subplots(num_chart, 1, figsize=(20,25), sharex=True, squeeze=False)
    ax = ax.ravel()

    for i, class_value in enumerate(class_values):
        # segments belonging to this cluster -> matching columns of df
        segs = class_df.loc[class_df[label_col] == class_value, seg_col]
        sub_df = df.loc[:, df.columns.isin(segs)]
        _ = sub_df.plot(kind='line', legend=False, ax=ax[i], colormap='tab20c')

    _ = plt.tick_params(labelsize='large')
    _ = plt.xlabel('date', fontsize='xx-large')
    _ = fig.suptitle('{} Growth Rate'.format(seg_col.capitalize()), y=0.9, fontsize='xx-large')
    _ = plt.show()


In [None]:
plot_cluster_result(store_rate_train, class_df=store_labels, label_col='store_label', seg_col='store')

In [None]:
plot_cluster_result(dept_rate_train, class_df=dept_labels, label_col='dept_label', seg_col='dept')

# Feature Engineering

## Change Rate for each (Store, Dept) Pair

In [10]:
# Refer back to the original dataset.
data = pd.read_pickle('../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data.pickle')

### Store `is_holiday` in a Separate Dataframe

The task requires distinguishing holiday sales from non-holiday sales: the prediction error is weighted more heavily on holiday weeks. Thus, before doing feature engineering on the weekly sales, I save the `is_holiday` flag to a separate dataframe. I will join the holiday flag back once the data is processed and ready for use.

In [None]:
# Keep the holiday flag aside: one row per distinct (date, is_holiday) pair.
is_holiday = data.loc[:, ['date','is_holiday']]
is_holiday_unq = is_holiday.drop_duplicates() # store unique date

### Create Consecutive Time Series for each (Store, Dept) Pair

In [31]:
data = data.drop('is_holiday', axis=1)
data.head()

Unnamed: 0,store,dept,date,weekly_sales,store_dept
0,1,1,2010-02-05,24924.5,1_1
1,1,1,2010-02-12,46039.49,1_1
2,1,1,2010-02-19,41595.55,1_1
3,1,1,2010-02-26,19403.54,1_1
4,1,1,2010-03-05,21827.9,1_1


In [None]:
# Create 7 days series -> Time series of consecutive weeks for each (store, dept)
from TimeSeriesUtils import consecutive_series
days_series = consecutive_series(df=data, col='date', day_width=7)

In [None]:
# Join the (store, dept) flag to the days_series,
# so that we have the full run of consecutive weeks for each (store, dept).
from TimeSeriesUtils import cross_join_key_series

# NOTE(review): presumably returns one row per (store_dept, date) combination with
# 'store_dept' and 'date' columns -- verify against TimeSeriesUtils.
final_df = cross_join_key_series(df=data, key='store_dept', series=days_series)
# Recover the numeric ids from the '<store>_<dept>' key. They are stored as float
# here, whereas `data` was read from CSV -- the merge below joins on both.
final_df['store'] = final_df['store_dept'].apply(lambda x: float(x.split('_')[0])) # separate store
final_df['dept'] = final_df['store_dept'].apply(lambda x: float(x.split('_')[1])) # separate dept

In [None]:
# Right-join the original data onto the complete (store_dept, date) grid so every
# pair gets a row for every week; weeks without a record get weekly_sales = 0.
data_full = (
    pd.merge(data, final_df, how='right',
             on=['store','dept','store_dept','date'])
    .sort_values(['store_dept', 'date'])
    .reset_index(drop=True))
data_full['weekly_sales'] = data_full['weekly_sales'].fillna(0) # fill in 0 for any missing value
data_full.head()

## Data Validation on each (store, dept) Pair

In [None]:
# Date validation on (store, dept) pair: for every row, fetch the date of the
# previous row within the same store_dept (NaT for the first row of each pair).
data_pre = (
    data_full.groupby(['store_dept'])['date']
    .shift(1) # shift one week later
    .rename('date_lag'))
data_pre.head()

In [None]:
# Check the rows of NaT (null value for datetime).
data_pre[data_pre.isnull()][:5]

In [None]:
# Concatenate original data and date_lag.
data_full = pd.concat([data_full,data_pre], axis=1)

# Preview of the result.
print('Preview of what the data looks like on rows containing missing values.')
display(data_full[425:432])

In [None]:
# Check if exist rows with timedelta is different 7 days range.
data_full['days_lag'] = data_full['date'] - data_full['date_lag']

seven_days = timedelta(days=7) # create timedelta object
data_full['check_days_lag'] = data_full['days_lag'] != seven_days # boolean for 7 days range


# Check for rows with timedelta having different day range, excluding rows of missing values
display(data_full[~data_full['date_lag'].isnull() & data_full['check_days_lag']][:5])
print('All rows are either missing values or having 7 days range.')

In [None]:
pd.to_pickle(data_full, 
             '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data_full.pickle')

In [72]:
data_full = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data_full.pickle')

## Create Lag Sales Column

In [73]:
from TimeSeriesUtils import create_lag_sales

# sales lag 1 period
col_previous_1 = create_lag_sales(df=data_full, 
                                groupby_col=['store_dept'], 
                                shift_col='weekly_sales',
                                shift_num=1)

0         NaN
1    40212.84
2    67699.32
3    49748.33
4    33601.22
Name: weekly_sales_lag_1, dtype: float64

In [74]:
# Concatenate original data and sales_lag.
data_full = pd.concat([data_full,col_previous_1], axis=1)

In [75]:
# sales lag 2 period.
col_previous_2 = create_lag_sales(df=data_full, 
                                groupby_col=['store_dept'], 
                                shift_col='weekly_sales',
                                shift_num=2)

0         NaN
1         NaN
2    40212.84
3    67699.32
4    49748.33
Name: weekly_sales_lag_2, dtype: float64

In [76]:
# Concatenate original data and sales_lag.
data_full = pd.concat([data_full,col_previous_2], axis=1)

In [77]:
# sales lag 3 period.
col_previous_3 = create_lag_sales(df=data_full, 
                                groupby_col=['store_dept'], 
                                shift_col='weekly_sales',
                                shift_num=3)

0         NaN
1         NaN
2         NaN
3    40212.84
4    67699.32
Name: weekly_sales_lag_3, dtype: float64

In [78]:
# Concatenate original data and sales_lag.
data_full = pd.concat([data_full,col_previous_3], axis=1)

### Compute for Change Rate

Notice that `weekly_sales` contains negative values (the reason is unknown). Thus, when computing the change rate, be aware of unexpected results caused by division by zero (a zero in the denominator in particular).

In [79]:
def compute_chg(cola, colb):
    """
    Compute the change rate ``cola / colb - 1`` between two weekly sales values.

    Special cases:
        - previous value missing -> NaN
        - previous value equal to 0 -> 1 is used as denominator to avoid a
          division by zero (the result is then ``cola - 1``)

    @param:
        cola: current value (scalar)
        colb: previous value (scalar) used as denominator
    @return:
        the change rate as float, or NaN when ``colb`` is missing
    """
    if pd.isnull(colb):
        return np.nan

    denominator = 1 if colb == 0 else colb
    return cola/denominator - 1

In [80]:
# Compute sales change rate. -> target variable
data_full['sales_chg_rt'] = data_full.apply(
    lambda x: compute_chg(x['weekly_sales'],x['weekly_sales_lag_1']), axis=1)
data_full.head()

Unnamed: 0,store,dept,date,weekly_sales,store_dept,date_lag,days_lag,check_days_lag,weekly_sales_lag_1,weekly_sales_lag_2,weekly_sales_lag_3,sales_chg_rt
0,10,1,2010-02-05,40212.84,10_1,NaT,NaT,True,,,,
1,10,1,2010-02-12,67699.32,10_1,2010-02-05,7 days,False,40212.84,,,0.683525
2,10,1,2010-02-19,49748.33,10_1,2010-02-12,7 days,False,67699.32,40212.84,,-0.265158
3,10,1,2010-02-26,33601.22,10_1,2010-02-19,7 days,False,49748.33,67699.32,40212.84,-0.324576
4,10,1,2010-03-05,36572.44,10_1,2010-02-26,7 days,False,33601.22,49748.33,67699.32,0.088426


In [81]:
# Compute sales change rate. -> feature
data_full['sales_chg_rt_2'] = data_full.apply(
    lambda x: compute_chg(x['weekly_sales_lag_2'],x['weekly_sales_lag_3']), axis=1)
data_full.head()

Unnamed: 0,store,dept,date,weekly_sales,store_dept,date_lag,days_lag,check_days_lag,weekly_sales_lag_1,weekly_sales_lag_2,weekly_sales_lag_3,sales_chg_rt,sales_chg_rt_2
0,10,1,2010-02-05,40212.84,10_1,NaT,NaT,True,,,,,
1,10,1,2010-02-12,67699.32,10_1,2010-02-05,7 days,False,40212.84,,,0.683525,
2,10,1,2010-02-19,49748.33,10_1,2010-02-12,7 days,False,67699.32,40212.84,,-0.265158,
3,10,1,2010-02-26,33601.22,10_1,2010-02-19,7 days,False,49748.33,67699.32,40212.84,-0.324576,0.683525
4,10,1,2010-03-05,36572.44,10_1,2010-02-26,7 days,False,33601.22,49748.33,67699.32,0.088426,-0.265158


In [82]:
# Check for `inf` values: rows where either change-rate column is +inf or -inf.
data_full[np.isinf(data_full[['sales_chg_rt', 'sales_chg_rt_2']]).any(axis=1)]

Unnamed: 0,store,dept,date,weekly_sales,store_dept,date_lag,days_lag,check_days_lag,weekly_sales_lag_1,weekly_sales_lag_2,weekly_sales_lag_3,sales_chg_rt,sales_chg_rt_2


## Change Rate of Last Year

Let's include the change rate in the same period of last year as one of the predictive features. The same period of last year is assigned on the closest weekday(Friday) relative to the same date(Friday) this year. 

For instance, if the date last year that corresponds to the same date this year falls on a Friday, that date is used as is. If it falls on a Saturday, Sunday or Monday, the previous Friday is assigned as the corresponding date in last year. If it falls on a Tuesday, Wednesday or Thursday, the next Friday is assigned as the corresponding date in last year.

In [None]:
# Compute the relative week on last year.
def date_last_year(date, weekday):
    """Snap ``date`` to the closest Friday.

    Saturday, Sunday and Monday move back to the previous Friday (1, 2 and 3
    days); Tuesday, Wednesday and Thursday move forward to the next Friday
    (3, 2 and 1 days); a Friday is returned unchanged.

    @param:
        date: datetime-like object supporting ``+ timedelta``
        weekday: weekday number of ``date`` as given by ``date.weekday()``
            (0 = Monday ... 6 = Sunday)
    @return:
        the closest Friday, or None when ``weekday`` is not in 0..6
    """
    if weekday not in range(7):
        return None

    # Signed number of days to the closest Friday (weekday 4), in the range -3..3:
    # Mon -3, Tue +3, Wed +2, Thu +1, Fri 0, Sat -1, Sun -2.
    offset = (7 - weekday) % 7 - 3
    return date + timedelta(days=offset)


In [None]:
# Get the corresponding date of last year.
# The same calendar day one year earlier is snapped to the closest Friday by
# date_last_year(). The result is computed once per distinct date and mapped back
# onto the rows, which replaces the former per-row loop with chained assignment
# (`data_full['date_last_yr'][i] = ...`) that is slow and unreliable in pandas.
def _same_day_last_year(current):
    """Return the same month/day one year earlier (29 Feb falls back to 28 Feb)."""
    try:
        return datetime(current.year - 1, current.month, current.day)
    except ValueError: # 29th of Feb. does not exist in the previous year
        return datetime(current.year - 1, 2, 28)

date_last_yr_lookup = {}
for current in data_full['date'].dropna().drop_duplicates():
    prior = _same_day_last_year(current)
    date_last_yr_lookup[current] = date_last_year(prior, prior.weekday())

data_full['date_last_yr'] = data_full['date'].map(date_last_yr_lookup)
        


In [None]:
pd.to_pickle(data_full,
             '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data_full_2.pickle')

In [None]:
data_full_2 = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/data_full_2.pickle')

In [None]:
data_full_2.head()

In [None]:
data_full_2['date_last_yr'] = pd.to_datetime(data_full_2['date_last_yr'])

In [None]:
data_full_2.info()

In [None]:
# Join the sales change rate of last year.
# Self-join: each row is matched, within the same store_dept, to the row whose
# `date` equals this row's `date_last_yr`. The overlapping column names get the
# default pandas suffixes, which the cells below rely on:
#   date_x / sales_chg_rt_x -> current row,  date_y / sales_chg_rt_y -> last year's row.
# NOTE: the cell overwrites data_full_2, so re-running it without reloading the
# pickle will not reproduce the same columns.
data_full_2 = pd.merge(data_full_2, data_full_2[['date','store_dept','sales_chg_rt']], how='left', 
    left_on=['date_last_yr', 'store_dept'], right_on=['date','store_dept'])

data_full_2.head()

## Assign Cluster Group

In [None]:
# Assign cluster number for store.
data_full_3 = pd.merge(data_full_2, store_labels,
                     left_on='store', right_on='store', how='left')

In [None]:
# Assign cluster number for dept.
data_full_3 = pd.merge(data_full_3, dept_labels,
                     left_on='dept', right_on='dept', how='left')

## Create Datetime Feature

Split month and week of month from `date` column.

In [None]:
# Calendar features derived from the current date.
data_full_3['date_x_month'] = data_full_3['date_x'].dt.month # month
# Week of month: days 1-7 -> 1, 8-14 -> 2, ..., 29-31 -> 5. The built-in .dt
# accessor is used instead of a Python lambda applied to every row.
data_full_3['date_x_wom'] = (data_full_3['date_x'].dt.day - 1)//7 + 1 # week of month
data_full_3.head()

## Create Dummy Feature

In [None]:
# Columns to keep.
keep_col = ['store_dept','date_x','sales_chg_rt_x',
            'sales_chg_rt_2','sales_chg_rt_y',
            'store_label','dept_label',
            'date_x_month','date_x_wom']

data_full_4 = data_full_3[keep_col]

In [None]:
dummy_features = ['store_label', 'dept_label','date_x_month','date_x_wom']

for c in dummy_features:
    dummy = pd.get_dummies(data_full_4[c], prefix=c, drop_first=False)
    data_full_4 = pd.concat([data_full_4, dummy], axis=1)

drop_col = ['store_label', 'dept_label','date_x_month','date_x_wom']
data_full_4 = data_full_4.drop(drop_col, axis=1)

In [None]:
data_full_4.head()

# Create Training, Validation and Test Dataset

In [None]:
# Keep only the rows that have a value in every column.
final_data = data_full_4.dropna()

In [None]:
display(final_data.head())

In [None]:
pd.to_pickle(final_data,
            '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/final_data.pickle')

In [None]:
final_data = pd.read_pickle(
    '../Playground-dataset/03-Walmart-Store-Sales-Forecasting-Dataset/final_data.pickle')

In [None]:
# Create train, validation and test datasets for each store_dept.
# Rows up to the split date go to train; the remaining rows of each store_dept are
# cut in half by position: first half -> validation, second half -> test.
# The pieces are collected in lists and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on
# every iteration.
split_date = '2011-12-31' # match the criterion for clustering time series
train_parts, validation_parts, test_parts = [], [], []

for k, group in final_data.groupby('store_dept'):
    remain = group[group['date_x'] > split_date]
    remain_half_row = remain.shape[0] // 2
    train_parts.append(group[group['date_x'] <= split_date])
    validation_parts.append(remain.iloc[:remain_half_row,:])
    test_parts.append(remain.iloc[remain_half_row:,:])

# pd.concat raises on an empty list, so fall back to an empty frame as before.
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
validation_data = pd.concat(validation_parts) if validation_parts else pd.DataFrame()
test_data = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
train_data.head()

In [None]:
# train data
train_y = train_data[['store_dept','date_x','sales_chg_rt_x']]
train_feature = train_data.drop(['store_dept','date_x','sales_chg_rt_x'], axis=1)

In [None]:
# validation data: identifier columns plus the target, and the remaining feature columns
validation_y = validation_data[['store_dept','date_x','sales_chg_rt_x']]
valitation_y = validation_y # backward-compatible alias for the previously misspelled name
validation_feature = validation_data.drop(['store_dept','date_x','sales_chg_rt_x'], axis=1)

In [None]:
# test data
test_y = test_data[['store_dept','date_x','sales_chg_rt_x']]
test_feature = test_data.drop(['store_dept','date_x','sales_chg_rt_x'], axis=1)

In [None]:
validation_data.head()

In [None]:
train_data.tail()