# Retail Data Analytics
<div style="color:#00381c;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 5px;
            border-color:#00381c;
           background-color:#f7e1cd;
           letter-spacing:0.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h3 style = "line-height:1.3;">This notebook deals with retail sales time-series data. I am using the <a style = "color:#0b70db;" href = "https://www.kaggle.com/manjeetsingh/retaildataset">Retail sales data</a> set to play around with.</h3>
<h2><b>Context</b></h2>
<h3 style = "line-height:1.3;">The Challenge - One challenge of modeling retail data is the need to make decisions based on limited history. Holidays and select major events come once a year, and so does the chance to see how strategic decisions impacted the bottom line. In addition, markdowns are known to affect sales – the challenge is to predict which departments will be affected and to what extent.</h3>
<h2><b>Content</b></h2>
<h3 style = "line-height:1.3;">You are provided with historical sales data for 45 stores located in different regions - each store contains a number of departments. The company also runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of which are the Super Bowl, Labor Day, Thanksgiving, and Christmas. The weeks including these holidays are weighted five times higher in the evaluation than non-holiday weeks.</h3>
</div>

In [None]:
#Import necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
sns.set_palette("tab10")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
# Apply the seaborn "notebook" matplotlib style sheet.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever spelling this installation provides.
# If neither is available, the current style is left untouched.
for _style_name in ('seaborn-v0_8-notebook', 'seaborn-notebook'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Notebook-wide font sizes and default figure size for every matplotlib figure below.
params = {'legend.fontsize': 15,
          'legend.title_fontsize': 16,
          'figure.figsize': (15, 5),
          'axes.labelsize': 18,
          'axes.titlesize': 20,
          'xtick.labelsize': 18,
          'ytick.labelsize': 18}
plt.rcParams.update(params)

# Loading the data:

In [None]:
# Loading the data into pandas dataframes for EDA
df_stores = pd.read_csv('../input/retaildataset/stores data-set.csv')
df_features = pd.read_csv('../input/retaildataset/Features data set.csv')
df_sales = pd.read_csv('../input/retaildataset/sales data-set.csv')

# NOTE(review): the Date columns are assumed to be written day-first (dd/mm/yyyy) —
# confirm against the CSV files. Without dayfirst=True an ambiguous value such as
# 05/02/2010 is read month-first while 19/02/2010 can only be read day-first, which
# either scrambles the timeline or (on recent pandas) fails to parse.
df_features['Date'] = pd.to_datetime(df_features['Date'], dayfirst=True)
df_sales['Date'] = pd.to_datetime(df_sales['Date'], dayfirst=True)

<div style="color:#00381c;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 5px;
            border-color:#00381c;
           background-color:#f7e1cd;
           letter-spacing:0.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h2 style = "line-height:1.3;"><b>sales data-set.csv:</b></h2>
    <h3>Anonymized information about the 45 stores, indicating the type and size of store</h3>
</div>

In [None]:
# First rows, column dtypes / non-null counts, and transposed summary statistics of the sales frame.
df_sales.head()
df_sales.info()
df_sales.describe().T

<div style="color:#00381c;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 5px;
            border-color:#00381c;
           background-color:#f7e1cd;
           letter-spacing:0.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h2 style = "line-height:1.3;"><b>stores data-set.csv:</b></h2>
    <h3 style = "line-height:1.3;">
Historical sales data, which covers 2010-02-05 to 2012-11-01. Within this tab you will find the following fields:
<ul>
<li>Store - the store number</li>
<li>Dept - the department number</li>
<li>Date - the week</li>
<li>Weekly_Sales -  sales for the given department in the given store</li>
<li>IsHoliday - whether the week is a special holiday week</li>
        </ul>
    </h3>
</div>

In [None]:
# First rows, column dtypes / non-null counts, and transposed summary statistics of the stores frame.
df_stores.head()
df_stores.info()
df_stores.describe().T

# Merging the data into a single Dataframe:

In [None]:
# Build the analysis frame in one pass: attach the store attributes to the features
# (joined on 'Store'), join the sales rows on Store/Date/IsHoliday, and replace
# missing values with 0.
# The loaded frames are not reassigned, so re-running this cell gives the same `df`
# instead of merging the store columns into df_features a second time.
df = (
    df_features
    .merge(df_stores, on='Store')
    .merge(df_sales, on=['Store', 'Date', 'IsHoliday'])
    .fillna(0)
)

<div style="color:#00381c;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 5px;
            border-color:#00381c;
           background-color:#f7e1cd;
           letter-spacing:0.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h2 style = "line-height:1.3;"><b>Features data-set.csv :</b></h2>
    <h3 style = "line-height:1.3em">
        Contains additional data related to the store, department, and regional activity for the given dates.
<ul><li>Store - the store number</li>
<li>Date - the week</li>
<li>Temperature - average temperature in the region</li>
<li>Fuel_Price - cost of fuel in the region</li>
<li>MarkDown1-5 - anonymized data related to promotional markdowns. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA</li>
<li>CPI - the consumer price index</li>
<li>Unemployment - the unemployment rate</li>
<li>IsHoliday - whether the week is a special holiday week</li></ul></h3>
</div>

In [None]:
# First rows, column dtypes / non-null counts, and transposed summary statistics of the merged frame.
df.head()
df.info()
df.describe().T
# Reorder the rows by Date (ascending); the original index labels are kept.
df = df.sort_values(by = ['Date'])

In [None]:
# Split Date into calendar parts using the vectorised .dt accessors rather than
# formatting every timestamp to a string and slicing it row by row.
# Year / Month / Day are integers; Year-Month is a 'YYYY-MM' string used as a
# categorical axis in the monthly plots.
df['Year'] = df['Date'].dt.year.astype('int64')
df['Month'] = df['Date'].dt.month.astype('int64')
df['Year-Month'] = df['Date'].dt.strftime('%Y-%m')
df['Day'] = df['Date'].dt.day.astype('int64')

# Exploratory Data analysis:

In [None]:
# Running counter used to number the PNG files saved by the plotting cells.
plot_no = 1

# Fuel price per 'Year-Month' bucket, drawn with seaborn's default aggregation.
fig, ax = plt.subplots(figsize=(20, 10))
_ = plt.xticks(rotation=60)
_ = sns.lineplot(data=df, x='Year-Month', y='Fuel_Price', ax=ax)
_ = ax.set_title('LinePlot showing the change in fuel price over the span of 3 years', fontsize=20)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
r = 5  # bin width: each temperature is rounded down to a multiple of r
# Vectorised floor-to-bin. Column assignment aligns on the index, so no prior sort
# of the frame is needed and the row order of df is untouched.
df['Temperature_r'] = df['Temperature'] - df['Temperature'] % r

In [None]:
# Mean fuel price per holiday flag, split by store type, with each bar labelled.
fig, ax = plt.subplots(figsize=(20, 10))
_ = ax.set_ylim(3.1, 3.45)
plots = sns.barplot(data=df, x='IsHoliday', y='Fuel_Price', hue='Type', ax=ax)
_ = ax.set_title('BarPlot showing the change in fuel price with respect the type of the store with holidays grouped')
for bar in plots.patches:
    height = bar.get_height()
    # Place the label halfway between the lower y-limit (3.1) and the bar top.
    x_mid = bar.get_x() + bar.get_width() / 2
    y_mid = height - (height - 3.1) / 2
    _ = plots.annotate(
        format(height, '.2f'),
        (x_mid, y_mid),
        ha='center', va='center', size=15,
        xytext=(0, 0), textcoords='offset points',
        bbox=dict(boxstyle="round4,pad=0.6", fc="w", ec="black", lw=2),
    )
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Fuel price per store type, one line per holiday flag, with a 68% confidence band.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.lineplot(
    data=df, x='Type', y='Fuel_Price',
    hue='IsHoliday', style='IsHoliday', markers=True, ci=68, ax=ax,
)
_ = ax.set_title('LinePlot showing the change in fuel price with respect the type of the store')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

<div style="color:#001724;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 15px;
            border-color:#001724;
           background-color:#d0e7f5;
           letter-spacing:1.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h3 style = "line-height:1.3;">There is a significant increase in the fuel price for the type B stores, and fuel prices were comparatively much lower during holiday weeks.</h3>
</div>

In [None]:
# Fuel price against the binned temperature, one line per holiday flag.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.lineplot(
    data=df, x='Temperature_r', y='Fuel_Price',
    hue='IsHoliday', style='IsHoliday', markers=True, ci=68, ax=ax,
)
_ = ax.set_xlabel('Temperature range')
_ = ax.set_title('Lineplot showing the change in fuel price with respect to the change in temperature')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# CPI against the binned temperature, one line per store type.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.lineplot(
    data=df, x='Temperature_r', y='CPI',
    hue='Type', style='Type', markers=True, ci=68, ax=ax,
)
_ = ax.set_title('Lineplot showing the change in CPI with respect to the change in temperature')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

<div style="color:#001724;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 15px;
            border-color:#001724;
           background-color:#d0e7f5;
           letter-spacing:1.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h3 style = "line-height:1.3;">The fuel price rises with temperature: steadily during non-holiday weeks and unevenly during holiday weeks.</h3>
</div>

In [None]:
# Fuel price over the raw Date axis.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.lineplot(data=df, x='Date', y='Fuel_Price', ax=ax)
_ = ax.set_title('Lineplot showing the change in fuel price in each month over the span of 3 years')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Number of rows per year, split by month.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.countplot(data=df, x='Year', hue='Month', ax=ax)
_ = ax.set_title('Barplot showing the observation counts for each recorded month')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

<div style="color:#001724;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 15px;
            border-color:#001724;
           background-color:#d0e7f5;
           letter-spacing:1.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h3 style = "line-height:1.3;">There is no significant pattern in how the data points are spread across the months in the dataset. However, one noticeable cue is that no sales data was recorded during the month of September in 2013.</h3>
</div>

In [None]:
# Mean fuel price per store type, with the value written above each bar.
fig, ax = plt.subplots(figsize=(20, 10))
plots = sns.barplot(data=df, x='Type', y='Fuel_Price', ax=ax)
for bar in plots.patches:
    height = bar.get_height()
    x_mid = bar.get_x() + bar.get_width() / 2
    # Anchor at the bar top and shift the text 23 points upwards.
    _ = plots.annotate(
        format(height, '.2f'),
        (x_mid, height),
        ha='center', va='center', size=15,
        xytext=(0, 23), textcoords='offset points',
    )
_ = ax.set_ylim(2.5, 3.5)
_ = ax.set_title('Barplot showing the change in Fuel price with respect to the type of the store')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Rolling mean / standard deviation over a fixed number of consecutive rows of the
# Date-indexed frame.
# Only numeric and boolean columns are kept: the frame also holds text columns, and
# recent pandas raises on non-numeric columns in rolling aggregations instead of
# silently dropping them.
rolling_window = 2948  # rows per window; NOTE(review): presumably about one week of rows — confirm
_rolling = (
    df.set_index('Date')
    .select_dtypes(include=['number', 'bool'])
    .rolling(window=rolling_window)
)
df_rolled_mean = _rolling.mean().reset_index()
df_rolled_std = _rolling.std().reset_index()

In [None]:

# Weekly sales per 'Year-Month' bucket with a very narrow (1%) confidence band.
fig, ax = plt.subplots(figsize=(20, 10))
_ = sns.lineplot(data=df, x='Year-Month', y='Weekly_Sales', ci=1, ax=ax)
_ = plt.xticks(rotation=60)
_ = ax.set_title('Lineplot showing the change in Weekly_Sales in each month over the span of 3 years')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

<div style="color:#001724;
           display:fill;
           border-radius:50px;
            border-style: solid;
            padding: 25px 25px;
           border-width: 15px;
            border-color:#001724;
           background-color:#d0e7f5;
           letter-spacing:1.1px;
            font-family:'Futura';
            line-height: 1.7em;
            font-size:1.5em;">
    <h3 style = "line-height:1.3;">There was a peak at the end of 2010 and 2011, but not at the end of 2012. This might be due to the comparatively few observations during the last 2 months of 2012.</h3>
</div>

In [None]:
# One stacked subplot per external feature, all sharing the Date axis.
feature_columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                   'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
_ = df[['Date'] + feature_columns].plot(x='Date', subplots=True, figsize=(20, 15))
plt.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Total (summed, despite the variable name) weekly sales per Date across all rows,
# plus the same table ordered from the highest total to the lowest.
df_average_sales_week = df.groupby('Date', as_index=False)['Weekly_Sales'].sum()
df_average_sales = df_average_sales_week.sort_values(by='Weekly_Sales', ascending=False)

fig, ax = plt.subplots(figsize=(20, 8))
_ = ax.plot(df_average_sales_week['Date'], df_average_sales_week['Weekly_Sales'])
_ = ax.set_title('Data spread of total weekly sales volume of the retail chain')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('Weekly Sales')
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

# Top Stores

In [None]:
# Date-indexed weekly totals; this is the series used in the forecasting cells.
ts = df_average_sales_week.set_index('Date')

# Total sales per store type, best-selling type first.
df_top_stores = df.groupby('Type', as_index=False)['Weekly_Sales'].sum()
df_top_stores.sort_values(by='Weekly_Sales', ascending=False)

In [None]:
# Total sales per store; show the three stores with the highest totals.
df_top_stores = df.groupby('Store', as_index=False)['Weekly_Sales'].sum()
df_top_stores.sort_values(by='Weekly_Sales', ascending=False).head(3)

# Forecasting

In [None]:
from statsmodels.graphics.tsaplots import acf, pacf, plot_acf, plot_pacf

# Autocorrelation (left) and partial autocorrelation (right) of the weekly totals, up to lag 64.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
ax_acf, ax_pacf = axes
_ = plot_acf(ts, lags=64, ax=ax_acf)
_ = plot_pacf(ts, lags=64, ax=ax_pacf)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
from sklearn.linear_model import LinearRegression

def fit_ar_model(ts, orders):
    """Fit a linear autoregressive model on selected lags of a series.

    Row ``i`` of the design matrix holds ``ts[i - lag]`` for every ``lag`` in
    ``orders``; rows with ``i < max(orders)`` have no complete history and are
    excluded from the fit, as are rows whose first lag value is NaN.

    Parameters
    ----------
    ts : pandas Series or single-column DataFrame
        The series to model (only the first column of a DataFrame is used for lags).
    orders : int or sequence of int
        Lags to use as regressors. Lists, arrays and a single lag are all accepted.

    Returns
    -------
    (coef, intercept)
        ``coef_`` and ``intercept_`` of the fitted ``LinearRegression``. Their
        shapes follow the shape of ``ts.values`` used as the target.

    Side effects
    ------------
    Prints the coefficients, the intercept and the regressor's ``score`` on the
    training rows.
    """
    orders = np.atleast_1d(orders)
    series = np.asarray(ts.values, dtype=float)
    if series.ndim > 1:
        series = series[:, 0]

    # NaN-filled design matrix; only rows with a full lag history are populated.
    X = np.full((len(series), len(orders)), np.nan)
    rows = np.arange(int(orders.max()), len(series))
    X[rows] = series[rows[:, None] - orders]

    mask = ~np.isnan(X[:, 0])
    Y = ts.values
    lin_reg = LinearRegression()
    lin_reg.fit(X[mask], Y[mask])
    print(lin_reg.coef_, lin_reg.intercept_)
    print('Score factor: %.2f' % lin_reg.score(X[mask], Y[mask]))
    return lin_reg.coef_, lin_reg.intercept_
    
def predict_ar_model(ts, orders, coef, intercept):
    """Compute in-sample one-step predictions of a lag-based linear AR model.

    For each position ``i >= max(orders)`` the prediction is
    ``sum(coef * ts[i - orders]) + intercept``; earlier positions are NaN.

    Parameters
    ----------
    ts : pandas Series or single-column DataFrame
        The observed series.
    orders : int or sequence of int
        Lags used by the model, in the same order as ``coef``.
    coef : array-like
        One weight per lag; any shape that flattens to ``len(orders)`` values
        (e.g. ``(k,)`` or ``(1, k)``).
    intercept : float or one-element array
        Model intercept.

    Returns
    -------
    numpy.ndarray
        1-D float array with ``len(ts)`` entries.
    """
    orders = np.atleast_1d(orders)
    # Flatten the parameters so every prediction is a plain scalar; mixing
    # 1-element arrays with NaN scalars would give a ragged (invalid) array.
    weights = np.ravel(coef).astype(float)
    bias = float(np.ravel(intercept)[0])

    series = np.asarray(ts.values, dtype=float)
    if series.ndim > 1:
        series = series[:, 0]

    predictions = np.full(len(series), np.nan)
    if len(series):
        rows = np.arange(int(orders.max()), len(series))
        predictions[rows] = series[rows[:, None] - orders] @ weights + bias
    return predictions

In [None]:
# AR model on the chain-wide weekly totals using lags 1, 6 and 52.
orders = np.array([1, 6, 52])
coef, intercept = fit_ar_model(ts, orders)
ar_predictions = predict_ar_model(ts, orders, coef, intercept)
pred = pd.DataFrame(index=ts.index, data=ar_predictions)

# Observations as dots, model output as a line.
fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(ts, 'o')
_ = ax.plot(pred)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Relative error of the AR prediction: (actual - predicted) / actual.
actual = ts['Weekly_Sales']
diff = (actual - pred[0]) / actual

print('AR Residuals: avg %.2f, std %.2f' % (diff.mean(), diff.std()))

fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(diff, c='orange')
_ = ax.grid()
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Weekly sales of store 20 summed over its rows, as a Date-indexed single-column frame.
# A boolean row filter is used instead of where() + dropna(): that pair first blanks
# every non-matching row across the whole frame and would also silently drop any
# store-20 row that happened to contain a missing value.
df20 = (
    df.loc[df['Store'] == 20]
    .groupby(by=['Date'])[['Weekly_Sales']]
    .sum()
)
df20.head()

In [None]:
# Weekly sales of store 20 over time.
fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(df20.index, df20.values)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Autocorrelation (left) and partial autocorrelation (right) of store 20's weekly
# sales up to lag 21, with 95% confidence bands (alpha=0.05).
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
ax_acf, ax_pacf = axes
store20_values = df20.values
_ = plot_acf(store20_values, lags=21, alpha=0.05, ax=ax_acf)
_ = plot_pacf(store20_values, lags=21, alpha=0.05, ax=ax_pacf)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# AR model on store 20's weekly sales using lags 1, 6, 29, 46 and 52.
orders = np.array([1, 6, 29, 46, 52])
coef, intercept = fit_ar_model(df20, orders)
ar_predictions = predict_ar_model(df20, orders, coef, intercept)
pred = pd.DataFrame(index=df20.index, data=ar_predictions)

# Observations as dots, model output as a line.
fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(df20, 'o')
_ = ax.plot(pred)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Relative error of the store-20 AR prediction: (actual - predicted) / actual.
actual = df20['Weekly_Sales']
diff = (actual - pred[0]) / actual

print('AR Residuals: avg %.2f, std %.2f' % (diff.mean(), diff.std()))

fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(diff, c='orange')
_ = ax.grid()
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Per-Date mean of the external variables for store 20, indexed by Date.
# A boolean row filter replaces where() + dropna(), which blanks the whole frame
# first and would silently drop store-20 rows containing any missing value.
dfext = (
    df.loc[df['Store'] == 20]
    .groupby(by=['Date'])[['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                           'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']]
    .mean()
)
dfext.head()

In [None]:
# Next row's sales (shift by -1) aligned on Date; the last row has no successor and becomes NaN.
next_week_sales = df20['Weekly_Sales'].shift(-1)
dfext['shifted_sales'] = next_week_sales
dfext.head()

In [None]:
# Pairwise correlations between the external variables and the shifted sales.
corr = dfext.corr()
axis_labels = corr.columns.values
_ = plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=True, fmt=".3f",
            xticklabels=axis_labels, yticklabels=axis_labels);
plt.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Correlation of every column with 'shifted_sales', highest first.
corr['shifted_sales'].sort_values(ascending=False)

In [None]:
def fit_ar_model_ext(ts, orders, ext, fitter=None):
    """Fit a lag-based AR model extended with external regressors.

    The design matrix is the lagged values of ``ts`` (one column per entry of
    ``orders``) followed by the columns of ``ext``. Rows with
    ``i < max(orders)`` have no complete lag history and are excluded from the
    fit, as are rows whose first lag value is NaN.

    Parameters
    ----------
    ts : pandas Series or single-column DataFrame
        The series to model.
    orders : int or sequence of int
        Lags to use as regressors.
    ext : pandas DataFrame
        External regressors with one row per row of ``ts``.
    fitter : estimator, optional
        Object exposing ``fit``, ``score``, ``coef_`` and ``intercept_``.
        Defaults to a fresh ``LinearRegression`` per call, so fitted state is
        never shared between calls through a default argument.

    Returns
    -------
    (coef, intercept)
        ``coef_`` and ``intercept_`` of the fitted estimator; the first
        ``len(orders)`` coefficients belong to the lags, the rest to ``ext``.

    Side effects
    ------------
    Prints the coefficients, the intercept and the estimator's ``score`` on
    the training rows.
    """
    if fitter is None:
        fitter = LinearRegression()

    orders = np.atleast_1d(orders)
    series = np.asarray(ts.values, dtype=float)
    if series.ndim > 1:
        series = series[:, 0]

    # NaN-filled lag matrix; only rows with a full lag history are populated.
    lagged = np.full((len(series), len(orders)), np.nan)
    rows = np.arange(int(orders.max()), len(series))
    lagged[rows] = series[rows[:, None] - orders]

    X = np.append(lagged, ext.values, axis=1)
    mask = ~np.isnan(X[:, 0])
    Y = ts.values
    fitter.fit(X[mask], Y[mask].ravel())
    print(fitter.coef_, fitter.intercept_)
    print('Score factor: %.2f' % fitter.score(X[mask], Y[mask]))
    return fitter.coef_, fitter.intercept_
    
def predict_ar_model_ext(ts, orders, ext, coef, intercept):
    """Compute in-sample predictions of the AR model with external regressors.

    Rebuilds the design matrix (lagged ``ts`` values followed by the columns of
    ``ext``) and returns ``X · coefᵀ + intercept``. Positions with
    ``i < max(orders)`` have no complete lag history and yield NaN.

    Parameters
    ----------
    ts : pandas Series or single-column DataFrame
        The observed series.
    orders : int or sequence of int
        Lags used by the model (lists, arrays and a single lag are accepted).
    ext : pandas DataFrame
        External regressors with one row per row of ``ts``.
    coef : array-like
        Coefficients for the lag columns followed by the ``ext`` columns.
    intercept : float or array-like
        Model intercept.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``ts``.
    """
    orders = np.atleast_1d(orders)
    series = np.asarray(ts.values, dtype=float)
    if series.ndim > 1:
        series = series[:, 0]

    # NaN-filled lag matrix; only rows with a full lag history are populated.
    lagged = np.full((len(series), len(orders)), np.nan)
    rows = np.arange(int(orders.max()), len(series))
    lagged[rows] = series[rows[:, None] - orders]

    X = np.append(lagged, ext.values, axis=1)
    return np.array(np.dot(X, np.asarray(coef).T) + intercept)

In [None]:
# External regressors passed to the extended model ('shifted_sales' is left out).
ext_columns = ['Unemployment','Fuel_Price','CPI','Temperature',
               'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
dfexte = dfext[ext_columns]

# Same lags as the plain store-20 AR model, now with the external variables appended.
orders = np.array([1, 6, 29, 46, 52])
coef, intercept = fit_ar_model_ext(df20, orders, dfexte)
ext_predictions = predict_ar_model_ext(df20, orders, dfexte, coef, intercept)
pred_ext = pd.DataFrame(index=df20.index, data=ext_predictions)

# Observations as dots, then the plain AR line and the extended AR line.
fig, ax = plt.subplots(figsize=(20, 5))
_ = ax.plot(df20, 'o')
_ = ax.plot(pred)
_ = ax.plot(pred_ext)
fig.savefig(f'{plot_no}_plot.png')
plot_no += 1

In [None]:
# Relative errors, (actual - predicted) / actual, of the plain AR model and of the
# AR model with external variables, both for store 20.
actual = df20['Weekly_Sales']
diff = (actual - pred[0]) / actual
diff_ext = (actual - pred_ext[0]) / actual

print('AR Residuals: avg %.2f, std %.2f' % (diff.mean(), diff.std()))
# Label spelling corrected ("wiht" -> "with").
print('AR with Ext Residuals: avg %.2f, std %.2f' % (diff_ext.mean(), diff_ext.std()))

_ = plt.figure(figsize=(20,5))
_ = plt.plot(diff, c='orange', label='w/o external variables')
_ = plt.plot(diff_ext, c='green', label='w/ external variables')
_ = plt.legend()
_ = plt.grid()
plt.savefig(str(plot_no)+'_plot.png')
plot_no +=1