In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
import seaborn as sns
import scipy.stats as stat
import pylab
%matplotlib inline

In [None]:
data=pd.read_csv('../input/walmart-sales-data/Walmart_Store_sales.csv')

In [None]:
data.head()

In [None]:
data.isna().sum()

In [None]:
data.Holiday_Flag.value_counts()

In [None]:
data.describe()

In [None]:
# Task 1: Which store has maximum sales
# Total sales per store, largest first; only the five biggest are drawn.
fig, ax = plt.subplots(figsize=(8, 6))
store_totals = data.groupby('Store')['Weekly_Sales'].sum()
top_five = store_totals.sort_values(ascending=False).head()
top_five.plot.bar(color='orange', ax=ax)
ax.set_xlabel('Store number')
ax.set_ylabel('Sales')
ax.set_title('Sales distribution according to stores', color='b')
# Store 20 has maximum sales of 3.013978e+08

In [None]:
# Task 2:Which store has maximum standard deviation i.e., the sales vary a lot. Also, find out the coefficient of mean to standard deviation
# Per-store mean and standard deviation of weekly sales, limited to the ten
# stores with the largest standard deviation.
df = data.groupby(['Store']).Weekly_Sales.agg(['mean','std']).sort_values(by='std',ascending=False).head(10)
# DataFrame.plot.bar() opens its own figure unless it is handed an Axes, so the
# figure is created explicitly and passed in; otherwise the requested size is
# applied to an empty figure and the chart is drawn at the default size.
fig, ax = plt.subplots(figsize=(20,20))
df.plot.bar(ax=ax)
ax.set_xlabel('Store number')
ax.set_ylabel('Sales')
ax.set_title('Mean and std deviation in sales in top 10 stores' , color='b')

In [None]:
df = data.groupby(['Store']).Weekly_Sales.agg(['mean','std']).sort_values(by='std',ascending=False).head(10)

In [None]:
df.head()

In [None]:
df['Coefficient of mean to std'] = df['mean']/df['std']

In [None]:
df['Coefficient of mean to std']

In [None]:
# Bar chart of the mean-to-std ratio for the ten highest-variance stores.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=df.index, y=df['Coefficient of mean to std'], palette='autumn_r')
plt.xlabel('Store number')
plt.title('Mean to std deviation in sales in top 10 stores' , color='b')

In [None]:
# Store 14 has the maximum fluctuation in sales, and its coefficient of mean
# to standard deviation was found to be 6.363

In [None]:
# Task 3: Which store/s has good quarterly growth rate in Q3’2012

In [None]:
data.Date= pd.to_datetime(data.Date)

In [None]:
df2 = data[(data.Date>='2012-07-01') & (data.Date<='2012-09-30')]

In [None]:
Q3_data=df2.groupby(['Store'])['Weekly_Sales'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(8,5))
sns.barplot(Q3_data.index, Q3_data.values, palette='ocean_r')
plt.xlabel('Store number')
plt.ylabel('Sales')
plt.title('Maximum sales in Q3 2012 for top 5 stores' , color='b')
# Stores 4,20,13,2,10 are the top 5 stores that did well in Q3'2012

In [None]:
df3= data[(data.Date>='2012-04-01') & (data.Date<='2012-06-30')]

In [None]:
# second quarter data
Q2_data=df3.groupby(['Store'])['Weekly_Sales'].sum().sort_values(ascending=False) 
Q2_data.head()

In [None]:
Q_total=pd.concat([Q2_data,Q3_data], axis=1)
Q_total.head()

In [None]:
Q_total_cols=['Q2','Q3']
Q_total.columns=Q_total_cols

In [None]:
Q_total['Sequential_Growth_Rate'] = ((Q_total.Q3-Q_total.Q2)/Q_total.Q2)*100

In [None]:
Q_total_sort=Q_total.sort_values(by='Sequential_Growth_Rate', ascending=False).head(10)
Q_total_sort.head()
# All the stores have lesser sales compared to previous quarter still 
# store 16 has least drop rate while store 4 has highest number of sales

In [None]:
plt.figure(figsize=(8,5))
sns.barplot(Q_total_sort.index, Q_total_sort.Sequential_Growth_Rate, palette='ocean_r')
plt.xlabel('Store number')
plt.ylabel('Sequential_Growth_Rate')
plt.title('Growth rate from Quarter second to thirdfor top 10 stores' , color='b')

In [None]:
# Task 4: Some holidays have a negative impact on sales. 
# Find out holidays which have higher sales than the mean sales in non-holiday season for all stores together

In [None]:
holiday_data=data[data.Holiday_Flag==1]
nonholiday_data=data[data.Holiday_Flag==0]

In [None]:
holiday_data

In [None]:
nonholiday_sales_mean=nonholiday_data.Weekly_Sales.mean()
nonholiday_sales_mean

In [None]:
# Superbowl:
superbowl_data=holiday_data[(holiday_data.Date=='2010-12-02')|(holiday_data.Date=='2011-11-02')|(holiday_data.Date=='2012-10-02')|(holiday_data.Date=='2013-08-02')]
superbowl_data.Weekly_Sales.mean()

In [None]:
# Labour Day
labour_data=holiday_data[(holiday_data.Date=='2010-10-09')|(holiday_data.Date=='2011-11-09')|(holiday_data.Date=='2012-07-09')|(holiday_data.Date=='2013-06-09')]
labour_data.Weekly_Sales.mean()

In [None]:
# Thanksgiving
thanksgiving_data=holiday_data[(holiday_data.Date=='2010-11-26')|(holiday_data.Date=='2011-11-25')|(holiday_data.Date=='2012-11-25')|(holiday_data.Date=='2013-11-29')]
thanksgiving_data.Weekly_Sales.mean()

In [None]:
# Christmas
christmas_data=holiday_data[(holiday_data.Date=='2010-12-31')|(holiday_data.Date=='2011-12-30')|(holiday_data.Date=='2012-12-28')|(holiday_data.Date=='2013-12-27')]
christmas_data.Weekly_Sales.mean()

In [None]:
# Except Christmas, all the holidays (Super Bowl, Labour Day and Thanksgiving) have sales higher than the non-holiday weeks.
# The highest weekly sales were recorded during Thanksgiving.

In [None]:
# Task 5:Provide a monthly and semester view of sales in units and give insights

In [None]:
df5=data

In [None]:
df5.head()

In [None]:
df5['month']= pd.DatetimeIndex(df5['Date']).month

In [None]:
df5.head().reset_index()

In [None]:
df5['semester']= [1 if i<=6 else 2 for i in df5['month']]
df5.head()

In [None]:
# Total sales per calendar month, summed over all stores and years.
plt.figure(figsize=(8,5))
monthly_sale=df5.groupby(['month'])['Weekly_Sales'].sum()
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=monthly_sale.index, y=monthly_sale.values, palette='autumn')
plt.ylabel('Weekly_Sales', fontsize=12, fontweight='bold')
plt.xlabel('Month', fontsize=12, fontweight='bold')
plt.title('Monthly data of sales',fontsize=12, color='blue')
# Maximum sales were obtained in month of April when there were no holidays.

In [None]:
# Total sales per semester (1 = months 1-6, 2 = months 7-12), summed over all stores.
plt.figure(figsize=(8,5))
semester_sales=df5.groupby('semester')['Weekly_Sales'].sum()
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=semester_sales.index, y=semester_sales.values, palette='autumn')
plt.ylabel('Weekly_Sales', fontsize=12, fontweight='bold')
plt.xlabel('Semester', fontsize=12, fontweight='bold')
plt.title('Semester-wise data of sales',fontsize=12, color='blue')
# Sales were more in second semester

In [None]:
# Task 6: For Store 1 – Build  prediction models to forecast demand
# Linear Regression – Utilize variables like date and restructure dates as 1 for 5 Feb 2010 (starting from the earliest date in order). 
# Hypothesize if CPI, unemployment, and fuel price have any impact on sales.
# Change dates into days by creating new variable.

In [None]:
def plot_data(model_data,feature):
    """Show the distribution of one column next to its normal Q-Q plot.

    Parameters
    ----------
    model_data : DataFrame (or any mapping indexable by ``feature``)
        Table that holds the column to inspect.
    feature : str
        Name of the column to plot.

    Draws a two-panel figure: a density histogram with a KDE curve on the
    left and a probability plot against the normal distribution on the right.
    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    values = model_data[feature]
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(12, 6))
    # histplot(kde=True, stat='density') is the documented replacement for the
    # deprecated distplot: density-scaled bars with a KDE overlay.
    sns.histplot(values, kde=True, stat='density', ax=ax_hist)
    # probplot draws onto the Axes it is given, so no pyplot/pylab global state is needed.
    stat.probplot(values, dist='norm', plot=ax_qq)
    plt.show()

In [None]:
plot_data(data,'Fuel_Price') # The dat is pairly distributed normally

In [None]:
plot_data(data,'Unemployment')

In [None]:
# Log transformation
data['Unemp_log']= np.log(data.Unemployment)
plot_data(data,'Unemp_log') # Data is normally distributed

In [None]:
plot_data(data,'CPI')

In [None]:
# Log transformation
data['CPI_log']= np.log(data.CPI)
plot_data(data,'CPI_log') # Data has some gap which cannot be filled with standard feature transformation techniques

In [None]:
data.head()

In [None]:
model_data= data[data.Store==1][['Store','Date','Fuel_Price','CPI_log','Unemp_log','Weekly_Sales']]
model_data.head()

In [None]:
model_data.Date = data[data.Store==1]['Date'].index
model_data.Date+=1
model_data.head()

In [None]:
x= model_data.drop(['Weekly_Sales','Store','Date'], axis=1)
y= model_data.iloc[:,-1]

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=5)

In [None]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x_train,y_train)

In [None]:
y_pred=model.predict(x_test)

In [None]:
model.score(x_train,y_train)

In [None]:
model.score(x_test,y_test)

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
model_data.corr()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the min/max from the training rows only, so nothing about the held-out
# rows leaks into preprocessing. Repeating the split with the same test_size and
# random_state on the same number of rows selects the same training rows as the
# split that follows on x_std.
x_fit = train_test_split(x, y, test_size=0.2, random_state=5)[0]
min_max = MinMaxScaler().fit(x_fit)
# Scale every row with the training-set parameters; keep the original row labels
# so x_std stays aligned with y.
x_std = pd.DataFrame(min_max.transform(x), columns=x.columns, index=x.index)
x_std.head()

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_std,y,test_size=0.2, random_state=5)

In [None]:
from sklearn.linear_model import LinearRegression
model_new=LinearRegression()
model_new.fit(x_train,y_train)

In [None]:
y_pred= model_new.predict(x_test)

In [None]:
model_new.score(x_test,y_test)

In [None]:
"""Conclusion: From the Pearson correlation coefficient matrix it is evident that
CPI, Unemployment and Fuel price do not affect the weekly sales significantly.
Thus developing a regression model using these variables to predict the sales
figure won't fetch the expected result, which is evident from the model score.
I attempted to use feature engineering techniques like min-max scaling to suppress
the effects of variations in the input variables, yet the model accuracy remained the same.
Thus it could be hypothesized that these variables do not affect the sales data.
"""