## IMPORTING LIBRARIES

In [177]:
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

# For saving models
import pickle 

## LOAD THE DATA AND DESCRIBE

In [178]:
store = pd.read_csv("stores.csv")
sales=pd.read_csv("sales.csv")
features=pd.read_csv("features.csv")

In [179]:
store.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [180]:
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


In [181]:
store.isnull().sum()

Store    0
Type     0
Size     0
dtype: int64

In [182]:
sales.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05/02/2010,24924.5,False
1,1,1,12/02/2010,46039.49,True
2,1,1,19/02/2010,41595.55,False
3,1,1,26/02/2010,19403.54,False
4,1,1,05/03/2010,21827.9,False


In [183]:
sales.tail()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
421565,45,98,28/09/2012,508.37,False
421566,45,98,05/10/2012,628.1,False
421567,45,98,12/10/2012,1061.02,False
421568,45,98,19/10/2012,760.01,False
421569,45,98,26/10/2012,1076.8,False


In [184]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB


In [185]:
sales.isnull().sum()

Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
dtype: int64

In [186]:
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.5,2.625,,,,,,211.350143,8.106,False


In [187]:
features.tail()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
8185,45,28/06/2013,76.05,3.639,4842.29,975.03,3.0,2449.97,3169.69,,,False
8186,45,05/07/2013,77.5,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
8187,45,12/07/2013,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
8188,45,19/07/2013,82.84,3.737,2961.49,1047.07,204.19,363.0,1059.46,,,False
8189,45,26/07/2013,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,False


In [188]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   MarkDown1     4032 non-null   float64
 5   MarkDown2     2921 non-null   float64
 6   MarkDown3     3613 non-null   float64
 7   MarkDown4     3464 non-null   float64
 8   MarkDown5     4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB


In [189]:
features.isnull().sum()

Store              0
Date               0
Temperature        0
Fuel_Price         0
MarkDown1       4158
MarkDown2       5269
MarkDown3       4577
MarkDown4       4726
MarkDown5       4140
CPI              585
Unemployment     585
IsHoliday          0
dtype: int64

In [190]:
#data type conversion
sales["Date"] = pd.to_datetime(sales["Date"],format="%d/%m/%Y")
features["Date"] = pd.to_datetime(features["Date"],format="%d/%m/%Y")

In [192]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(9), int64(1)
memory usage: 712.0 KB


In [193]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday     421570 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 13.3 MB


In [194]:
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [159]:
features.tail()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
8185,45,2013-06-28,76.05,3.639,4842.29,975.03,3.0,2449.97,3169.69,,,False
8186,45,2013-07-05,77.5,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
8187,45,2013-07-12,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
8188,45,2013-07-19,82.84,3.737,2961.49,1047.07,204.19,363.0,1059.46,,,False
8189,45,2013-07-26,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,False


In [195]:
sales.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [198]:
features.columns, sales.columns

(Index(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
        'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
        'IsHoliday'],
       dtype='object'),
 Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday'], dtype='object'))

In [218]:
df_merged=pd.merge(features, sales, on=["Store", "Date", "IsHoliday"],how="outer")
df_merged.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,1.0,24924.5
1,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,2.0,50605.27
2,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,3.0,13740.12
3,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,4.0,39954.04
4,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,5.0,32229.38


In [219]:
df_merged.tail()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales
423320,45,2013-06-28,76.05,3.639,4842.29,975.03,3.0,2449.97,3169.69,,,False,,
423321,45,2013-07-05,77.5,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False,,
423322,45,2013-07-12,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False,,
423323,45,2013-07-19,82.84,3.737,2961.49,1047.07,204.19,363.0,1059.46,,,False,,
423324,45,2013-07-26,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,False,,


In [220]:
df_merged.shape


(423325, 14)

In [223]:
df_full=pd.merge(df_merged,store, on=["Store"],how="outer")
df_full.shape

(423325, 16)

In [225]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 423325 entries, 0 to 423324
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         423325 non-null  int64         
 1   Date          423325 non-null  datetime64[ns]
 2   Temperature   423325 non-null  float64       
 3   Fuel_Price    423325 non-null  float64       
 4   MarkDown1     152433 non-null  float64       
 5   MarkDown2     112532 non-null  float64       
 6   MarkDown3     138658 non-null  float64       
 7   MarkDown4     136466 non-null  float64       
 8   MarkDown5     153187 non-null  float64       
 9   CPI           422740 non-null  float64       
 10  Unemployment  422740 non-null  float64       
 11  IsHoliday     423325 non-null  bool          
 12  Dept          421570 non-null  float64       
 13  Weekly_Sales  421570 non-null  float64       
 14  Type          423325 non-null  object        
 15  Size          423

In [284]:
df_full.duplicated().sum()

0

## EXPLORATORY DATA ANALYSIS

#### 1.Identify average customer visit in the type B store in April Months

In [229]:
#filter the type B store
typeB=df_full[(df_full['Type']=='B') & (df_full['Date'].dt.month==4)].reset_index(drop=True)
typeB


#calculate the average customer visit
average_customer_visit=round(len(typeB)/len(typeB['Store'].unique()),2)
print("Average customer visit in the type B store in April Months : ",average_customer_visit)

Average customer visit in the type B store in April Months :  945.59


#### 2. Identify best average sales in holiday week for all store types


In [230]:
#filter the holiday week(isholiday=True)
holiday_week=df_full[df_full['IsHoliday']==True].reset_index(drop=True)

#calculate the average
average_sales_by_type=holiday_week.groupby('Type')['Weekly_Sales'].mean()

#best average sales
best_store_sales=average_sales_by_type.idxmax()
best_avg_sales_in_all_stores=round(average_sales_by_type.max(),2)

print("Best average sales in all store types: ",best_avg_sales_in_all_stores)
print("Best store type: ",best_store_sales)

Best average sales in all store types:  21297.52
Best store type:  A


#### 3. Which store had a worst sales in leap year

In [231]:
#filter the leap year
leap_year=df_full[(df_full['Date'].dt.is_leap_year)].reset_index(drop=True)

#calculate the worst sales
worst_sales=leap_year.groupby('Store')['Weekly_Sales'].sum().idxmin()
worst_store_sales=leap_year.groupby('Store')['Weekly_Sales'].sum().min()

print("Worst store sales in leap year: ",worst_store_sales)
print("Worst store: ",worst_sales)

Worst store sales in leap year:  11435551.03
Worst store:  33


#### 4. What is the expected sales of each department when unemployment factor is greater than 8


In [234]:
#filter the expected sales
es=df_full[df_full["Unemployment"]>8].reset_index(drop=True)

#calculate the expected sales
expected_sales=es.groupby('Dept')['Weekly_Sales'].mean()
expected_sales_df = expected_sales.reset_index(name="Expected_Weekly_Sales")

print("Expected sales of each department when unemployment is greater than 8: ")
expected_sales_df

Expected sales of each department when unemployment is greater than 8: 


Unnamed: 0,Dept,Expected_Weekly_Sales
0,1.0,17719.577853
1,2.0,43370.044649
2,3.0,10984.784400
3,4.0,25040.236272
4,5.0,19914.917085
...,...,...
76,95.0,68200.568373
77,96.0,15487.837565
78,97.0,13024.744137
79,98.0,6225.629719


#### 5. Aggregate the net(total) sales of each department on month wise


In [235]:
# extract the month
df_full['Month']=df_full['Date'].dt.month

# aggregate the net(total) sales
net_sales=df_full.groupby(['Month','Dept'])['Weekly_Sales'].sum()
total_sales=net_sales.reset_index(name="Total_Sales")

print("Aggregate the net(total) sales of each department on month wise : ")
total_sales

Aggregate the net(total) sales of each department on month wise : 


Unnamed: 0,Month,Dept,Total_Sales
0,1,1.0,4919530.75
1,1,2.0,14344921.31
2,1,3.0,4013980.96
3,1,4.0,9132939.99
4,1,5.0,6014721.65
...,...,...,...
957,12,95.0,28788259.55
958,12,96.0,5312461.33
959,12,97.0,6035160.16
960,12,98.0,3323336.51


#### 6. Which store performs high sales in week wise

In [250]:
# Extract week number from 'Date' column
df_full["Week"]=df_full["Date"].dt.strftime("%U")

# Aggregate weekly sales
weekly_sales_by_store=df_full.groupby(["Week","Store"])["Weekly_Sales"].sum().reset_index()
store_with_highest_sales = weekly_sales_by_store.loc[weekly_sales_by_store.groupby('Week')['Weekly_Sales'].idxmax()]

print("Store with highest sales: ")
store_with_highest_sales.reset_index(drop=True, inplace=True)
store_with_highest_sales


Store with highest sales: 


Unnamed: 0,Week,Store,Weekly_Sales
0,0,1,0.0
1,1,4,3910242.34
2,2,4,3807179.07
3,3,4,3891491.7
4,4,4,3742961.36
5,5,20,6956061.74
6,6,20,6783474.32
7,7,4,6793995.99
8,8,4,6230485.7
9,9,20,6411461.09


#### 7. Identify better department performance based on the store on all the week


In [252]:
#group store and department wise sales
store_department_sales=df_full.groupby(["Store","Dept"])["Weekly_Sales"].sum()

#identify better department performance
best_dept_indices = store_department_sales.groupby(level='Store').idxmax()

best_department = store_department_sales.loc[best_dept_indices]

print("Best department performance based on the store on all the week: ")
best_department.reset_index(name="Best_Department_weekly_sales")

Best department performance based on the store on all the week: 


Unnamed: 0,Store,Dept,Best_Department_weekly_sales
0,1,92.0,19370632.64
1,2,92.0,23572153.03
2,3,38.0,15529566.07
3,4,92.0,22789210.43
4,5,38.0,7893570.24
5,6,92.0,14160545.9
6,7,72.0,6447844.96
7,8,95.0,9002059.31
8,9,38.0,11184287.78
9,10,72.0,20410926.56


#### 8. Identify the store which has minimum fuel price based on the week


In [266]:
min_fuel_price_by_store_date = df_full.groupby(['Date', 'Store'])['Fuel_Price'].min().reset_index(name="Fuel_Price")

# Find the store with the minimum fuel price based on each date
store_with_min_fuel_price = min_fuel_price_by_store_date.loc[min_fuel_price_by_store_date.groupby('Date')['Fuel_Price'].idxmin()]

print("Store with the minimum fuel price based on each week: ")
new=store_with_min_fuel_price.reset_index(drop=True)
new

Store with the minimum fuel price based on each week: 


Unnamed: 0,Date,Store,Fuel_Price
0,2010-02-05,36,2.545
1,2010-02-12,36,2.539
2,2010-02-19,36,2.472
3,2010-02-26,36,2.520
4,2010-03-05,36,2.574
...,...,...,...
177,2013-06-28,36,3.428
178,2013-07-05,4,3.385
179,2013-07-12,4,3.368
180,2013-07-19,36,3.507


#### 9. Identify overall performance of the store based on year wise

In [270]:
#extract year from date
df_full['Year'] = df_full['Date'].dt.year

#group store and date wise sales
store_date_sales=df_full.groupby(['Store','Year'])['Weekly_Sales'].sum().reset_index()

#identify overall performance based on weekly sales
store_overall_performance = store_date_sales.groupby('Store')['Weekly_Sales'].sum()
overall_performance_df = store_overall_performance.reset_index(name="Overall_Performance")

print("Overall performance of store based on year: ")
overall_performance_df

Overall performance of store based on year: 


Unnamed: 0,Store,Overall_Performance
0,1,222402800.0
1,2,275382400.0
2,3,57586740.0
3,4,299544000.0
4,5,45475690.0
5,6,223756100.0
6,7,81598280.0
7,8,129951200.0
8,9,77789220.0
9,10,271617700.0


#### 10. Identify the performance of the store on week wise with andwithout offers


In [279]:
#checking the markdown values are not nan
df_full['Offers_Present'] = ~df_full[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']].isnull().all(axis=1)

# Group by week and whether offers were present or not, then calculate the sum of weekly sales
performance_by_week_with_offers = df_full.groupby(['Date', 'Offers_Present'])['Weekly_Sales'].sum().reset_index()
performance_by_week_with_offers_1=performance_by_week_with_offers[performance_by_week_with_offers['Offers_Present'] == True].reset_index(drop=True)

# Filter out rows where offers were not present
performance_by_week_without_offers = performance_by_week_with_offers[performance_by_week_with_offers['Offers_Present'] == False]

print("Weekly sales with and without offers: ")
performance_by_week_with_offers_1

Weekly sales with and without offers: 


Unnamed: 0,Date,Offers_Present,Weekly_Sales
0,2011-11-11,True,48474224.75
1,2011-11-18,True,46438980.56
2,2011-11-25,True,66593605.26
3,2011-12-02,True,49390556.49
4,2011-12-09,True,55561147.70
...,...,...,...
85,2013-06-28,True,0.00
86,2013-07-05,True,0.00
87,2013-07-12,True,0.00
88,2013-07-19,True,0.00


In [280]:
performance_by_week_without_offers

Unnamed: 0,Date,Offers_Present,Weekly_Sales
0,2010-02-05,False,49750740.50
1,2010-02-12,False,48336677.63
2,2010-02-19,False,48276993.78
3,2010-02-26,False,43968571.13
4,2010-03-05,False,46871470.30
...,...,...,...
87,2011-10-07,False,47211688.36
88,2011-10-14,False,44374820.30
89,2011-10-21,False,45818953.44
90,2011-10-28,False,45855821.05
