# Data Analysis On Walmart Dataset

##### Requirements step (Uncomment the code below to install any missing library if you are not using a Jupyter or Anaconda installation)

In [None]:
# !pip install numpy pandas matplotlib seaborn

#### Import the libraries

In [None]:
import pandas as pd    # data preprocessing
import numpy as np     # mathematical computation
import matplotlib.pyplot as plt  # visualization
import seaborn as sns  # visualization

In [None]:
import warnings
warnings.filterwarnings('ignore')

#### Read the dataset

In [None]:
# Load the Walmart sales data; the CSV is expected in the working directory.
df = pd.read_csv('Walmart.csv')
print(type(df))  # dataframe
df.head()        # top 5 rows

##### Feature Description

Store - Store num

Date - date of sales

Weekly Sales - Weekly sales

Holiday flag  - 0(non holiday), 1(holiday)

temp - temperature on the day of sales

fuel_price - Cost of fuel in the region

CPI - Consumer Price Index

Unemployment - unemployment rate

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape
# rows = 6435, columns = 8

### Data Preprocessing

#### 1) Handling Null Values

In [None]:
# Number of missing values in each column
df.isnull().sum()

#### 2) Handle the duplicates

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

#### Inference
1) There are no duplicated records<br>
2) Data contains no null values

#### 3) Check data types

In [None]:
# Data type of every column as loaded from the CSV
df.dtypes

In [None]:
# Convert the Date column from text to datetime64.
# NOTE(review): assumes Walmart.csv stores dates day-first (DD-MM-YYYY), as the
# public Walmart dataset does - confirm against the file.
# Without dayfirst=True, ambiguous values such as 05-02-2010 are read as
# month-first (May 2nd instead of February 5th), which silently corrupts the
# Month / Quarter / Day columns derived later, and recent pandas versions
# raise on values such as 19-02-2010 that cannot be month-first.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df.dtypes  # Date should now be reported as datetime64[ns]

#### Extract Year, Quarter, Month, Month_Name, Day, Day_Name from Date field

In [None]:
# Independent copy of the frame, taken before the derived date columns are added
df1 = df.copy()

In [None]:
# Derive calendar features from the parsed Date column.
# `.dt` is the datetime accessor of a pandas Series; it is fetched once here
# and reused for every derived column.
date_parts = df['Date'].dt

# Numeric components
df['Year'] = date_parts.year
df['Quarter'] = date_parts.quarter
df['Month'] = date_parts.month
# Human-readable month name (string)
df['MonthName'] = date_parts.month_name()
df['Day'] = date_parts.day
# Human-readable weekday name (string)
df['DayName'] = date_parts.day_name()

In [None]:
# Preview the first rows, now including the derived date columns
df.head()

In [None]:
# How many columns there are of each dtype
df.dtypes.value_counts()

### Seaborn EDA

#### 1) Countplot
Counts the different categories of a field and represents them on a bar chart.

#### a) Depict Count of Holiday flag on a bar chart

In [None]:
# Number of rows for each Holiday_Flag value (index = flag, value = count)
a = df['Holiday_Flag'].value_counts()
a

In [None]:
# Plain matplotlib bar chart of the counts: flag values on x, counts on y
plt.bar(a.index, a.values)
plt.show()

In [None]:
# Same chart with seaborn: countplot counts the rows per category itself.
# palette supplies one colour per Holiday_Flag bar.
sns.countplot(x=df['Holiday_Flag'], palette=['Red', 'Green'])
plt.title('Holiday Flag Count')
plt.show()

#### Countplot for Year

In [None]:
# Passing the column as y= draws horizontal bars (one per Year)
sns.countplot(y=df['Year'])
plt.show()

#### Countplot for Year for each Holiday Flag

In [None]:
# Row count for every (Year, Holiday_Flag) combination
a2 = df.groupby(['Year', 'Holiday_Flag'])['Year'].count()
a2

In [None]:
# Row count per Year, regardless of Holiday_Flag
df['Year'].value_counts()

In [None]:
# Clustered bar chart: for each Year, one bar per Holiday_Flag value.
# The column is passed as x= because seaborn's first positional parameter is
# `data`; passing the Series positionally is deprecated/rejected by current
# seaborn releases. This also matches the keyword style used in the other
# countplot cells.
sns.countplot(x=df['Year'], hue=df['Holiday_Flag'])
# hue splits every x category into one bar per hue value (grouping)
plt.show()

In [None]:
# Reference values for the clustered chart: Year, Holiday_Flag, row count
# 2010  0               1980
#       1                180
# 2011  0               2160
#       1                180
# 2012  0               1845
#       1                 90

### Boxplot

In [None]:
# Names of the numeric columns only.
# Filtering with `dtype != 'object'` also let the datetime `Date` column
# through (and depends on strings being stored as `object`), so the selection
# is done by numeric dtype instead.
num_cols = df.select_dtypes(include='number').columns
num_cols

In [None]:
# Continuous features to inspect for spread and outliers
cols1 = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

plt.figure(figsize=(15, 11))   # width = 15, height = 11
# One boxplot per feature on a 2 x 3 grid; subplot positions are 1-based,
# hence enumerate(..., start=1).
for position, column in enumerate(cols1, start=1):
    plt.subplot(2, 3, position)        # rows = 2, cols = 3
    sns.boxplot(x=df[column])          # quartile distribution of the feature
    # Title now names the chart type actually drawn (was "Countplot").
    plt.title(f'Boxplot for {column}')
plt.show()

### Violin Plot
Combination of Boxplot and KDEplot (Kernel Density Plot)

In [None]:
# Continuous features to inspect
cols1 = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

plt.figure(figsize=(15, 11))   # width = 15, height = 11
# One violin plot per feature on a 2 x 3 grid; subplot positions are 1-based,
# hence enumerate(..., start=1).
for position, column in enumerate(cols1, start=1):
    plt.subplot(2, 3, position)        # rows = 2, cols = 3
    # Quartile summary combined with a density outline of the feature
    sns.violinplot(x=df[column])
    # Title now names the chart type actually drawn (was "Countplot").
    plt.title(f'Violinplot for {column}')
plt.show()

### Scatter Chart
To represent the correlation amongst the variables 

In [None]:
# Candidate columns for the scatter chart axes
num_cols

In [None]:
# Temperature against fuel price, with points coloured by holiday flag.
# Columns are referenced by name through data=df; axis labels and the legend
# title are taken from those column names.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x='Temperature', y='Fuel_Price', hue='Holiday_Flag')
plt.show()

### Jointplot
It is a combination of ScatterChart + Histogram.

In [None]:
# jointplot is a figure-level function: it always creates its own (square)
# figure, so a preceding plt.figure(figsize=...) only produced an extra empty
# figure. The size is controlled with `height` instead.
# kind = 'scatter', 'hex', 'kde'
sns.jointplot(x='CPI', y='Fuel_Price', data=df, kind='hex', color='maroon',
              height=10)
# Equivalent call passing the columns directly instead of data=df
# (kept as a comment so the same chart is not drawn twice):
# sns.jointplot(x=df['CPI'], y=df['Fuel_Price'], kind='hex', color='maroon',
#               height=10)
# NOTE: hue=df['Holiday_Flag'] cannot be combined with kind='hex'.
plt.show()

In [None]:
# Column names available for the exercises
df.columns

##### Q1) Problem: Create a Seaborn scatter plot to visualize the relationship between "Temperature" and "Weekly_Sales" for store 1. Add appropriate labels and a title to the plot.

In [None]:
# Rows belonging to store 1 only
s1 = df[df['Store'] == 1]
print(s1.shape)

In [None]:
# Q1 asks for store 1 only, so plot the `s1` subset (rows with Store == 1)
# instead of the full frame, and label both axes as requested.
sns.scatterplot(x=s1['Temperature'], y=s1['Weekly_Sales'])
plt.xlabel('Temperature')
plt.ylabel('Weekly Sales')
plt.title('Temp vs Weekly Sales for Store 1')
plt.show()

##### Q2) Problem:Generate a Seaborn box plot to visualize the distribution of "Weekly_Sales" for holidays(Holiday_Flag = 1) and non-holidays (Holiday_Flag = 0) across all stores. Add appropriate labels and a title to the plot.

In [None]:
# Weekly sales distribution for non-holiday (0) vs holiday (1) weeks, across
# all stores. Axis labels are set explicitly as the exercise requests.
sns.boxplot(x=df['Holiday_Flag'], y=df['Weekly_Sales'])
plt.xlabel('Holiday Flag (0 = non-holiday, 1 = holiday)')
plt.ylabel('Weekly Sales')
plt.title('Boxplot for Weekly Sales based on Holiday_Flag')
plt.show()

##### Q3) Problem: Visualize the distribution of "Fuel_Price" using a Seaborn histogram for store 2. Set the number of bins to 20. Add labels and a title to the plot.

In [None]:
# Q3 asks for store 2 only, so restrict the rows before plotting
# (named like the other per-store subsets s1 / s3 / s5).
s2 = df[df['Store'] == 2]
sns.histplot(s2['Fuel_Price'], bins=20)
plt.xlabel('Fuel Price')
plt.ylabel('Count')
plt.title('Distribution of Fuel Price for Store 2 (20 bins)')
plt.show()

##### Q4) Problem: Generate a Seaborn line plot to visualize the trend of "CPI" over time for store 3. Set the x-axis as the "Date" and the y-axis as "CPI". Add appropriate labels and a title.

In [None]:
# Column names, to look up the exact spelling of Date and CPI
df.columns

In [None]:
# Rows belonging to store 3 only
s3 = df[df['Store'] == 3]
print(s3.shape)

In [None]:
# CPI trend over time for store 3. Columns are referenced by name through
# data=s3; the axis labels come from those column names.
plt.figure(figsize=(12, 5))
sns.lineplot(data=s3, x='Date', y='CPI')
plt.title('Date vs CPI for Store 3')
plt.show()

##### Q5) Problem: Create a Seaborn bar plot to visualize the total sales ("Weekly_Sales") for each store. Sort the stores in descending order of total sales. Add appropriate labels and a title to the plot.

In [None]:
# Number of rows recorded for each store
df['Store'].value_counts()

In [None]:
# barplot aggregates y with the mean by default, so every bar is the store's
# average weekly sales, not its total. The title states that explicitly
# (it previously claimed "Total Sales").
sns.barplot(x=df['Store'], y=df['Weekly_Sales'])   # Store wise mean Sales
plt.title('Store wise Mean Weekly Sales')
plt.show()

In [None]:
# Total of Weekly_Sales per store (index = Store, value = summed sales)
q5 = df.groupby('Store')['Weekly_Sales'].sum()
q5

In [None]:
# Bar chart of per-store totals, with bars arranged from the highest total to
# the lowest.
plt.figure(figsize=(15, 6))

# Store ids ranked by their total sales, largest first
store_order = q5.sort_values(ascending=False).index

sns.barplot(x=q5.index, y=q5.values, order=store_order)
plt.title('Store wise Total Weekly Sales(Sum of Weekly Sales) in desc order of Total Weekly Sales')
plt.show()

##### Q6) Problem:Generate a Seaborn pair plot to visualize pairwise relationships between "Temperature," "Fuel_Price," "Unemployment,"and "CPI" for store 5. Use different colors for data points with and without holiday flags (Holiday_Flag = 1 and 0). Add a title to the plot.

In [None]:
# Rows belonging to store 5 only
s5 = df[df['Store'] == 5]
print(s5.shape)

In [None]:
# Pairwise relationships between the four features for store 5, with points
# coloured by Holiday_Flag.
sns.pairplot(data=s5, vars=['Temperature', 'Fuel_Price', 'Unemployment', 'CPI'],
             hue='Holiday_Flag')
# Q6 asks for a title. pairplot draws a grid of axes, so the title is set on
# the whole figure; y > 1 lifts it above the top row of subplots.
plt.suptitle('Pairplot for Store 5 by Holiday_Flag', y=1.02)
plt.show()

#### HeatMap
1) It is used to represent correlation (not as a scatter chart, but as the magnitude of the correlation).


In [None]:
# Pairwise correlation of the numeric columns.
# The frame also holds text columns (MonthName, DayName) and the datetime
# Date column; pandas >= 2.0 raises when corr() meets non-numeric data, so
# the numeric columns are selected explicitly first.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Pearson correlation (the two sums of squares in the denominator are multiplied):
# corr(x,y) = sum((xi-xmean)*(yi-ymean)) / sqrt(sum((xi-xmean)**2) * sum((yi-ymean)**2))

In [None]:
# Colour-coded view of the correlation matrix; a large figure keeps the
# per-cell numbers readable.
plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap='coolwarm')  # annot-Annotation
plt.show()

In [None]:
# Small worked example: correlation matrix of two short lists with NumPy
x = [1, 4, 7, 12, 17]
y = [3, 6, 13, 26, 21]
np.corrcoef(x, y)

# Layout of the returned 2 x 2 matrix:
# [corr(x,x)  corr(x,y)]
# [corr(y,x)  corr(y,y)]

### Stripplot
Distribution of data points for a given category.

In [None]:
# Column names available for the strip/swarm plots
df.columns

In [None]:
# One strip of Weekly_Sales points per Year; the palette lists one colour per
# year category.
plt.figure(figsize=(12, 10))
sns.stripplot(x=df['Year'], y=df['Weekly_Sales'],
              palette=['red', 'yellow', 'brown'])
plt.title('Stripplot for Year vs Weekly Sales')
plt.show()

In [None]:
# Same data as the stripplot, drawn as a swarm plot.
# NOTE(review): each year holds roughly two thousand rows; swarmplot may be
# slow at this size - confirm this is acceptable.
plt.figure(figsize=(12, 10))
sns.swarmplot(x=df['Year'], y=df['Weekly_Sales'])
plt.title('Swarmplot for Year vs Weekly Sales')
plt.show()

#### Distplot, KdePlot, Histplot

In [None]:
# Histogram of store 1's weekly sales via distplot, with the density curve off.
# NOTE: sns.distplot has been deprecated since seaborn 0.11 in favour of
# histplot / displot; the deprecation warning is hidden here by the
# warnings.filterwarnings('ignore') call near the top of the notebook.
sns.distplot(s1['Weekly_Sales'], kde=False)  # kde = kernel density estimate
plt.show()

In [None]:
# Smoothed density curve only (no histogram bars) for store 1's weekly sales
sns.kdeplot(s1['Weekly_Sales'])  # kde = kernel density estimate
plt.show()

In [None]:
# Histogram with the density curve overlaid (kde=True)
sns.histplot(s1['Weekly_Sales'], kde=True)  # kde = kernel density estimate
plt.show()

### Pairplot

1) It represents the scatterchart for all pairs of numerical features (by default).<br>
2) It represents the correlation.<br>
3) Pairplot is represented in form of square matrix. Where the primary diagonal charts are 
usually the histogram (for the same feature on both x and y axis)

In [None]:
# Size and column names of the store 1 subset
print(s1.shape)
print(s1.columns)

In [None]:
# Histogram plus density curve, then restyle the axes border.
# despine removes the spines whose argument is True: here the left and bottom
# borders are removed while the right and top borders are kept.
sns.histplot(s1['Weekly_Sales'], kde=True)  # kde = kernel density estimate
sns.despine(right=False, left=True, top=False, bottom=True)
plt.show()

In [None]:
# Mean weekly sales of store 1 per quarter (barplot averages y by default).
# x and y are passed by keyword: current seaborn accepts only `data`
# positionally, so the positional form fails there.
sns.barplot(x=s1['Quarter'], y=s1['Weekly_Sales'])
# Remove only the top border; keep the right and bottom ones
sns.despine(right=False, top=True, bottom=False)
plt.show()

-----
##### Different Types of Plots in Seaborn used in this EDA

1. countplot
2. boxplot
3. violinplot
4. jointplot
5. scatter chart
6. pairplot
7. heatmap
8. lineplot
9. barplot
10. Stripplot
11. swarmplot
12. Pairplot
13. kdeplot
14. histplot
15. distplot

----

### End Of Walmart dataset EDA

----