## **Exploratory Data Analysis - Retail**


* "Exploratory Data Analysis" was performed on the "SampleSuperstore" dataset.


# Author: Muhammet Varlı

In [None]:
# Silence warnings for the rest of the session to keep the notebook output clean.
# NOTE(review): with no category/module filter this ignores every warning,
# including library deprecation warnings -- consider narrowing the filter.
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Some Libraries Imported
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## **1. Data Read**

In [None]:
# Load the SampleSuperstore CSV into the DataFrame used by every later cell.
dataset_path = "../input/tsf-datasets/SampleSuperstore.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first few rows of the data set to get an overview of its columns.
df.head()

## **2. Recognizing the dataset**

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes).
df.info()

In [None]:
# Count the missing values in each column of the dataset
# (isna is the same check as isnull, under its other name).
df.isna().sum()

In [None]:
# Now let's look at the value frequencies of the individual features in the dataset.
# NOTE(review): the cells below also cover Quantity and Discount, which look
# numeric rather than object-typed -- confirm against the df.info() output.

In [None]:
# Number of rows for each distinct shipping mode
df["Ship Mode"].value_counts()

In [None]:
# Number of rows for each distinct customer segment
df["Segment"].value_counts()

In [None]:
# Number of rows for each distinct country
df["Country"].value_counts()

In [None]:
# Number of rows for each distinct city
df["City"].value_counts()

In [None]:
# Number of rows for each distinct state
df["State"].value_counts()

In [None]:
# Number of rows for each distinct region
df["Region"].value_counts()

In [None]:
# Number of rows for each distinct product category
df["Category"].value_counts()

In [None]:
# Number of rows for each distinct product sub-category
df["Sub-Category"].value_counts()

In [None]:
# Number of rows for each distinct quantity value
df["Quantity"].value_counts()

In [None]:
# Number of rows for each distinct discount value
df["Discount"].value_counts()

## **3. Data Visualization**

In [None]:
# 1. Region
# Bar chart of the number of rows recorded for each region.
plt.figure(figsize=(16, 8))
region_counts = df['Region'].value_counts()
region_counts.plot(kind='bar')
plt.xlabel('Region')
plt.ylabel('Count')
plt.title('Sales by Regions')
plt.show()


In [None]:
# 2. Top 20 States in Sales
# value_counts() sorts by frequency in descending order, so the first 20
# entries are the 20 states with the most rows.
plt.figure(figsize=(16, 8))
top20states = df['State'].value_counts().head(20)
top20states.plot.bar(color='red')
plt.xlabel('States')
plt.ylabel('Count')
plt.title('Top 20 States in Sales')
plt.show()
# Observation: California tops all the states.

In [None]:
# 3. Lower 30 States in Sales
# Sort the per-state row counts in ascending order and keep the first 30,
# i.e. the 30 states with the fewest rows.
plt.figure(figsize=(16, 8))
lower30states = df['State'].value_counts().sort_values(ascending=True).head(30)
lower30states.plot.bar(color='green')
plt.xlabel('States')
plt.ylabel('Count')
plt.title('Lower 30 States in Sales')
plt.show()
# Observation: Wyoming ranks lowest among all the states.

In [None]:
# 4. Top 20 City in Sales
# value_counts() sorts by frequency in descending order, so the first 20
# entries are the 20 cities with the most rows.
plt.figure(figsize=(16, 8))
top20city = df['City'].value_counts().head(20)
top20city.plot.bar(color='blue')
plt.xlabel('City')
plt.ylabel('Count')
plt.title('Top 20 City in Sales')
plt.show()
# Observation: New York City tops all the cities.

In [None]:
# 5. Ship Mode
# Bar chart of the number of rows recorded for each shipping mode.
plt.figure(figsize=(16, 8))
ship_mode_counts = df['Ship Mode'].value_counts()
ship_mode_counts.plot.bar()
plt.title('Ship Mode Wise Sales')
# The bars are value_counts() row counts, not summed sales amounts, so the
# axis is labelled 'Count' like the other frequency charts in this notebook.
plt.ylabel('Count')
plt.xlabel('Ship Modes')
plt.show()
# Observation: Standard Class is the most frequently used shipping mode.


In [None]:
# 6. Segment
# Bar chart of how the rows are distributed over the customer segments.
plt.figure(figsize=(16, 8))
segment_counts = df['Segment'].value_counts()
segment_counts.plot(kind='bar')
plt.xlabel('Segments')
plt.ylabel('Count')
plt.title('Segment Wise Sales')
plt.show()


In [None]:
# 7. Category
# Bar chart of the number of rows recorded for each product category.
plt.figure(figsize=(16, 8))
category_counts = df['Category'].value_counts()
category_counts.plot.bar()
plt.title('Category Wise Sales')
# The bars are value_counts() row counts, not summed sales amounts, so the
# axis is labelled 'Count' like the other frequency charts in this notebook.
plt.ylabel('Count')
plt.xlabel('Categories')
plt.show()
# Observation: Office Supplies is the most frequent category.

In [None]:
# 8. Sub-Category
# Bar chart of the number of rows recorded for each product sub-category.
plt.figure(figsize=(16, 8))
sub_category_counts = df['Sub-Category'].value_counts()
sub_category_counts.plot.bar()
plt.title('Sub-Category Wise Sales')
# The bars are value_counts() row counts, not summed sales amounts, so the
# axis is labelled 'Count' like the other frequency charts in this notebook.
plt.ylabel('Count')
plt.xlabel('Sub Categories')
plt.show()
# Observation: Binders is the most frequent sub-category.

In [None]:
# Let's see how the sub-categories are distributed with respect to category:
# Sub-Category goes on the x axis and the Category column is used as bar height.
plt.figure(figsize=(16, 8))
plt.bar(x='Sub-Category', height='Category', data=df, color='y')
plt.show()

## **4. Detailed graphical analysis of the Data Set with various groupby, aggregate and breakdowns**

* The distribution of the Category and Sub-Category features by total quantity sold is shown below. What stands out is that the Office Supplies category has a much higher quantity than all other categories.

In [None]:
# Quantity by product Category, Sub-category
# Total quantity per (Category, Sub-Category) pair, largest first, drawn as
# grouped bars: one group per category, one bar per sub-category.
plt.figure(figsize=(16, 8))
# .sum() replaces aggregate(np.sum): passing the NumPy callable to a groupby
# aggregation is deprecated in recent pandas, while the result is the same.
sale_category = (
    df.groupby(["Category", "Sub-Category"])["Quantity"]
    .sum()
    .reset_index()
    .sort_values("Quantity", ascending=False)
)
sns.barplot(x="Category", hue="Sub-Category", y="Quantity", data=sale_category)
plt.show()
# Observation: Binders in Office Supplies tops the list.

* Although the Office Supplies category is clearly ahead in the ranking by quantity, this is not the case in the ranking by profit, where the Technology category has the highest profit.

In [None]:
# Profit by product Category, Sub-category
# Total profit per (Category, Sub-Category) pair, largest first, drawn as
# grouped bars: one group per category, one bar per sub-category.
plt.figure(figsize=(16, 8))
# .sum() replaces aggregate(np.sum): passing the NumPy callable to a groupby
# aggregation is deprecated in recent pandas, while the result is the same.
sale_category = (
    df.groupby(["Category", "Sub-Category"])["Profit"]
    .sum()
    .reset_index()
    .sort_values("Profit", ascending=False)
)
sns.barplot(x="Category", hue="Sub-Category", y="Profit", data=sale_category)
plt.show()
# Observation: Copiers in Technology tops the list.

* Although the Phones sub-category is ahead in the ranking by sales, this is not the case in the ranking by profit, where the Copiers sub-category is the most profitable.

In [None]:
# Sales by product Category, Sub-category
# Total sales per (Category, Sub-Category) pair, largest first, drawn as
# grouped bars: one group per category, one bar per sub-category.
plt.figure(figsize=(16, 8))
# .sum() replaces aggregate(np.sum): passing the NumPy callable to a groupby
# aggregation is deprecated in recent pandas, while the result is the same.
sale_category = (
    df.groupby(["Category", "Sub-Category"])["Sales"]
    .sum()
    .reset_index()
    .sort_values("Sales", ascending=False)
)
sns.barplot(x="Category", hue="Sub-Category", y="Sales", data=sale_category)
plt.show()
# Observation: Phones in Technology tops the list.

* For example, it is observed that products in the Tables and Bookcases sub-categories of the Furniture category make a loss.
* The total profit and sales amounts of each category and sub-category in the store are shown in the chart.
* In particular, it is observed that although the total sales amounts of some products are high, the total profit obtained is low.
* Even though some products sell a lot, they are sold at a loss.

In [None]:
# Total profit and total sales for every (Category, Sub-Category) pair, side by side.
# Several columns must be selected from a GroupBy with a list: the bare
# 'Profit','Sales' tuple form was deprecated and is rejected by pandas >= 2.0.
profit_sales_totals = df.groupby(["Category", "Sub-Category"])[['Profit', 'Sales']].agg(['sum'])
profit_sales_totals.plot(kind='bar', figsize=(16, 8))
plt.title('Total Profit and Sales per Sub-Category')
plt.show()

In [None]:
def draw_scatter_pairs(data, cols=2, rows=2, target='Profit'):
    """Scatter every column of *data* against the *target* column.

    One panel is drawn per column, filling a ``rows`` x ``cols`` grid row by
    row. Columns beyond ``rows * cols`` are not plotted; panels left over when
    there are fewer columns than grid cells are hidden.

    Parameters
    ----------
    data : DataFrame
        Columns to plot on the x axis; it must also contain *target*.
    cols, rows : int
        Shape of the subplot grid.
    target : str
        Name of the column plotted on the y axis of every panel.
    """
    feature_names = data.columns.values

    # squeeze=False always returns a 2-D array of axes, so grids with a single
    # row or a single column can be handled like any other shape.
    _, axarr = plt.subplots(rows, cols, figsize=(22, 16), squeeze=False)
    panels = axarr.ravel()  # row-major order: left to right, then top to bottom

    # zip stops at the shorter of the two, i.e. when the panels or the columns run out.
    for panel, name in zip(panels, feature_names):
        panel.scatter(x=data[name], y=data[target])
        panel.set(xlabel=name, ylabel=target)

    # Hide the panels that received no data instead of showing empty axes.
    for panel in panels[len(feature_names):]:
        panel.set_visible(False)

    plt.show()

In [None]:
# Absolute pairwise correlations between the numeric columns.
# The frame is restricted to numeric dtypes first: DataFrame.corr() raises on
# text columns in pandas >= 2.0, where non-numeric columns are no longer
# dropped silently.
new_corr = df.select_dtypes(include='number').corr().abs()
# Rank every numeric feature by the strength of its correlation with Profit.
new_corr['Profit'].sort_values(ascending=False)

In [None]:
# Rank the features by absolute correlation with Profit and keep the first
# four entries. Profit itself must be among them (drop() raises KeyError
# otherwise); removing it leaves the most correlated other features.
top4_corr = new_corr['Profit'].sort_values(ascending=False).head(4).drop('Profit')
top4_corr_names = top4_corr.index.values

In [None]:
# Add the Profit column back to the selection as the last entry.
# NOTE: re-running this cell appends 'Profit' once more.
top4_corr_names = [*top4_corr_names, 'Profit']
top4_corr_names

In [None]:
# Graphic representation of the features that have the highest
# correlation with 'Profit', drawn on a 2x2 grid.
draw_scatter_pairs(data=df[top4_corr_names], cols=2, rows=2)

* The graphs below show which values of various categorical variables bring higher profits. As a business manager, you can see which Ship Mode, Segment, Region, Category, Sub-Category, etc. brings higher profits.

In [None]:
# Categorical columns used to colour the Sales-vs-Profit scatter plots.
features = [
    'Ship Mode',
    'Segment',
    'Region',
    'Category',
    'Sub-Category',
]

In [None]:
# One Sales-vs-Profit scatter plot per categorical feature, coloured by that feature.
fig, ax = plt.subplots(3, 2, figsize=(20, 15))
panels = ax.flatten()
# Loop over the column names directly; slicing the DataFrame just to iterate
# over its column labels is unnecessary.
for var, subplot in zip(features, panels):
    sns.scatterplot(x="Sales", y="Profit", hue=var, data=df, ax=subplot)
# The grid can hold more panels than there are features; hide the unused ones
# instead of showing empty axes.
for unused_panel in panels[len(features):]:
    unused_panel.set_visible(False)
plt.show()
    

In [None]:
# Relationship between sales and profit -- scatter plot.
# Aggregated versions of the per-row scatter plots above.
def scatter_agg(feature):
    """Plot total Sales against total Profit for each value of *feature*.

    The notebook-level ``df`` is grouped by *feature*, Sales and Profit are
    summed within each group, and every group becomes one point in the
    scatter plot, coloured by its group label.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` to group by.
    """
    # Plain .sum() instead of np.sum callables, which are deprecated as
    # groupby aggregation functions in recent pandas.
    totals = df.groupby(feature)[["Sales", "Profit"]].sum()

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    # Draw on the axes created above explicitly rather than relying on them
    # being the current axes.
    sns.scatterplot(x="Sales", y="Profit", hue=totals.index, data=totals, ax=ax)
    # Name the actual grouping column; the title used to say "by State" for
    # every feature.
    ax.set_title("Relationship between Sales and Profit by {}".format(feature))
    plt.tight_layout()
    plt.show()
    


In [None]:
# Aggregated Sales-vs-Profit scatter: one point per shipping mode
scatter_agg('Ship Mode')

In [None]:
# Aggregated Sales-vs-Profit scatter: one point per customer segment
scatter_agg('Segment')

In [None]:
# Aggregated Sales-vs-Profit scatter: one point per region
scatter_agg('Region')

In [None]:
# Aggregated Sales-vs-Profit scatter: one point per product category
scatter_agg('Category')

In [None]:
# Aggregated Sales-vs-Profit scatter: one point per product sub-category
scatter_agg('Sub-Category')

In [None]:
# Per-row Sales vs Profit: colour encodes the customer segment and the marker
# style encodes the shipping mode.
fig = plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="Sales", y="Profit", hue="Segment", style="Ship Mode");