**Performing Exploratory Data Analysis on the "SampleSuperstore" dataset**

In [2]:
# Importing the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Load the dataset
# The CSV location can be overridden with the SUPERSTORE_CSV environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "SUPERSTORE_CSV",
    "C:/Users/omuku/OneDrive/Desktop/Desktop/GRIP/Exploratory-Data-Analysis/SampleSuperstore.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()  # See the first 5 rows

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.94,3,0.0,219.582
2,Second Class,Corporate,United States,Los Angeles,California,90036,West,Office Supplies,Labels,14.62,2,0.0,6.8714
3,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Furniture,Tables,957.5775,5,0.45,-383.031
4,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Office Supplies,Storage,22.368,2,0.2,2.5164


In [3]:
# Shape of the data, reported as a (number of rows, number of columns) tuple
df.shape

(9994, 13)

In [4]:
# Tally the null entries found in every column
missing_values = df.isna().sum()

# Keep only the columns that contain at least one null entry and show them
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print(columns_with_gaps)


Series([], dtype: int64)


The filtered result above is an empty Series, so no column in the DataFrame contains missing values.


In [5]:
# Get basic information about the dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

# Summary statistics
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB
None
        Postal Code         Sales     Quantity     Discount       Profit
count   9994.000000   9994.000000  9994.000000  9994.000000  9994.000000
mean   55190.379428    229.85800

**Summarizing Key Features of the Dataset**

In [6]:
# Both roll-ups report the same statistics (total, average, spread) for the
# same two measures, so the aggregation spec is written once and reused.
metric_aggregations = {
    measure: ['sum', 'mean', 'std'] for measure in ('Sales', 'Profit')
}

# Sales and profit summary by category
category_summary = df.groupby('Category').agg(metric_aggregations)
print(category_summary)

# Sales and profit summary by region
region_summary = df.groupby('Region').agg(metric_aggregations)
print(region_summary)



                       Sales                                Profit             \
                         sum        mean          std          sum       mean   
Category                                                                        
Furniture        741999.7953  349.834887   503.179145   18451.2728   8.699327   
Office Supplies  719047.0320  119.324101   382.182228  122490.8008  20.327050   
Technology       836154.0330  452.709276  1108.655848  145454.9481  78.752002   

                             
                        std  
Category                     
Furniture        136.049246  
Office Supplies  164.887280  
Technology       428.816633  
               Sales                               Profit             \
                 sum        mean         std          sum       mean   
Region                                                                 
Central  501239.8908  215.772661  632.779010   39706.3625  17.092709   
East     678781.2400  238.336110  620.712652 

**Creating Dashboards**

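We create a dashboard focusing on sales and profit analysis: a city selector drives a bar chart of sales by category and a line chart of profit by sub-category.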

In [13]:
import dash
from dash import Dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import plotly.express as px




In [8]:
# Figure builder for the dashboard: one city in, two charts out
def update_charts(selected_city):
    """Build the sales and profit figures for a single city.

    Filters the notebook-level ``df`` to the rows whose ``City`` equals
    ``selected_city`` and aggregates them into two Plotly Express figures.

    NOTE: a later cell defines another function with this same name,
    decorated with ``@app.callback``; once that cell runs, it replaces this
    definition in the notebook namespace.

    Parameters
    ----------
    selected_city
        Value compared against the ``City`` column.

    Returns
    -------
    tuple
        ``(sales_bar_chart, profit_line_chart)``: total ``Sales`` per
        ``Category`` as a bar chart, and total ``Profit`` per
        ``Sub-Category`` as a line chart.
    """
    filtered_df = df[df['City'] == selected_city]

    # Sales by Category Bar Chart
    sales_bar_chart = px.bar(
        filtered_df.groupby('Category')['Sales'].sum().reset_index(),
        x='Category', y='Sales',
        title=f'Sales by Category for {selected_city}'
    )

    # Sub-Category Profit Line Chart
    # The x-axis is Sub-Category (the dataset has no date column), so the
    # title names that grouping instead of claiming a monthly breakdown.
    profit_line_chart = px.line(
        filtered_df.groupby('Sub-Category')['Profit'].sum().reset_index(),
        x='Sub-Category', y='Profit',
        title=f'Profit by Sub-Category for {selected_city}'
    )

    return sales_bar_chart, profit_line_chart



In [14]:
# Initialize the Dash app
app = Dash(__name__)

# Define the layout: a heading, a city selector, and the two graphs that the
# callback fills in.
app.layout = html.Div([
    # Dash style dictionaries use camelCased CSS property names.
    html.H1("Superstore Sales Dashboard", style={'textAlign': 'center'}),

    dcc.Dropdown(
        # The id says "country" but the control lists cities; it is kept as-is
        # because the callback's Input refers to this exact id.
        id='country-dropdown',
        # Sorted so the list is easy to scan and the default value
        # (the alphabetically first city) appears at the top.
        options=[
            {'label': str(city), 'value': city}
            for city in sorted(df['City'].unique())
        ],
        value=df['City'].min(),
        clearable=False,
        style={'width': '50%', 'margin': '0 auto'}
    ),

    dcc.Graph(id='sales-bar-chart'),
    dcc.Graph(id='profit-line-chart')
])


In [15]:
# Define callback
@app.callback(
    [Output('sales-bar-chart', 'figure'),
     Output('profit-line-chart', 'figure')],
    [Input('country-dropdown', 'value')]
)
def update_charts(selected_city):
    """Rebuild both dashboard figures when the dropdown selection changes.

    Registered as the Dash callback that reads the ``value`` of the
    ``country-dropdown`` component (which holds a city name) and writes the
    ``figure`` of ``sales-bar-chart`` and ``profit-line-chart``.

    Parameters
    ----------
    selected_city
        Current dropdown value, compared against the ``City`` column of the
        notebook-level ``df``.

    Returns
    -------
    tuple
        ``(sales_bar_chart, profit_line_chart)``, in the same order as the
        declared outputs: total ``Sales`` per ``Category`` as a bar chart and
        total ``Profit`` per ``Sub-Category`` as a line chart.
    """
    filtered_df = df[df['City'] == selected_city]

    # Sales by Category Bar Chart
    sales_bar_chart = px.bar(
        filtered_df.groupby('Category')['Sales'].sum().reset_index(),
        x='Category', y='Sales',
        title=f'Sales by Category for {selected_city}'
    )

    # Sub-Category Profit Line Chart
    # The x-axis is Sub-Category (the dataset has no date column), so the
    # title names that grouping instead of claiming a monthly breakdown.
    profit_line_chart = px.line(
        filtered_df.groupby('Sub-Category')['Profit'].sum().reset_index(),
        x='Sub-Category', y='Profit',
        title=f'Profit by Sub-Category for {selected_city}'
    )

    return sales_bar_chart, profit_line_chart


In [16]:
# Run the app
# `app.run` is the current way to start the development server; `run_server`
# is the legacy name and is unavailable in recent Dash releases. Prefer `run`
# and fall back to `run_server` only on installations that lack it.
if __name__ == '__main__':
    start_server = app.run if hasattr(app, 'run') else app.run_server
    start_server(debug=True)