In [1]:
# Required libraries
import matplotlib.pyplot as plt  # Data visualization
import pandas as pd  # Data manipulation and analysis
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Load every source table from the ./data directory (resolved relative to the
# notebook's working directory). DATA_DIR is the single place to edit if the
# CSV files are stored somewhere else.
DATA_DIR = "./data"

distribution_centers = pd.read_csv(f"{DATA_DIR}/distribution_centers.csv")
events = pd.read_csv(f"{DATA_DIR}/events.csv")
inventory_items = pd.read_csv(f"{DATA_DIR}/inventory_items.csv")
order_items = pd.read_csv(f"{DATA_DIR}/order_items.csv")
orders = pd.read_csv(f"{DATA_DIR}/orders.csv")
products = pd.read_csv(f"{DATA_DIR}/products.csv")
users = pd.read_csv(f"{DATA_DIR}/users.csv")



In [2]:
# Preview the first five rows of the products table
products.head(5)

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id
0,13842,2.51875,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women,EBD58B8A3F1D72F4206201DA62FB1204,1
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1
3,14157,4.64877,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women,00BD13095D06C20B11A2993CA419D16B,1
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1


In [3]:
# Summarize the products table: row count, per-column dtype and non-null count, memory usage
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29120 entries, 0 to 29119
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      29120 non-null  int64  
 1   cost                    29120 non-null  float64
 2   category                29120 non-null  object 
 3   name                    29118 non-null  object 
 4   brand                   29096 non-null  object 
 5   retail_price            29120 non-null  float64
 6   department              29120 non-null  object 
 7   sku                     29120 non-null  object 
 8   distribution_center_id  29120 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [4]:
# Count the missing values in every column of the products table
products.isna().sum()

id                         0
cost                       0
category                   0
name                       2
brand                     24
retail_price               0
department                 0
sku                        0
distribution_center_id     0
dtype: int64

In [5]:
# Discard every product row that is missing its name or its brand
products = products.dropna(axis='index', how='any', subset=['name', 'brand'])

# Verify the result: each column should now report zero missing values
products.isna().sum()

id                        0
cost                      0
category                  0
name                      0
brand                     0
retail_price              0
department                0
sku                       0
distribution_center_id    0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
basic_stats = products.describe()
print("Basic Statistics:", basic_stats, sep="\n")

Basic Statistics:
                 id          cost  retail_price  distribution_center_id
count  29094.000000  29094.000000  29094.000000            29094.000000
mean   14558.540764     28.490452     59.239441                4.984773
std     8406.897280     30.634308     65.909439                2.901743
min        1.000000      0.008300      0.020000                1.000000
25%     7276.250000     11.275338     24.000000                2.000000
50%    14558.500000     19.688765     39.990002                5.000000
75%    21838.750000     34.450375     69.949997                8.000000
max    29120.000000    557.151002    999.000000               10.000000


# EDA

### Distribution of Categories

In [7]:
# Count the products in each category (most frequent first) and plot the counts as bars
category_distribution = (
    products['category']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

fig1 = px.bar(
    category_distribution,
    x='category',
    y='count',
    title='Distribution of Categories',
)
fig1.show()

### Distribution of Brands


In [8]:
# `go` (plotly.graph_objects) is imported with the other libraries in the first
# cell, so this cell and every later `go` user no longer depend on this one
# having been executed first.

# Number of products per brand, most common brand first
brand_distribution = (
    products['brand']
    .value_counts()
    .rename_axis('brand')
    .reset_index(name='count')
)

# Render the counts as a two-column Plotly table (one row per brand)
fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=['Brand', 'Count'],
                fill_color='paleturquoise',
                align='left',
            ),
            cells=dict(
                values=[brand_distribution['brand'], brand_distribution['count']],
                fill_color='lavender',
                align='left',
            ),
        )
    ]
)

fig.update_layout(title='Distribution of Brands')
fig.show()

In [9]:
# Count the products in each department and plot the counts as bars
department_distribution = (
    products['department']
    .value_counts()
    .rename_axis('department')
    .reset_index(name='count')
)

fig_department = px.bar(
    department_distribution,
    x='department',
    y='count',
    title='Distribution of Departments',
)
fig_department.show()

In [10]:
# Mean retail price of each brand, one row per brand (rows come out sorted by brand)
avg_price_per_brand = (
    products
    .groupby('brand', as_index=False)['retail_price']
    .mean()
    .rename(columns={'retail_price': 'avg_retail_price'})
)

print(avg_price_per_brand)

                 brand  avg_retail_price
0            !it Jeans         51.247692
1            '47 Brand         69.000000
2          007Lingerie         11.540000
3              10 Deep        103.949997
4            106Shades          8.290000
...                ...               ...
2751  tasc Performance         28.500000
2752     turkishtowels        149.990005
2753          under.me         24.000000
2754      vip boutique         15.490000
2755         wear ease         53.000000

[2756 rows x 2 columns]


In [11]:
# Rank the brands from the highest to the lowest average retail price
avg_price_per_brand_sorted = avg_price_per_brand.sort_values('avg_retail_price', ascending=False)

# Two-column table trace: brand next to its average retail price
table_trace = go.Table(
    header={
        'values': ['Brand', 'Average Retail Price'],
        'fill_color': 'paleturquoise',
        'align': 'left',
    },
    cells={
        'values': [
            avg_price_per_brand_sorted[column]
            for column in ('brand', 'avg_retail_price')
        ],
        'fill_color': 'lavender',
        'align': 'left',
    },
)

# Wrap the trace in a figure, give it a title and display it
fig = go.Figure(data=[table_trace])
fig.update_layout(title='Average Retail Price per Brand')
fig.show()

In [12]:

# Mean retail price of each category, one row per category
avg_price_per_category = (
    products
    .groupby('category')['retail_price']
    .mean()
    .rename('avg_retail_price')
    .reset_index()
)

# Most expensive categories first
avg_price_per_category_sorted = avg_price_per_category.sort_values(
    'avg_retail_price', ascending=False
)

# Two-column table trace: category next to its average retail price
table_trace_category = go.Table(
    header={
        'values': ['Category', 'Average Retail Price'],
        'fill_color': 'paleturquoise',
        'align': 'left',
    },
    cells={
        'values': [
            avg_price_per_category_sorted['category'],
            avg_price_per_category_sorted['avg_retail_price'],
        ],
        'fill_color': 'lavender',
        'align': 'left',
    },
)

# Wrap the trace in a figure, give it a title and display it
fig_category = go.Figure(data=[table_trace_category])
fig_category.update_layout(title='Average Retail Price per Category')
fig_category.show()

In [13]:
# Per-category means of the two price columns, each in its own frame
avg_cost_per_category = products.groupby('category', as_index=False)['cost'].mean()
avg_retail_price_per_category = products.groupby('category', as_index=False)['retail_price'].mean()

# Order both frames by category name so the two lines share the same x positions
avg_cost_per_category_sorted = avg_cost_per_category.sort_values('category')
avg_retail_price_per_category_sorted = avg_retail_price_per_category.sort_values('category')

# One line (with markers) per metric: blue for cost, red for retail price
trace_cost = go.Scatter(
    x=avg_cost_per_category_sorted['category'],
    y=avg_cost_per_category_sorted['cost'],
    mode='lines+markers',
    name='Average Cost',
    marker={'color': 'blue'},
)
trace_retail_price = go.Scatter(
    x=avg_retail_price_per_category_sorted['category'],
    y=avg_retail_price_per_category_sorted['retail_price'],
    mode='lines+markers',
    name='Average Retail Price',
    marker={'color': 'red'},
)

# Combine both traces in one figure; the legend is anchored at the top-left corner
fig_line = go.Figure(data=[trace_cost, trace_retail_price])
fig_line.update_layout(
    title='Comparison of Average Cost and Average Retail Price per Category',
    xaxis_title='Category',
    yaxis_title='Price',
    legend={'x': 0, 'y': 1, 'traceorder': 'normal'},
)
fig_line.show()

In [14]:
# Count the products for every (category, distribution center) combination
category_distribution_center = (
    products
    .groupby(['category', 'distribution_center_id'])
    .size()
    .reset_index(name='count')
)

# Stacked bar chart: one bar per category, one colored segment per distribution center.
# distribution_center_id is an identifier, not a quantity. Plotly Express maps a
# numeric color column to a continuous color scale (colorbar instead of legend),
# so the ids are cast to strings -- for the plot only -- to get one discrete color
# and one legend entry per center. category_orders keeps the legend in numeric
# id order rather than order of first appearance.
fig = px.bar(
    category_distribution_center.astype({'distribution_center_id': str}),
    x='category',
    y='count',
    color='distribution_center_id',
    category_orders={
        'distribution_center_id': [
            str(center_id)
            for center_id in sorted(category_distribution_center['distribution_center_id'].unique())
        ]
    },
    title='Category Distribution Across Distribution Centers',
)
fig.update_layout(
    xaxis_title='Category',
    yaxis_title='Number of Products',
    legend_title='Distribution Center',
)
fig.show()

In [4]:
# Preview the first five distribution centers
distribution_centers.head(5)

Unnamed: 0,id,name,latitude,longitude
0,1,Memphis TN,35.1174,-89.9711
1,2,Chicago IL,41.8369,-87.6847
2,3,Houston TX,29.7604,-95.3698
3,4,Los Angeles CA,34.05,-118.25
4,5,New Orleans LA,29.95,-90.0667


In [16]:
# Rename 'id' to 'distribution_center_id' so this table uses the same key name
# as the `products` table. The result is reassigned rather than renamed with
# inplace=True, so the frame object displayed by earlier cells is not mutated
# behind the reader's back; re-running this cell is still a no-op.
distribution_centers = distribution_centers.rename(columns={'id': 'distribution_center_id'})

# Plot each distribution center as a fixed-size marker at its longitude/latitude,
# with the center name as hover text
fig = go.Figure(
    data=[
        go.Scattergeo(
            lon=distribution_centers['longitude'],
            lat=distribution_centers['latitude'],
            text=distribution_centers['name'],
            mode='markers',
            marker=dict(size=20),
            name='Distribution Centers',
        )
    ]
)

# Restrict the base map to the USA and set the figure title and height
fig.update_layout(
    title='Distribution Centers in the USA',
    geo_scope='usa',
    height=650,
)

fig.show()


In [17]:
# Number of product rows per distribution center, stored as 'sku_count'
# NOTE(review): this counts rows, which equals the SKU count only if each
# product row has a distinct sku -- confirm against the data.
sku_count_per_center = (
    products
    .groupby('distribution_center_id')
    .size()
    .reset_index(name='sku_count')
)


In [21]:
# Attach the per-center SKU count to the center details; the left join keeps
# every distribution center even if it has no matching count
distribution_centers_with_sku_count = distribution_centers.merge(
    sku_count_per_center,
    how='left',
    on='distribution_center_id',
)
distribution_centers_with_sku_count

Unnamed: 0,distribution_center_id,name,latitude,longitude,sku_count
0,1,Memphis TN,35.1174,-89.9711,3890
1,2,Chicago IL,41.8369,-87.6847,3928
2,3,Houston TX,29.7604,-95.3698,3643
3,4,Los Angeles CA,34.05,-118.25,2761
4,5,New Orleans LA,29.95,-90.0667,2112
5,6,Port Authority of New York/New Jersey NY/NJ,40.634,-73.7834,2572
6,7,Philadelphia PA,39.95,-75.1667,2669
7,8,Mobile AL,30.6944,-88.0431,2919
8,9,Charleston SC,32.7833,-79.9333,2719
9,10,Savannah GA,32.0167,-81.1167,1881


In [27]:
# Bubble map of the distribution centers: marker size and color both encode the
# number of SKUs handled by each center.

sku_counts = distribution_centers_with_sku_count['sku_count']

# Scattergeo expects one hover string per marker, so build "<name><br>SKUs: <count>"
# for each center instead of passing a two-column DataFrame as `text`.
hover_text = (
    distribution_centers_with_sku_count['name']
    + '<br>SKUs: '
    + sku_counts.astype(str)
)

fig = go.Figure(
    data=go.Scattergeo(
        lon=distribution_centers_with_sku_count['longitude'],
        lat=distribution_centers_with_sku_count['latitude'],
        text=hover_text,
        mode='markers',
        marker=dict(
            size=sku_counts,
            sizemode='diameter',
            sizeref=100,  # marker diameter in px = sku_count / sizeref
            color=sku_counts,
            colorscale='Blues',
            cmin=sku_counts.min(),
            cmax=sku_counts.max(),
            colorbar_title='Number of SKU',
        ),
    )
)

# Restrict the base map to the USA and set the figure title and height
fig.update_layout(
    title_text='The Volume of SKU Handled By Distribution Center',
    geo_scope='usa',
    height=650,
)

fig.show()