In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [99]:
# Evaluate seller dataset
# Lets load the data
sellers_data = pd.read_csv("/kaggle/input/brazilian-ecommerce/olist_sellers_dataset.csv")
sellers_data.head(5)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [132]:
# How many sellers are in the dataset?
total_seller = sellers_data.shape[0]
total_seller

3095

In [101]:
# How many sellers are there in each state?
sellers_data.groupby("seller_state").size()

seller_state
AC       1
AM       1
BA      19
CE      13
DF      30
ES      23
GO      40
MA       1
MG     244
MS       5
MT       4
PA       1
PB       6
PE       9
PI       1
PR     349
RJ     171
RN       5
RO       2
RS     129
SC     190
SE       2
SP    1849
dtype: int64

In [102]:
# Which state has the most sellers?
sellers_data.groupby("seller_state").size().reset_index(name = "seller_count").sort_values("seller_count", ascending = False).head(1)

Unnamed: 0,seller_state,seller_count
22,SP,1849


In [103]:
# Which city has the highest number of sellers?
sellers_data.groupby("seller_city").size().reset_index(name = "seller_count").sort_values("seller_count", ascending = False).head(1)

Unnamed: 0,seller_city,seller_count
517,sao paulo,694


In [104]:
# What is the average number of sellers per state?
seller_per_state = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count")
)
avg_seller_per_state = seller_per_state["seller_count"].mean().round(2)
avg_seller_per_state

134.57

In [105]:
# Which states have more than 100 sellers?
seller_per_state = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count")
)
seller_per_state = seller_per_state[seller_per_state["seller_count"] > 100]
seller_per_state

Unnamed: 0,seller_state,seller_count
8,MG,244
15,PR,349
16,RJ,171
19,RS,129
20,SC,190
22,SP,1849


In [106]:
# How many sellers are in the top 10 cities by seller count?
seller_per_city = (
    sellers_data.groupby("seller_city")
    .size()
    .reset_index(name = "seller_count")
    .sort_values("seller_count", ascending = False)
    .head(10)
)
sellers_in_top10 = seller_per_city["seller_count"].sum()
sellers_in_top10

1262

In [107]:
# Which cities have only one seller?
seller_per_city = (
    sellers_data.groupby("seller_city")
    .size()
    .reset_index(name = "seller_count")
    .sort_values("seller_count", ascending = False)
)
seller_per_city = seller_per_city[seller_per_city["seller_count"] == 1]
seller_per_city.head(10)

Unnamed: 0,seller_city,seller_count
599,viana,1
433,presidente epitacio,1
410,piracanjuba,1
434,presidente getulio,1
600,vicente de carvalho,1
397,pedregulho,1
557,taruma,1
436,queimados,1
408,pinhalao,1
443,ribeirao preto / sao paulo,1


In [108]:
# Rank cities by the number of sellers within each state?
state_city_seller = (
    sellers_data.groupby(["seller_state", "seller_city"])
    .size()
    .reset_index(name = "seller_count")
)

state_city_seller["rank"] = (
    state_city_seller.groupby("seller_state")["seller_count"]
    .rank(method = "first", ascending = False)
)
state_city_seller = state_city_seller.sort_values(["seller_state", "rank"])
state_city_seller.head(10)

Unnamed: 0,seller_state,seller_city,seller_count,rank
0,AC,rio branco,1,1.0
1,AM,manaus,1,1.0
13,BA,salvador,7,1.0
11,BA,lauro de freitas,2,2.0
2,BA,arraial d'ajuda (porto seguro),1,3.0
3,BA,bahia,1,4.0
4,BA,barro alto,1,5.0
5,BA,eunapolis,1,6.0
6,BA,feira de santana,1,7.0
7,BA,guanambi,1,8.0


In [114]:
# Find states where a single city accounts for more than 50% of sellers in that state?
seller_per_state = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count")
)

merged = state_city_seller.merge(seller_per_state, on = "seller_state")
merged["perc_seller_per_state"] = (merged["seller_count_x"] / merged["seller_count_y"]) * 100
merged = merged[merged["perc_seller_per_state"] > 50]
merged[["seller_state", "perc_seller_per_state"]]

Unnamed: 0,seller_state,perc_seller_per_state
0,AC,100.0
1,AM,100.0
14,CE,53.846154
21,DF,93.333333
35,GO,57.5
47,MA,100.0
130,MS,80.0
135,PA,100.0
141,PE,55.555556
145,PI,100.0


In [130]:
# Which state has the highest concentration of sellers in its top 3 cities?
top3_city_state_totals = state_city_seller[state_city_seller["rank"] <= 3]
top3_totals = (
    top3_city_state_totals.groupby("seller_state")["seller_count"]
    .sum()
    .reset_index(name="top3_seller_count")
)
result = top3_totals.merge(seller_per_state, on="seller_state")
result["top3_concentration"] = (result["top3_seller_count"] / result["seller_count"]) * 100
result.head(10)

Unnamed: 0,seller_state,top3_seller_count,seller_count,top3_concentration
0,AC,1,1,100.0
1,AM,1,1,100.0
2,BA,10,19,52.631579
3,CE,9,13,69.230769
4,DF,30,30,100.0
5,ES,12,23,52.173913
6,GO,29,40,72.5
7,MA,1,1,100.0
8,MG,90,244,36.885246
9,MS,5,5,100.0


In [135]:
# Calculate the percentage of sellers in each state compared to the total number of sellers.
seller_per_state["perc_wise"] = np.round((seller_per_state["seller_count"] / total_seller) * 100, 2)
seller_per_state.head(10)

Unnamed: 0,seller_state,seller_count,perc_wise
0,AC,1,0.03
1,AM,1,0.03
2,BA,19,0.61
3,CE,13,0.42
4,DF,30,0.97
5,ES,23,0.74
6,GO,40,1.29
7,MA,1,0.03
8,MG,244,7.88
9,MS,5,0.16


In [139]:
# Identify cities that belong to states with more than 500 sellers but themselves have fewer than 10 sellers.
seller_state_city_stats = state_city_seller.merge(seller_per_state, on="seller_state")
result = seller_state_city_stats[(seller_state_city_stats["seller_count_y"] > 500) & (seller_state_city_stats["seller_count_x"] < 10)]
result[["seller_city"]]

Unnamed: 0,seller_city
409,diadema
410,itaquaquecetuba
411,pedreira
412,porto ferreira
413,salto
...,...
631,varzea paulista
632,vera cruz
633,vicente de carvalho
634,vila velha


In [149]:
 # Find the top 5 states that contribute to 80% of all sellers (Pareto principle).
seller_per_state = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count")
    .sort_values("seller_count", ascending = False)
)

seller_per_state["cum_seller_count"] = seller_per_state["seller_count"].cumsum()
seller_per_state["cum_seller_perc"] = np.round((seller_per_state["cum_seller_count"] / total_seller) * 100, 2)
seller_per_state = seller_per_state[seller_per_state["cum_seller_perc"] <= 80]
seller_per_state

Unnamed: 0,seller_state,seller_count,cum_seller_count,cum_seller_perc
22,SP,1849,1849,59.74
15,PR,349,2198,71.02
8,MG,244,2442,78.9


In [160]:
# Which state has the lowest seller-to-city ratio (sellers per city)?
seller_state_city_stats = (
    sellers_data.groupby("seller_state")
    .agg(
        total_seller = ("seller_id", "size"),
        unique_city = ("seller_city", "nunique")
    )
    .reset_index()
)
seller_state_city_stats["ratio"] = np.round(seller_state_city_stats["total_seller"] / seller_state_city_stats["unique_city"])
seller_state_city_stats.sort_values("ratio", ascending = True).head(1)[["seller_state", "ratio"]]

Unnamed: 0,seller_state,ratio
0,AC,1.0


In [167]:
# Identify the median number of sellers per city for each state and rank the states by this median.
seller_state_city_stats = (
    sellers_data.groupby(["seller_state", "seller_city"])
    .size()
    .reset_index(name = "statewise_city_seller_count")
)
seller_state_city_stats = (
    seller_state_city_stats.groupby("seller_state")["statewise_city_seller_count"]
    .quantile(0.5)
    .reset_index(name = "median")
)
seller_state_city_stats["rank"] = seller_state_city_stats["median"].rank(method = "first", ascending = True)
seller_state_city_stats.sort_values("rank")
seller_state_city_stats.head(10)

Unnamed: 0,seller_state,median,rank
0,AC,1.0,1.0
1,AM,1.0,2.0
2,BA,1.0,3.0
3,CE,1.0,4.0
4,DF,1.0,5.0
5,ES,1.0,6.0
6,GO,1.0,7.0
7,MA,1.0,8.0
8,MG,1.0,9.0
9,MS,2.5,23.0


In [169]:
# Detect outliers: Which cities have unusually high numbers of sellers compared to others in the same state?
seller_state_city_stats = (
    sellers_data.groupby(["seller_state", "seller_city"])
    .size()
    .reset_index(name = "seller_count")
)
q1 = seller_state_city_stats["seller_count"].quantile(0.25)
q3 = seller_state_city_stats["seller_count"].quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = seller_state_city_stats[(seller_state_city_stats["seller_count"] < lower_bound) | (seller_state_city_stats["seller_count"] > upper_bound)]
outliers = outliers.sort_values("seller_count", ascending = False)
outliers.head(10)

Unnamed: 0,seller_state,seller_city,seller_count
590,SP,sao paulo,694
166,PR,curitiba,124
238,RJ,rio de janeiro,93
57,MG,belo horizonte,66
555,SP,ribeirao preto,52
458,SP,guarulhos,50
461,SP,ibitinga,49
573,SP,santo andre,45
415,SP,campinas,41
188,PR,maringa,40


In [189]:
# For each state, calculate the percentage contribution of its largest city to the total sellers in that state.
seller_state_city_stats = (
    sellers_data.groupby(["seller_state", "seller_city"])
    .size()
    .reset_index(name = "seller_count_citywise")
)

seller_per_state = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count_statewise")
)

merged_result = seller_state_city_stats.merge(seller_per_state, on = "seller_state")

merged_result["rank"] = (
    merged_result.groupby("seller_state")["seller_count_citywise"]
    .rank(method = "first", ascending = False)
)

merged_result = merged_result.sort_values(["seller_state", "rank"])

top1_city_state_rankwise = merged_result[merged_result["rank"] == 1].copy()
top1_city_state_rankwise["perc_contribution"] = np.round(
    (top1_city_state_rankwise["seller_count_citywise"] / top1_city_state_rankwise["seller_count_statewise"]) * 100, 2
)
top1_city_state_rankwise.sort_values("perc_contribution", ascending=False).head(10)

Unnamed: 0,seller_state,seller_city,seller_count_citywise,seller_count_statewise,rank,perc_contribution
0,AC,rio branco,1,1,1.0,100.0
47,MA,sao luis,1,1,1.0,100.0
145,PI,teresina,1,1,1.0,100.0
1,AM,manaus,1,1,1.0,100.0
135,PA,marechal candido rondon,1,1,1.0,100.0
21,DF,brasilia,28,30,1.0,93.33
130,MS,campo grande,4,5,1.0,80.0
39,GO,goiania,23,40,1.0,57.5
143,PE,recife,5,9,1.0,55.56
238,RJ,rio de janeiro,93,171,1.0,54.39


In [200]:
# Create a cumulative distribution of sellers by state and find the state where cumulative percentage crosses 70%.
cumulative_distribution = (
    sellers_data.groupby("seller_state")
    .size()
    .reset_index(name = "seller_count")
)

cumulative_distribution["cum_seller_count"] = seller_per_state["seller_count"].cumsum()

# 5. Calculate cumulative percentage
cumulative_distribution["perc_seller"] = np.round(
    (cumulative_distribution["cum_seller_count"] / total_seller) * 100, 2
)
state_crossing_70 = cumulative_distribution[cumulative_distribution["perc_seller"] >= 70].head(1)
state_crossing_70

Unnamed: 0,seller_state,seller_count,cum_seller_count,perc_seller
22,SP,1849,3095,100.0


In [202]:
# Which states have an equal number of sellers and unique cities?
states_stats = (
    sellers_data.groupby("seller_state")
    .agg(
        total_seller = ("seller_id", "size"),
        unique_city = ("seller_city", "nunique")
    )
    .reset_index()
)
states_stats = states_stats[states_stats["total_seller"] == states_stats["unique_city"]]
states_stats

Unnamed: 0,seller_state,total_seller,unique_city
0,AC,1,1
1,AM,1,1
7,MA,1,1
11,PA,1,1
14,PI,1,1
18,RO,2,2
21,SE,2,2
