In [25]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss

# Registry of the datetime columns in each Olist CSV, keyed by bare file name.
# Files that are not listed here are read without any date parsing.
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': ['shipping_limit_date'],
    'olist_order_reviews_dataset.csv': [
        'review_creation_date',
        'review_answer_timestamp',
    ],
    # The following datasets have NO date columns:
    # 'olist_customers_dataset.csv'
    # 'olist_geolocation_dataset.csv'
    # 'olist_order_payments_dataset.csv'
    # 'olist_products_dataset.csv'
    # 'olist_sellers_dataset.csv'
    # 'product_category_name_translation.csv'
}

# The cleaned delivered-orders file carries the same timestamp columns as the
# raw orders file; store an independent copy so the two lists never alias.
date_cols['olist_orders_delivered_clean.csv'] = list(date_cols['olist_orders_dataset.csv'])

def read_olist_csv(path):
    """
    Load one Olist CSV, converting its known date columns to datetime.

    The date columns are looked up in the module-level ``date_cols`` mapping
    by the file's base name, so the directory part of ``path`` is irrelevant.
    Files without an entry in ``date_cols`` are read with no date parsing.

    Args:
        path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: The file contents, with registered date columns parsed.
    """
    columns_to_parse = date_cols.get(os.path.basename(path), [])
    return pd.read_csv(path, parse_dates=columns_to_parse)

# Raw Olist tables. Paths are relative to the notebook's working directory;
# read_olist_csv parses date columns only for files registered in date_cols.
df_orders      = read_olist_csv('../data/olist_orders_dataset.csv')
df_customers   = read_olist_csv('../data/olist_customers_dataset.csv')
df_items       = read_olist_csv('../data/olist_order_items_dataset.csv')
df_payments    = read_olist_csv('../data/olist_order_payments_dataset.csv')
df_reviews     = read_olist_csv('../data/olist_order_reviews_dataset.csv')
df_products    = read_olist_csv('../data/olist_products_dataset.csv')
df_prod_cat_tr = read_olist_csv('../data/product_category_name_translation.csv')
df_sellers     = read_olist_csv('../data/olist_sellers_dataset.csv')
df_geolocation = read_olist_csv('../data/olist_geolocation_dataset.csv')

# Cleaned dataset
# (the cleaned geolocation file has no entry in date_cols, so it is read
# without date parsing; the cleaned orders file is registered there)
df_orders_delivered_clean   = read_olist_csv('../data/cleaned_data/olist_orders_delivered_clean.csv')
df_geolocation_clean        = read_olist_csv('../data/cleaned_data/olist_geolocation_dataset_city_cleaned_final_Jo.csv')

In [43]:
import plotly.express as px
import plotly.graph_objects as go

In [16]:
# Preview the first 10 rows of the sellers dataset (it was loaded in the setup cell):
df_sellers.head(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
5,c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
6,e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
7,1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
8,768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
9,ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


| Column Name              | Description                                                                 |
|--------------------------|-----------------------------------------------------------------------------|
| `seller_id`              | Unique identifier for each seller.                                          |
| `seller_zip_code_prefix` | First 5 digits of the seller's ZIP code (used for regional analysis).       |
| `seller_city`            | City where the seller is located.                                           |
| `seller_state`           | State (abbreviated) where the seller is located (e.g., SP for Sao Paulo).  |

In [3]:
# Column dtypes, non-null counts and memory footprint of the sellers table:
df_sellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [4]:
# Summary statistics for the numeric columns. The only numeric column is
# seller_zip_code_prefix, which is a code rather than a quantity, so the
# count/min/max are informative but mean and std should be read with care.
df_sellers.describe()

Unnamed: 0,seller_zip_code_prefix
count,3095.0
mean,32291.059451
std,32713.45383
min,1001.0
25%,7093.5
50%,14940.0
75%,64552.5
max,99730.0


In [17]:
# For every column, gather its distinct non-null values, then tabulate the
# column name, the number of distinct values and the values themselves.
distinct_by_col = {
    col: df_sellers[col].dropna().unique()
    for col in df_sellers.columns
}

summary = [
    {'Column': col, 'Unique Count': len(values), 'Unique Values': values}
    for col, values in distinct_by_col.items()
]

df_summary = pd.DataFrame(summary)
df_summary

Unnamed: 0,Column,Unique Count,Unique Values
0,seller_id,3095,"[3442f8959a84dea7ee197c632cb2df15, d1b65fc7deb..."
1,seller_zip_code_prefix,2246,"[13023, 13844, 20031, 4195, 12914, 20920, 5532..."
2,seller_city,611,"[campinas, mogi guacu, rio de janeiro, sao pau..."
3,seller_state,23,"[SP, RJ, PE, PR, GO, SC, BA, DF, RS, MG, RN, M..."


In [6]:
# Number of missing values in each column:
df_sellers.isna().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [7]:
# Number of rows that are exact duplicates (across all columns) of an earlier row:
df_sellers.duplicated().sum()

0

In [18]:
# List every distinct seller city in alphabetical order, one per line, so
# misspellings and non-city entries can be spotted by eye.
sorted_cities = list(df_sellers['seller_city'].unique())
sorted_cities.sort()
for city_name in sorted_cities:
    print(city_name)

04482255
abadia de goias
afonso claudio
aguas claras df
alambari
alfenas
almirante tamandare
alvares machado
alvorada
americana
amparo
ampere
anapolis
andira-pr
andradas
angra dos reis
angra dos reis rj
ao bernardo do campo
aparecida
aparecida de goiania
aperibe
apucarana
aracaju
aracatuba
araguari
arapongas
araquari
ararangua
araraquara
araras
araucaria
araxa
arinos
armacao dos buzios
arraial d'ajuda (porto seguro)
artur nogueira
aruja
arvorezinha
assis
atibaia
auriflama
auriflama/sp
avare
bady bassitt
baependi
bage
bahia
balenario camboriu
balneario camboriu
bandeirantes
barbacena
barbacena/ minas gerais
bariri
barra mansa
barra velha
barretos
barrinha
barro alto
barueri
batatais
bauru
bebedouro
belford roxo
belo horizont
belo horizonte
bento goncalves
bertioga
betim
birigui
blumenau
bocaiuva do sul
bofete
boituva
bom jardim
bom jesus dos perdoes
bombinhas
bonfinopolis de minas
borda da mata
botucatu
braco do norte
braganca paulista
brasilia
brasilia df
brejao
brotas
brusque
buritama

In [19]:
# Distinct city names beginning with 's' (case-insensitive), sorted
# alphabetically; non-string values are skipped.
s_cities = sorted(
    name
    for name in df_sellers['seller_city'].unique()
    if isinstance(name, str) and name.lower().startswith('s')
)

# One city per line:
for name in s_cities:
    print(name)

s jose do rio preto
sabara
salto
salvador
sando andre
santa barbara d oeste
santa barbara d'oeste
santa barbara d´oeste
santa catarina
santa cecilia
santa cruz do sul
santa maria
santa maria da serra
santa rita do sapucai
santa rosa
santa rosa de viterbo
santa terezinha de goias
santa terezinha de itaipu
santana de parnaiba
santo andre
santo andre/sao paulo
santo angelo
santo antonio da patrulha
santo antonio de padua
santo antonio de posse
santos
sao  jose dos pinhais
sao  paulo
sao bento
sao bento do sul
sao bernardo do campo
sao bernardo do capo
sao caetano do sul
sao carlos
sao francisco do sul
sao goncalo
sao joao da boa vista
sao joao de meriti
sao joao del rei
sao joaquim da barra
sao jose
sao jose do rio pardo
sao jose do rio pret
sao jose do rio preto
sao jose dos campos
sao jose dos pinhais
sao jose dos pinhas
sao leopoldo
sao ludgero
sao luis
sao miguel d'oeste
sao miguel do oeste
sao paluo
sao paulo
sao paulo - sp
sao paulo / sao paulo
sao paulo sp
sao paulop
sao pauo
sao p

## Data Cleaning

### Clean seller city names

In [29]:
# Normalise all city names: lowercase, trim both ends, and collapse any run of
# internal whitespace to a single space. The last step means variants such as
# 'sao  paulo' or 'ferraz de  vasconcelos' need no dedicated manual entry.
df_sellers['seller_city'] = (
    df_sellers['seller_city']
    .str.lower()
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Manual fixes, keyed by the normalised (lowercase, single-spaced) spelling.
# A value of None marks an entry that is not a usable city name; it becomes a
# missing value in seller_city_cleaned so it can be repaired separately.
manual_city_fixes = {
    # Sao Paulo variants:
    'sao paulo sp': 'sao paulo',
    'sao paulo - sp': 'sao paulo',
    'são paulo': 'sao paulo',
    'sao paulo / sao paulo': 'sao paulo',
    'sp': 'sao paulo',
    'sp / sp': 'sao paulo',
    'sao pauo': 'sao paulo',
    'sao paulop': 'sao paulo',
    'sao paluo': 'sao paulo',

    # City names carrying a state/country suffix, and plain misspellings:
    'mogi das cruzes / sp': 'mogi das cruzes',
    'ribeirao preto / sao paulo': 'ribeirao preto',
    'jacarei / sao paulo': 'jacarei',
    'carapicuiba / sao paulo': 'carapicuiba',
    'cariacica / es': 'cariacica',
    'barbacena/ minas gerais': 'barbacena',
    'auriflama/sp': 'auriflama',
    'santo andre/sao paulo': 'santo andre',
    'maua/sao paulo': 'maua',
    'sao sebastiao da grama/sp': 'sao sebastiao da grama',
    'balenario camboriu': 'balneario camboriu',
    'belo horizont': 'belo horizonte',
    'angra dos reis rj': 'angra dos reis',
    'arraial d\'ajuda (porto seguro)': 'arraial d\'ajuda',
    'brasilia df': 'brasilia',
    'lages - sc': 'lages',
    'pinhais/pr': 'pinhais',
    'novo hamburgo, rio grande do sul, brasil': 'novo hamburgo',
    'rio de janeiro / rio de janeiro': 'rio de janeiro',
    'rio de janeiro \\rio de janeiro': 'rio de janeiro',
    'rio de janeiro, rio de janeiro, brasil': 'rio de janeiro',
    'robeirao preto': 'ribeirao preto',
    'riberao preto': 'ribeirao preto',
    'ribeirao pretp': 'ribeirao preto',
    'garulhos': 'guarulhos',
    'sando andre': 'santo andre',
    'scao jose do rio pardo': 'sao jose do rio pardo',
    'ao bernardo do campo': 'sao bernardo do campo',
    'paincandu': 'paicandu',
    'andira-pr': 'andira',
    'floranopolis': 'florianopolis',
    's jose do rio preto': 'sao jose do rio preto',
    'santa barbara d oeste': 'santa barbara d\'oeste',
    'santa barbara d´oeste': 'santa barbara d\'oeste',
    'sbc': 'sao bernardo do campo',
    'sbc/sp': 'sao bernardo do campo',
    'tabao da serra': 'taboao da serra',

    # Further variants visible in the alphabetical city listings that
    # previously had no mapping:
    'sao bernardo do capo': 'sao bernardo do campo',
    'sao jose do rio pret': 'sao jose do rio preto',
    'sao jose dos pinhas': 'sao jose dos pinhais',
    'sao miguel d\'oeste': 'sao miguel do oeste',
    'aguas claras df': 'aguas claras',

    # To be dropped:
    # NOTE(review): 'santa catarina' (a state name) also appears in the city
    # listing and is not handled here - confirm whether it should be dropped too.
    '04482255': None,
    'vendas@creditparts.com.br': None,
    'bahia': None,
    'centro': None,
    'minas gerais': None
}

# Apply manual corrections; names without an entry are kept unchanged:
df_sellers['seller_city_cleaned'] = df_sellers['seller_city'].replace(manual_city_fixes)

In [30]:
# Show the seller rows whose city was mapped to None by the manual fixes
# (these appear as NaN in seller_city_cleaned):
df_sellers[df_sellers['seller_city_cleaned'].isna()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_cleaned
405,4221a7df464f1fe2955934e30ff3a5a1,48602,bahia,BA,
517,ceb7b4fb9401cd378de7886317ad1b47,22790,04482255,RJ,
2183,d5c530f4884a75ae0dba9c148718d278,35660,centro,MG,
2258,4b5f66b7adcf57f1ecc0d3c07dd6b177,87025,vendas@creditparts.com.br,PR,
2380,1703bc09972dab9782e7a9194943b69f,37165,minas gerais,MG,


- Bahia is a state, not a city.
- 04482255 is an invalid entry.
- Centro is a neighbourhood, ambiguous.
- vendas@creditparts.com.br is clearly invalid.
- Minas Gerais is a state, not specific enough.

In [33]:
# Show one example geolocation row for each of the zip code prefixes in
# question (the prefixes of the sellers whose city entry was invalid).
unique_zip_codes = [48602, 22790, 35660, 87025, 37165]

# Collect the first matching row per prefix and concatenate once, instead of
# growing a DataFrame inside the loop (which re-copies the accumulated rows on
# every iteration and starts from an empty frame). Rows keep the order of
# unique_zip_codes; a prefix with no match simply contributes no row.
zip_prefixes = df_geolocation_clean['geolocation_zip_code_prefix']
result_df = pd.concat(
    [df_geolocation_clean[zip_prefixes == zip_code].head(1) for zip_code in unique_zip_codes]
)

result_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
702411,48602,-9.402775,-38.219764,paulo afonso,BA
454148,22790,-23.011335,-43.450256,rio de janeiro,RJ
598304,35660,-19.84717,-44.601957,para de minas,MG
892109,87025,-23.377397,-51.931006,maringa,PR
624315,37165,-21.112446,-45.832343,campo do meio,MG


The zip code prefixes of these invalid entries all have valid matches in the `geolocation` dataset, so we will replace the invalid seller city values with the corresponding city names from geolocation.

In [34]:
# Repair the sellers whose cleaned city is still missing: look up their zip
# code prefix in the cleaned geolocation data and take the city that occurs
# most often for that prefix. Sellers whose prefix has no match are left as NaN.
needs_city = df_sellers['seller_city_cleaned'].isna()

for row_label, prefix in df_sellers.loc[needs_city, 'seller_zip_code_prefix'].items():
    # All geolocation city names recorded for this prefix
    candidates = df_geolocation_clean.loc[
        df_geolocation_clean['geolocation_zip_code_prefix'] == prefix,
        'geolocation_city',
    ]
    if candidates.empty:
        continue

    # value_counts() sorts by frequency, so the first label is the modal city
    df_sellers.at[row_label, 'seller_city_cleaned'] = candidates.value_counts().index[0]

In [38]:
# Re-inspect the five previously invalid rows. iloc selects by position; with
# the default RangeIndex these positions coincide with the row labels of the
# rows that had a missing seller_city_cleaned.
df_sellers.iloc[[405, 517, 2183, 2258, 2380]]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_cleaned
405,4221a7df464f1fe2955934e30ff3a5a1,48602,bahia,BA,paulo afonso
517,ceb7b4fb9401cd378de7886317ad1b47,22790,04482255,RJ,rio de janeiro
2183,d5c530f4884a75ae0dba9c148718d278,35660,centro,MG,para de minas
2258,4b5f66b7adcf57f1ecc0d3c07dd6b177,87025,vendas@creditparts.com.br,PR,maringa
2380,1703bc09972dab9782e7a9194943b69f,37165,minas gerais,MG,campo do meio


Now we can safely replace the `seller_city` values with the cleaned version.

In [39]:
# Overwrite seller_city with the cleaned names, keeping the column in place
df_sellers.loc[:, 'seller_city'] = df_sellers['seller_city_cleaned']

# The helper column has served its purpose; rebind df_sellers without it
df_sellers = df_sellers.drop('seller_city_cleaned', axis='columns')

# Quick look at the result
df_sellers.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [44]:
# Persist the cleaned sellers table; index=False keeps the row index out of the CSV.
df_sellers.to_csv('../data/cleaned_data/olist_sellers_dataset_cleaned.csv', index=False)

## Univariate Analysis

In [45]:
# Number of sellers per state (one row per seller in df_sellers):
state_counts = (
    df_sellers['seller_state']
    .value_counts()
    .rename_axis('seller_state')
    .reset_index(name='count')
)

fig = px.bar(
    state_counts,
    x='seller_state',
    y='count',
    title='Number of Sellers by State',
    labels={'count': 'Number of Sellers'},
)
fig.show()

In [46]:
# The ten cities with the most sellers, as a horizontal bar chart:
top_cities = (
    df_sellers['seller_city']
    .value_counts()
    .nlargest(10)
    .rename_axis('seller_city')
    .reset_index(name='count')
)

fig = px.bar(
    top_cities,
    x='count',
    y='seller_city',
    orientation='h',
    title='Top 10 Seller Cities',
    labels={'count': 'Number of Sellers', 'seller_city': 'City'},
)
# Order the bars by their total value along the category axis
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

## Bivariate Analysis

In [47]:
# Distinct sellers per (city, state) pair; keep the ten largest pairs and
# colour each bar by its state.
city_state = (
    df_sellers
    .groupby(['seller_city', 'seller_state'])['seller_id']
    .nunique()
    .reset_index(name='seller_count')
)

top_city_state = city_state.sort_values('seller_count', ascending=False).head(10)

fig = px.bar(
    top_city_state,
    x='seller_count',
    y='seller_city',
    color='seller_state',
    title='Top 10 Cities by Number of Sellers (Coloured by State)',
    labels={'seller_count': 'Number of Sellers', 'seller_city': 'City'},
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

## Multivariate Analysis

In [48]:
# Treemap with states as the outer level and cities nested inside them,
# each tile sized by its number of distinct sellers.
city_state_counts = (
    df_sellers
    .groupby(['seller_state', 'seller_city'])['seller_id']
    .nunique()
    .reset_index(name='seller_count')
)

fig = px.treemap(
    city_state_counts,
    path=['seller_state', 'seller_city'],
    values='seller_count',
    title='Number of Sellers by State and City',
)
fig.show()

In [49]:
# Distinct sellers per state; the displayed table is sorted from the state
# with the most sellers downwards (summary itself stays in group order).
summary = (
    df_sellers
    .groupby('seller_state')['seller_id']
    .nunique()
    .reset_index(name='num_unique_sellers')
)
summary.sort_values(by='num_unique_sellers', ascending=False)

Unnamed: 0,seller_state,num_unique_sellers
22,SP,1849
15,PR,349
8,MG,244
20,SC,190
16,RJ,171
19,RS,129
6,GO,40
4,DF,30
5,ES,23
2,BA,19
