In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Display configuration: show floats with thousands separators and two decimals.
# Optional: uncomment the next line to print full (untruncated) column contents.
# pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:,.2f}".format

In [8]:
# Read every CSV of the Brazilian E-Commerce (Olist) dataset into its own DataFrame
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)

# Table name -> CSV file name; insertion order is the order the files are read in
csv_by_table = {
    'customers': "olist_customers_dataset.csv",
    'geolocation': "olist_geolocation_dataset.csv",
    'orders': "olist_orders_dataset.csv",
    'items': "olist_order_items_dataset.csv",
    'payments': "olist_order_payments_dataset.csv",
    'reviews': "olist_order_reviews_dataset.csv",
    'products': "olist_products_dataset.csv",
    'sellers': "olist_sellers_dataset.csv",
    'translation': "product_category_name_translation.csv",
}

# Keep all DataFrames in one dictionary so they can be looped over and inspected together
dataframes = {
    table: pd.read_csv(data_path + csv_name)
    for table, csv_name in csv_by_table.items()
}

# Individual names for direct access; each refers to the same object stored in `dataframes`
customers = dataframes['customers']
geolocation = dataframes['geolocation']
orders = dataframes['orders']
items = dataframes['items']
payments = dataframes['payments']
reviews = dataframes['reviews']
products = dataframes['products']
sellers = dataframes['sellers']
translation = dataframes['translation']

In [None]:
# Define a function to summarize a dictionary of DataFrames
def summarize_dataframes(df_dict=None):
    """
    Build a summary table with one row per DataFrame in a dictionary.

    Parameters:
        df_dict (dict | None): Mapping of name -> DataFrame. When omitted (None),
            the notebook-level `dataframes` dictionary is used; it is looked up
            at call time so a rebuilt `dataframes` is always picked up.

    Returns:
        pd.DataFrame: one row per entry of `df_dict`, with the columns
            - name: the key name from the dictionary
            - rows: number of rows in the DataFrame
            - columns: number of columns
            - column_names: a list of column names
            - duplicates: number of fully duplicated rows
        An empty `df_dict` yields an empty frame that still has these columns.
    """
    if df_dict is None:
        df_dict = dataframes

    summary_columns = ["name", "rows", "columns", "column_names", "duplicates"]

    # Iterate the dictionary that was passed in (not an unrelated global)
    summary = [
        {
            "name": name,
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": list(df.columns),
            "duplicates": df.duplicated().sum(),
        }
        for name, df in df_dict.items()
    ]

    # Passing the column list keeps the schema stable even when `summary` is empty
    return pd.DataFrame(summary, columns=summary_columns)


# Summarize every loaded DataFrame; the returned table is rendered as the cell output
summarize_dataframes(df_dict=dataframes)

Unnamed: 0,name,rows,columns,column_names,duplicates
0,customers,99441,5,"[customer_id, customer_unique_id, customer_zip...",0
1,geolocation,1000163,5,"[geolocation_zip_code_prefix, geolocation_lat,...",261831
2,orders,99441,8,"[order_id, customer_id, order_status, order_pu...",0
3,items,112650,7,"[order_id, order_item_id, product_id, seller_i...",0
4,payments,103886,5,"[order_id, payment_sequential, payment_type, p...",0
5,reviews,99224,7,"[review_id, order_id, review_score, review_com...",0
6,products,32951,9,"[product_id, product_category_name, product_na...",0
7,sellers,3095,4,"[seller_id, seller_zip_code_prefix, seller_cit...",0
8,translation,71,2,"[product_category_name, product_category_name_...",0


In [None]:
# Display a random sample of up to 5 rows from each DataFrame for a quick visual inspection
SAMPLE_ROWS = 5
SAMPLE_SEED = 42  # fixed seed so the sampled rows are the same on every Restart & Run All

for name, df in dataframes.items():
    print(f'{name.capitalize()}:')
    # Cap the sample size: DataFrame.sample raises if n exceeds the number of rows
    display(df.sample(n=min(SAMPLE_ROWS, len(df)), random_state=SAMPLE_SEED))
    print("-"*130)

Customers:


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
13560,b2e5428802f42f9f27ef4b4bdf20524e,5c96a1394a3f018245c5eac45e612efb,1419,sao paulo,SP
87221,df1936f12dd45f272699ab4d72a0b98f,8597fc7ec5841b2d3832b3ff62c5d2c4,88058,florianopolis,SC
93310,5e89152d3238fc7986e1c5222eb50e58,0eb34a6657a03ec649cfeb4559d0e22a,88070,florianopolis,SC
51027,c19d64724601030499c6e9035029fff0,903fea98271edfad787cc4f60428edc2,30230,belo horizonte,MG
33630,ec7ec1275ebdd28f29b049a22925adf9,fc72910296cdd10668e50af1dd057f1e,15997,matao,SP


----------------------------------------------------------------------------------------------------------------------------------
Geolocation:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
878813,85801,-24.97,-53.46,cascavel,PR
770744,68470,-2.0,-49.86,oeiras do para,PA
804139,75240,-16.96,-48.94,bela vista de goias,GO
803715,75180,-16.67,-48.61,silvania,GO
989411,97700,-29.18,-54.87,santiago,RS


----------------------------------------------------------------------------------------------------------------------------------
Orders:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
65007,9ee8cf0ffcf31bb51cfefb03220c63e8,644f81e8109d25d2df7ac2eb7b24ac88,delivered,2017-11-19 13:21:35,2017-11-21 03:51:26,2017-11-24 16:47:03,2017-12-07 16:17:54,2017-12-08 00:00:00
89220,dbf97707cd97d5209511c307f315a361,2683f227b4ca959e6183a3ebbd98b3d4,delivered,2018-02-28 14:06:00,2018-03-02 02:40:31,2018-03-02 18:45:29,2018-03-23 21:27:54,2018-03-20 00:00:00
75442,ad9703dda0e19971bf6e878b552bf117,bd43f44fa28ef40813b0c69b6619858a,delivered,2018-01-12 18:58:03,2018-01-12 19:10:31,2018-01-15 22:41:46,2018-02-02 18:14:09,2018-02-15 00:00:00
26713,d77c01a7f1b1ea093949dbc6fe3cc2fa,2e4bda9c3b716e596451491497474dd3,delivered,2018-04-25 17:25:13,2018-04-25 17:55:27,2018-04-30 06:21:00,2018-05-05 11:02:28,2018-05-28 00:00:00
72515,b7994d35101970f5e1a4621fe05cc1e9,79c07db4dd2ebdf63826672146b15128,delivered,2017-11-17 19:48:19,2017-11-17 20:35:43,2017-11-21 17:22:30,2017-11-22 17:18:42,2017-12-04 00:00:00


----------------------------------------------------------------------------------------------------------------------------------
Items:


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
78067,b1a4848ea79dd95fd703aa8eebcfdf28,1,84a87daa85c8b432d90bc1baa0cb4388,aafe36600ce604f205b86b5084d3d767,2017-10-19 15:14:22,59.9,16.99
75805,aca1243538a8bc195f2f017f8ea2456f,1,4b703444923b5e57bb73ba343f5ebab7,42b729f859728f5079499127a9c2ef37,2018-03-27 20:56:02,32.9,7.39
20617,2f40c8579236a29cb23f92d2533485b4,1,d17a8ef6d4228318ca645116dd6bea23,18a349e75d307f4b4cc646a691ed4216,2018-05-17 13:50:26,100.0,19.39
17026,270087d97c6abc409514129d62d3a513,1,c7bcab3a7039340cf46705f43bab3ee2,2d50d6282f8aa2257819a77bfaa0efe0,2017-09-06 13:55:17,49.0,12.69
77664,b0ca5afc6c53a1b2bd7c6af809913f9e,1,11dcc970f7a5581e1bf777b6da3f7096,06579cb253ecd5a3a12a9e6eb6bf8f47,2017-08-15 14:35:11,69.9,11.73


----------------------------------------------------------------------------------------------------------------------------------
Payments:


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
24346,fc0c50f90120412d596dbda411f6fbe4,1,credit_card,1,37.13
20563,3451e358dcd1b3eb514821e4e9772ccc,1,credit_card,10,541.79
29200,e0be387e4482cc1e135c5ffe3d896885,1,credit_card,2,126.54
49318,f943e18398079cee438234bf6482996e,1,credit_card,10,347.06
32187,7529dd1778a9d75bcd367a8ddc1e9d14,1,credit_card,2,77.57


----------------------------------------------------------------------------------------------------------------------------------
Reviews:


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
81838,0b5129fe1e50d163642e93fde165e271,62f808f5e8babdf5f0092f748ce5b554,5,,Tudo conforme acordado.,2018-02-24 00:00:00,2018-02-25 15:11:29
93099,32fe58c9d12a7cc69672f5146666350e,9588842b04b68104582a11aced293fa2,5,,,2018-07-07 00:00:00,2018-07-08 22:30:47
44354,1fb179f64b72e5f5bdb40114d0ad752b,113c27b79eb067ecd214d592db312190,5,,,2018-01-30 00:00:00,2018-01-30 22:00:55
40147,7e4b1b3777dd481631a7d15037fafe38,f95d91071966bc70023eae0e1750c1fc,5,,Loja confiável,2018-03-28 00:00:00,2018-03-29 02:16:37
88739,3f755ce3d3626de06573eda790500de5,0f57c7a28afbca568ab72eb6be1dfb81,4,,,2018-04-06 00:00:00,2018-04-06 12:46:00


----------------------------------------------------------------------------------------------------------------------------------
Products:


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
31783,b316b3aedd7d2e26569a318e2611e6c0,automotivo,63.0,1541.0,4.0,2300.0,20.0,10.0,20.0
1027,6d07236d4abe99b5b1cbf368971bf6e7,brinquedos,49.0,170.0,6.0,200.0,17.0,7.0,12.0
5222,9f6b6b828b980352cbd59ff7a03a7277,cama_mesa_banho,53.0,210.0,1.0,675.0,35.0,20.0,35.0
14942,53f345740817185c3b226e722a2f729a,informatica_acessorios,49.0,152.0,6.0,200.0,18.0,7.0,12.0
29881,c24d97e07dcb8f32a8bfbe755731950a,cama_mesa_banho,46.0,471.0,1.0,10500.0,58.0,20.0,46.0


----------------------------------------------------------------------------------------------------------------------------------
Sellers:


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
2858,353e21e8bf8de2722b4fae1636664b76,92120,canoas,RS
2502,b7cc6c5e001441ae8cdd5c69a480cbe4,89520,curitibanos,SC
1700,615c3462099ffa940d37b17dfda19594,24855,rio de janeiro,RJ
810,9200e6ce317b67196b6b0bad4d4bd567,20771,rio de janeiro,RJ
463,a90a5da8b09e44264a177c0374a5ac87,84010,ponta grossa,PR


----------------------------------------------------------------------------------------------------------------------------------
Translation:


Unnamed: 0,product_category_name,product_category_name_english
64,cds_dvds_musicais,cds_dvds_musicals
34,artigos_de_festas,party_supplies
15,telefonia_fixa,fixed_telephony
3,cama_mesa_banho,bed_bath_table
23,malas_acessorios,luggage_accessories


----------------------------------------------------------------------------------------------------------------------------------


In [None]:
# Quick overview of column properties (dtypes, missing values, uniques) for all DataFrames
def overview(df_dict=None):
    """
    Creates and displays a column-wise overview for each DataFrame in a dictionary.

    Parameters:
        df_dict (dict | None): A dictionary of DataFrames (e.g., {'orders': orders, ...}).
            When omitted (None), the notebook-level `dataframes` dictionary is used;
            it is looked up at call time so a rebuilt `dataframes` is always picked up.

    Displays:
        For each DataFrame, a table indexed by column name with:
            - dtype: data type
            - total: non-null count
            - missing_n: missing value count
            - missing_%: missing value percentage (0-100)
            - uniques_n: number of unique values (missing values not counted)
            - uniques: array of unique values (a missing value appears as an entry)

    Returns:
        None (output is printed / displayed)
    """
    if df_dict is None:
        df_dict = dataframes

    for name, df in df_dict.items():
        print(f'{name.capitalize()}:')

        # Compute the missing-value mask once and reuse it for count and percentage
        na_mask = df.isna()

        summary = pd.DataFrame(
            {
                "dtype": df.dtypes,
                "total": df.count(),
                "missing_n": na_mask.sum(),
                "missing_%": na_mask.mean() * 100,
                "uniques_n": df.nunique(),
                # One array per column, in column order, so it lines up with the index
                "uniques": [df[col].unique() for col in df.columns],
            }
        )
        display(summary)
        print("-"*130)

# Show the column-wise overview for every loaded DataFrame
overview(df_dict=dataframes)


Customers:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
customer_id,object,99441,0,0.0,99441,"[06b8999e2fba1a1fbc88172c00ba8bc7, 18955e83d33..."
customer_unique_id,object,99441,0,0.0,96096,"[861eff4711a542e4b93843c6dd7febb0, 290c77bc529..."
customer_zip_code_prefix,int64,99441,0,0.0,14994,"[14409, 9790, 1151, 8775, 13056, 89254, 4534, ..."
customer_city,object,99441,0,0.0,4119,"[franca, sao bernardo do campo, sao paulo, mog..."
customer_state,object,99441,0,0.0,27,"[SP, SC, MG, PR, RJ, RS, PA, GO, ES, BA, MA, M..."


----------------------------------------------------------------------------------------------------------------------------------
Geolocation:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
geolocation_zip_code_prefix,int64,1000163,0,0.0,19015,"[1037, 1046, 1041, 1035, 1012, 1047, 1013, 102..."
geolocation_lat,float64,1000163,0,0.0,717360,"[-23.54562128115268, -23.54608112703553, -23.5..."
geolocation_lng,float64,1000163,0,0.0,717613,"[-46.63929204800168, -46.64482029837157, -46.6..."
geolocation_city,object,1000163,0,0.0,8011,"[sao paulo, são paulo, sao bernardo do campo, ..."
geolocation_state,object,1000163,0,0.0,27,"[SP, RN, AC, RJ, ES, MG, BA, SE, PE, AL, PB, C..."


----------------------------------------------------------------------------------------------------------------------------------
Orders:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,99441,0,0.0,99441,"[e481f51cbdc54678b7cc49136f2d6af7, 53cdb2fc8bc..."
customer_id,object,99441,0,0.0,99441,"[9ef432eb6251297304e76186b10a928d, b0830fb4747..."
order_status,object,99441,0,0.0,8,"[delivered, invoiced, shipped, processing, una..."
order_purchase_timestamp,object,99441,0,0.0,98875,"[2017-10-02 10:56:33, 2018-07-24 20:41:37, 201..."
order_approved_at,object,99281,160,0.16,90733,"[2017-10-02 11:07:15, 2018-07-26 03:24:27, 201..."
order_delivered_carrier_date,object,97658,1783,1.79,81018,"[2017-10-04 19:55:00, 2018-07-26 14:31:00, 201..."
order_delivered_customer_date,object,96476,2965,2.98,95664,"[2017-10-10 21:25:13, 2018-08-07 15:27:45, 201..."
order_estimated_delivery_date,object,99441,0,0.0,459,"[2017-10-18 00:00:00, 2018-08-13 00:00:00, 201..."


----------------------------------------------------------------------------------------------------------------------------------
Items:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,112650,0,0.0,98666,"[00010242fe8c5a6d1ba2dd792cb16214, 00018f77f2f..."
order_item_id,int64,112650,0,0.0,21,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
product_id,object,112650,0,0.0,32951,"[4244733e06e7ecb4970a6e2683c13e61, e5f2d52b802..."
seller_id,object,112650,0,0.0,3095,"[48436dade18ac8b2bce089ec2a041202, dd7ddc04e1b..."
shipping_limit_date,object,112650,0,0.0,93318,"[2017-09-19 09:45:35, 2017-05-03 11:05:13, 201..."
price,float64,112650,0,0.0,5968,"[58.9, 239.9, 199.0, 12.99, 199.9, 21.9, 19.9,..."
freight_value,float64,112650,0,0.0,6999,"[13.29, 19.93, 17.87, 12.79, 18.14, 12.69, 11...."


----------------------------------------------------------------------------------------------------------------------------------
Payments:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,103886,0,0.0,99440,"[b81ef226f3fe1789b1e8b2acac839d17, a9810da8291..."
payment_sequential,int64,103886,0,0.0,29,"[1, 2, 4, 5, 3, 8, 6, 7, 10, 11, 17, 19, 27, 1..."
payment_type,object,103886,0,0.0,5,"[credit_card, boleto, voucher, debit_card, not..."
payment_installments,int64,103886,0,0.0,24,"[8, 1, 2, 3, 6, 5, 4, 10, 7, 12, 9, 13, 15, 24..."
payment_value,float64,103886,0,0.0,29077,"[99.33, 24.39, 65.71, 107.78, 128.45, 96.12, 8..."


----------------------------------------------------------------------------------------------------------------------------------
Reviews:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
review_id,object,99224,0,0.0,98410,"[7bc2406110b926393aa56f80a40eba40, 80e641a11e5..."
order_id,object,99224,0,0.0,98673,"[73fc7af87114b39712e6da79b0a377eb, a548910a1c6..."
review_score,int64,99224,0,0.0,5,"[4, 5, 1, 3, 2]"
review_comment_title,object,11568,87656,88.34,4527,"[nan, recomendo, Super recomendo, Não chegou m..."
review_comment_message,object,40977,58247,58.7,36159,"[nan, Recebi bem antes do prazo estipulado., P..."
review_creation_date,object,99224,0,0.0,636,"[2018-01-18 00:00:00, 2018-03-10 00:00:00, 201..."
review_answer_timestamp,object,99224,0,0.0,98248,"[2018-01-18 21:46:59, 2018-03-11 03:05:13, 201..."


----------------------------------------------------------------------------------------------------------------------------------
Products:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
product_id,object,32951,0,0.0,32951,"[1e9e8ef04dbcff4541ed26657ea517e5, 3aa071139cb..."
product_category_name,object,32341,610,1.85,73,"[perfumaria, artes, esporte_lazer, bebes, util..."
product_name_lenght,float64,32341,610,1.85,66,"[40.0, 44.0, 46.0, 27.0, 37.0, 60.0, 56.0, 57...."
product_description_lenght,float64,32341,610,1.85,2960,"[287.0, 276.0, 250.0, 261.0, 402.0, 745.0, 127..."
product_photos_qty,float64,32341,610,1.85,19,"[1.0, 4.0, 2.0, 3.0, 5.0, 9.0, 6.0, nan, 7.0, ..."
product_weight_g,float64,32949,2,0.01,2204,"[225.0, 1000.0, 154.0, 371.0, 625.0, 200.0, 18..."
product_length_cm,float64,32949,2,0.01,99,"[16.0, 30.0, 18.0, 26.0, 20.0, 38.0, 70.0, 40...."
product_height_cm,float64,32949,2,0.01,102,"[10.0, 18.0, 9.0, 4.0, 17.0, 5.0, 24.0, 8.0, 1..."
product_width_cm,float64,32949,2,0.01,95,"[14.0, 20.0, 15.0, 26.0, 13.0, 11.0, 44.0, 40...."


----------------------------------------------------------------------------------------------------------------------------------
Sellers:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
seller_id,object,3095,0,0.0,3095,"[3442f8959a84dea7ee197c632cb2df15, d1b65fc7deb..."
seller_zip_code_prefix,int64,3095,0,0.0,2246,"[13023, 13844, 20031, 4195, 12914, 20920, 5532..."
seller_city,object,3095,0,0.0,611,"[campinas, mogi guacu, rio de janeiro, sao pau..."
seller_state,object,3095,0,0.0,23,"[SP, RJ, PE, PR, GO, SC, BA, DF, RS, MG, RN, M..."


----------------------------------------------------------------------------------------------------------------------------------
Translation:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
product_category_name,object,71,0,0.0,71,"[beleza_saude, informatica_acessorios, automot..."
product_category_name_english,object,71,0,0.0,71,"[health_beauty, computers_accessories, auto, b..."


----------------------------------------------------------------------------------------------------------------------------------


In [None]:
# Generate dbdiagram.io-compatible table definitions from DataFrames
def generate_er_schema(df_dict=None):
    """
    Generate table definitions in dbdiagram.io format from a dictionary of DataFrames.

    For each DataFrame:
    - Converts pandas dtypes to SQL-style types (int, decimal, varchar, boolean, timestamp)
    - Outputs formatted table definitions ready to paste into dbdiagram.io

    Type resolution:
    1. Exact dtype-name lookup (int64, float64, object, bool, datetime64[ns]).
    2. Otherwise by dtype family, so e.g. int32 / Int64 -> int, float32 -> decimal,
       nullable boolean -> boolean, timezone-aware or non-nanosecond datetimes -> timestamp.
    3. Anything else falls back to varchar.

    Parameters:
    df_dict (dict | None): A dictionary where keys are table names and values are
        pandas DataFrames. When omitted (None), the notebook-level `dataframes`
        dictionary is used; it is looked up at call time.

    Returns:
    None (prints output to console)
    """
    if df_dict is None:
        df_dict = dataframes

    dtype_map = {
        "int64": "int",
        "float64": "decimal",
        "object": "varchar",
        "bool": "boolean",
        "datetime64[ns]": "timestamp",
    }

    def _sql_type(dtype):
        """Map a single pandas dtype to its SQL-style type name."""
        exact = dtype_map.get(str(dtype))
        if exact is not None:
            return exact
        # Family checks cover dtype variants the exact-name table does not list
        if pd.api.types.is_bool_dtype(dtype):
            return "boolean"
        if pd.api.types.is_integer_dtype(dtype):
            return "int"
        if pd.api.types.is_float_dtype(dtype):
            return "decimal"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "timestamp"
        return "varchar"  # fallback to varchar if unknown

    for name, df in df_dict.items():
        print(f"Table {name} {{")
        # Iterating df.dtypes (rather than df[col]) also works with duplicate column labels
        for col, dtype in df.dtypes.items():
            print(f"  {col} {_sql_type(dtype)}")
        print("}\n")

# Print dbdiagram.io table definitions for every loaded DataFrame
generate_er_schema(df_dict=dataframes)

Table customers {
  customer_id varchar
  customer_unique_id varchar
  customer_zip_code_prefix int
  customer_city varchar
  customer_state varchar
}

Table geolocation {
  geolocation_zip_code_prefix int
  geolocation_lat decimal
  geolocation_lng decimal
  geolocation_city varchar
  geolocation_state varchar
}

Table orders {
  order_id varchar
  customer_id varchar
  order_status varchar
  order_purchase_timestamp varchar
  order_approved_at varchar
  order_delivered_carrier_date varchar
  order_delivered_customer_date varchar
  order_estimated_delivery_date varchar
}

Table items {
  order_id varchar
  order_item_id int
  product_id varchar
  seller_id varchar
  shipping_limit_date varchar
  price decimal
  freight_value decimal
}

Table payments {
  order_id varchar
  payment_sequential int
  payment_type varchar
  payment_installments int
  payment_value decimal
}

Table reviews {
  review_id varchar
  order_id varchar
  review_score int
  review_comment_title varchar
  review_c

In [None]:
# Count repeated key values in the reviews table: first review_id, then order_id
for key_column in ("review_id", "order_id"):
    repeated_keys = reviews.duplicated(subset=key_column)
    print(repeated_keys.sum())

814
551
