In [19]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# Configure pandas display settings so tabular output is easier to read
# (optional: uncomment the next line to stop pandas from truncating long cell contents)
#pd.set_option("display.max_colwidth", None)
# Render floats with thousands separators and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

## **Data Overview**

### Load and explore all datasets

➤ Load all raw CSV files into individual DataFrames and store them in a dictionary for easier handling and quick access during exploration.

In [21]:
# Load every CSV file of the Brazilian E-Commerce dataset into its own DataFrame
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)  # directory listing; not referenced again within this cell

# Short table name -> CSV file name inside `data_path`
raw_csv_files = {
    "customers": "olist_customers_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "items": "olist_order_items_dataset.csv",
    "payments": "olist_order_payments_dataset.csv",
    "reviews": "olist_order_reviews_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "translation": "product_category_name_translation.csv",
}

# Read all tables into one dictionary for easier looping and inspection
dataframes_raw = {
    table: pd.read_csv(data_path + csv_name)
    for table, csv_name in raw_csv_files.items()
}

# Individual names that point at the very same DataFrame objects held in the dictionary
customers_raw = dataframes_raw["customers"]
geolocation_raw = dataframes_raw["geolocation"]
orders_raw = dataframes_raw["orders"]
items_raw = dataframes_raw["items"]
payments_raw = dataframes_raw["payments"]
reviews_raw = dataframes_raw["reviews"]
products_raw = dataframes_raw["products"]
sellers_raw = dataframes_raw["sellers"]
translation_raw = dataframes_raw["translation"]


The following tables are included in the Brazilian E-Commerce dataset:

- `customers`: customer information  
- `geolocation`: geographical coordinates by zip code prefix  
- `orders`: order details including status and timestamps  
- `items`: product-level details for each order  
- `payments`: payment methods, amounts and installment information 
- `reviews`: customer reviews and ratings  
- `products`: product attributes including category and dimensions
- `sellers`: seller information  
- `translation`: Portuguese-to-English product category mapping 

*Note: The original files (e.g. `olist_customers_dataset.csv`) keep their names on disk; within this notebook each table is referred to by a shorter identifier such as `customers` for ease of use.*

➤  Summary of all tables using `.shape`, column names, and duplicate counts.

In [22]:
# Build a one-row-per-table summary for a dictionary of DataFrames
def summarize_dataframes(df_dict=dataframes_raw):
    """
    Summarize every DataFrame in a dictionary.

    Parameters:
        df_dict (dict): Maps a table name to a pandas DataFrame.
            Defaults to the `dataframes_raw` dictionary as it existed
            when this function was defined.

    Returns:
        pandas.DataFrame: One row per entry with the columns
            - name: the dictionary key
            - rows: number of rows
            - columns: number of columns
            - column_names: list of the column labels
            - duplicates: number of fully duplicated rows
    """
    records = [
        {
            "name": table_name,
            "rows": frame.shape[0],
            "columns": frame.shape[1],
            "column_names": list(frame.columns),
            "duplicates": frame.duplicated().sum(),
        }
        for table_name, frame in df_dict.items()
    ]
    return pd.DataFrame(records)


# Show the summary for all loaded raw tables
summarize_dataframes(dataframes_raw)

Unnamed: 0,name,rows,columns,column_names,duplicates
0,customers,99441,5,"[customer_id, customer_unique_id, customer_zip...",0
1,geolocation,1000163,5,"[geolocation_zip_code_prefix, geolocation_lat,...",261831
2,orders,99441,8,"[order_id, customer_id, order_status, order_pu...",0
3,items,112650,7,"[order_id, order_item_id, product_id, seller_i...",0
4,payments,103886,5,"[order_id, payment_sequential, payment_type, p...",0
5,reviews,99224,7,"[review_id, order_id, review_score, review_com...",0
6,products,32951,9,"[product_id, product_category_name, product_na...",0
7,sellers,3095,4,"[seller_id, seller_zip_code_prefix, seller_cit...",0
8,translation,71,2,"[product_category_name, product_category_name_...",0


➤  Quick sampling of 5 rows from each table for visual inspection.

In [23]:
# Display a random sample of (up to) 5 rows from each DataFrame for a quick visual inspection
for name, df in dataframes_raw.items():
    print(f'{name.capitalize()}:')
    # A fixed random_state keeps the preview identical across "Restart & Run All";
    # capping n at len(df) avoids the ValueError sample() raises for tables with fewer than 5 rows.
    display(df.sample(n=min(5, len(df)), random_state=42))
    print("-"*130)

Customers:


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
12869,149ac0fdc3fc6ed91440c95c1a92736b,f1bee02ceae8a430506b798786f766b4,60170,fortaleza,CE
67812,3678034a0428898b73d8c6a2b353e769,7f71136093e2c5653179b2380e1cb00b,53433,paulista,PE
12260,294f90edf9d7f04ae89f4af0f2b64ed6,4f66517d8ac49edf81c5a6b15782d6bc,28735,quissama,RJ
79700,0505fd0a593f5eb5078c380b6761e43b,3da044832d726bc92b3d898b249aab53,95181,farroupilha,RS
4392,3e2fe6f0fa65fc4802dc9b29141f3a07,645f3682208dc4f2b7d0c935177551e2,36320,prados,MG


----------------------------------------------------------------------------------------------------------------------------------
Geolocation:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
193531,7840,-23.32,-46.73,franco da rocha,SP
421455,21040,-22.86,-43.25,rio de janeiro,RJ
805837,75670,-17.98,-48.64,marzagão,GO
684250,44095,-12.29,-38.94,feira de santana,BA
469582,24020,-22.89,-43.12,niteroi,RJ


----------------------------------------------------------------------------------------------------------------------------------
Orders:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
2174,95ca5e38f67d0f792b4fa38683682a0d,58283a66f78970a1137fdea5a4faba2a,delivered,2017-08-14 16:23:54,2017-08-15 03:45:38,2017-08-15 20:17:08,2017-08-26 14:47:59,2017-09-06 00:00:00
70557,b4fc77bdadd28d0ee631208831bc0d49,55b963cc1b4611c655ed0dab98bc4959,delivered,2017-03-13 00:26:12,2017-03-13 00:26:12,2017-03-13 10:20:59,2017-03-20 13:44:58,2017-04-07 00:00:00
17917,9d7ab082d6d8a4b069683e5a77db1261,ff6ed6bbecff417c54e415deed2caa67,delivered,2017-08-30 10:11:34,2017-08-31 02:35:52,2017-09-01 20:55:45,2017-09-11 18:23:33,2017-10-09 00:00:00
48159,c81ac01b43070a78c7039041dd0578af,c28f25ff63f6e64318f04142b8220093,delivered,2017-08-01 22:40:02,2017-08-01 22:55:13,2017-08-07 17:57:49,2017-08-17 20:22:23,2017-08-25 00:00:00
92087,05434c77e67f2531d6b5a6c5f8bce535,abadbcbbb0bc8c1f6820c9fcb7ce8077,delivered,2018-08-08 17:57:14,2018-08-08 18:10:18,2018-08-09 12:42:00,2018-08-15 20:27:34,2018-08-21 00:00:00


----------------------------------------------------------------------------------------------------------------------------------
Items:


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
8909,1462290799412b71be32dd880eaf4e1b,1,d7faab3fa0091d1220a8ada9cae1bab3,3504c0cb71d7fa48d967e0e4c94d59d9,2017-08-28 04:10:55,29.9,14.1
5327,0c16e6184d4cf8be09b57e667eb67bcc,1,47f699d9462e071f977f612be2b5b67a,56642bcb79900e777d68e91915cb4267,2018-02-15 18:55:26,170.0,12.39
62242,8e1ac730ea9bca963b591024346e0679,1,ed996a90597b8541c387ce93622ac44c,6a8b085f816a1f75f92dbac6eb545f8f,2017-09-13 02:50:39,61.5,15.18
30834,45f7d9a94705872dad55bc0c688f9cfd,2,167b4b8c4bd0c401bea62f5e050d70a4,25c5c91f63607446a97b143d2d535d31,2018-04-10 14:15:17,110.0,36.1
32808,4a5a57c482bf855e998fb45fc1173c83,1,10717ff440b2320081989126e858b220,00ee68308b45bc5e2660cd833c3f81cc,2018-01-24 23:18:27,138.0,12.17


----------------------------------------------------------------------------------------------------------------------------------
Payments:


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
76325,bb24f9658de05baa4397140cb9116911,1,credit_card,5,282.5
31384,cb383b52e0b0fc081bddb3bd8a5c5373,1,credit_card,1,68.73
24123,85c8b9409449608b5f5c6c0a66c9146a,1,credit_card,1,439.33
31167,622313cbbf2c6fb616025bc038e3bba6,1,credit_card,2,979.45
99870,c20f5cf8451a562b0fc50dd48fc8d68a,1,credit_card,10,668.04


----------------------------------------------------------------------------------------------------------------------------------
Reviews:


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
84284,1956d4e13dce5a820f1d0e53d22e7a5c,41944cf624269caf43e02d939e33e8bb,1,Horrível,O produto é desproporcional e de péssima quali...,2018-06-07 00:00:00,2018-06-08 11:40:14
49631,7f906c60d59c8e0548a7775f8e86442f,f81bd34ec4dd1f36d89fe966cf87c4d7,5,,,2018-08-07 00:00:00,2018-08-07 19:43:42
63238,f1f1eecccb02d41af99b2ee9df976969,a9e8723ce1d27e389ff1e5ef7a71502b,5,Ótimo,Loja muito responsável e o smartphone é excele...,2018-08-24 00:00:00,2018-08-25 09:32:51
76802,a3ca5d9bdf7bcd904c56e03b4dbf1ad5,ea63989166a2d459220b1b769609582a,4,,,2018-03-29 00:00:00,2018-03-30 19:43:15
66879,7057f947741947a0a6a42cf3c174035c,419a5db6d2b2929fde6db58ea884dcb0,5,,Chegou antes do prazo. Tudo certinho! Obrigada.,2017-11-12 00:00:00,2017-11-12 20:54:59


----------------------------------------------------------------------------------------------------------------------------------
Products:


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
25435,d1040cea3114b6e9f7e2170dc39af1db,perfumaria,42.0,367.0,2.0,250.0,21.0,4.0,15.0
6143,6bc7beedbbaefd0dc541f15678e48608,utilidades_domesticas,53.0,1564.0,1.0,1100.0,16.0,29.0,11.0
29694,01604238fbc11ce379e537a76b26bc94,eletronicos,29.0,510.0,1.0,125.0,21.0,17.0,17.0
10450,e344e7fbb51b08fbb06076b02f457dd9,informatica_acessorios,56.0,1548.0,1.0,167.0,17.0,8.0,14.0
15411,68fda662b16d76fbfc062c124171a29d,beleza_saude,58.0,161.0,1.0,200.0,25.0,6.0,15.0


----------------------------------------------------------------------------------------------------------------------------------
Sellers:


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
880,d598f929fc44e1e38678e7f47250ec04,73801,formosa,GO
1796,41b86b552e54e3a7009596125aa8b167,2470,sao paulo,SP
2023,9d7a1d34a5052409006425275ba1c2b4,14403,franca,SP
204,6650fcccd8cd2f7e55ffa524f30c4c59,8490,sao paulo,SP
1484,e9c6969d40e6a3d2d0f90013165c2b98,3448,sao paulo,SP


----------------------------------------------------------------------------------------------------------------------------------
Translation:


Unnamed: 0,product_category_name,product_category_name_english
15,telefonia_fixa,fixed_telephony
63,portateis_casa_forno_e_cafe,small_appliances_home_oven_and_coffee
41,instrumentos_musicais,musical_instruments
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes
58,bebidas,drinks


----------------------------------------------------------------------------------------------------------------------------------


➤ Column-wise overview including dtypes, missing values, and unique counts.

In [24]:
# Column-level overview (dtype, missing values, unique values) for every DataFrame
def overview(df_dict=dataframes_raw):
    """
    Print and display a per-column overview table for each DataFrame in a dictionary.

    Parameters:
        df_dict (dict): A dictionary of DataFrames (e.g., {'orders': orders, ...}).
            Defaults to the `dataframes_raw` dictionary as it existed when this
            function was defined.

    Displays:
        For each DataFrame, one row per column with:
            - dtype: data type
            - total: non-null count
            - missing_n: number of missing values
            - missing_%: share of missing values in percent
            - uniques_n: number of distinct values (missing values not counted)
            - uniques: array of the distinct values

    Returns:
        None. Output is produced via print() and the notebook's display().
    """
    separator = "-"*130

    for table_name, frame in df_dict.items():
        print(f'{table_name.capitalize()}:')

        # Compute the null mask once and reuse it for both the count and the percentage
        null_mask = frame.isna()
        column_report = pd.DataFrame(
            {
                "dtype": frame.dtypes,
                "total": frame.count(),
                "missing_n": null_mask.sum(),
                "missing_%": null_mask.mean() * 100,
                "uniques_n": frame.nunique(),
                "uniques": [frame[column].unique() for column in frame.columns],
            }
        )

        display(column_report)
        print(separator)

overview(dataframes_raw)


Customers:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
customer_id,object,99441,0,0.0,99441,"[06b8999e2fba1a1fbc88172c00ba8bc7, 18955e83d33..."
customer_unique_id,object,99441,0,0.0,96096,"[861eff4711a542e4b93843c6dd7febb0, 290c77bc529..."
customer_zip_code_prefix,int64,99441,0,0.0,14994,"[14409, 9790, 1151, 8775, 13056, 89254, 4534, ..."
customer_city,object,99441,0,0.0,4119,"[franca, sao bernardo do campo, sao paulo, mog..."
customer_state,object,99441,0,0.0,27,"[SP, SC, MG, PR, RJ, RS, PA, GO, ES, BA, MA, M..."


----------------------------------------------------------------------------------------------------------------------------------
Geolocation:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
geolocation_zip_code_prefix,int64,1000163,0,0.0,19015,"[1037, 1046, 1041, 1035, 1012, 1047, 1013, 102..."
geolocation_lat,float64,1000163,0,0.0,717360,"[-23.54562128115268, -23.54608112703553, -23.5..."
geolocation_lng,float64,1000163,0,0.0,717613,"[-46.63929204800168, -46.64482029837157, -46.6..."
geolocation_city,object,1000163,0,0.0,8011,"[sao paulo, são paulo, sao bernardo do campo, ..."
geolocation_state,object,1000163,0,0.0,27,"[SP, RN, AC, RJ, ES, MG, BA, SE, PE, AL, PB, C..."


----------------------------------------------------------------------------------------------------------------------------------
Orders:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,99441,0,0.0,99441,"[e481f51cbdc54678b7cc49136f2d6af7, 53cdb2fc8bc..."
customer_id,object,99441,0,0.0,99441,"[9ef432eb6251297304e76186b10a928d, b0830fb4747..."
order_status,object,99441,0,0.0,8,"[delivered, invoiced, shipped, processing, una..."
order_purchase_timestamp,object,99441,0,0.0,98875,"[2017-10-02 10:56:33, 2018-07-24 20:41:37, 201..."
order_approved_at,object,99281,160,0.16,90733,"[2017-10-02 11:07:15, 2018-07-26 03:24:27, 201..."
order_delivered_carrier_date,object,97658,1783,1.79,81018,"[2017-10-04 19:55:00, 2018-07-26 14:31:00, 201..."
order_delivered_customer_date,object,96476,2965,2.98,95664,"[2017-10-10 21:25:13, 2018-08-07 15:27:45, 201..."
order_estimated_delivery_date,object,99441,0,0.0,459,"[2017-10-18 00:00:00, 2018-08-13 00:00:00, 201..."


----------------------------------------------------------------------------------------------------------------------------------
Items:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,112650,0,0.0,98666,"[00010242fe8c5a6d1ba2dd792cb16214, 00018f77f2f..."
order_item_id,int64,112650,0,0.0,21,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
product_id,object,112650,0,0.0,32951,"[4244733e06e7ecb4970a6e2683c13e61, e5f2d52b802..."
seller_id,object,112650,0,0.0,3095,"[48436dade18ac8b2bce089ec2a041202, dd7ddc04e1b..."
shipping_limit_date,object,112650,0,0.0,93318,"[2017-09-19 09:45:35, 2017-05-03 11:05:13, 201..."
price,float64,112650,0,0.0,5968,"[58.9, 239.9, 199.0, 12.99, 199.9, 21.9, 19.9,..."
freight_value,float64,112650,0,0.0,6999,"[13.29, 19.93, 17.87, 12.79, 18.14, 12.69, 11...."


----------------------------------------------------------------------------------------------------------------------------------
Payments:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
order_id,object,103886,0,0.0,99440,"[b81ef226f3fe1789b1e8b2acac839d17, a9810da8291..."
payment_sequential,int64,103886,0,0.0,29,"[1, 2, 4, 5, 3, 8, 6, 7, 10, 11, 17, 19, 27, 1..."
payment_type,object,103886,0,0.0,5,"[credit_card, boleto, voucher, debit_card, not..."
payment_installments,int64,103886,0,0.0,24,"[8, 1, 2, 3, 6, 5, 4, 10, 7, 12, 9, 13, 15, 24..."
payment_value,float64,103886,0,0.0,29077,"[99.33, 24.39, 65.71, 107.78, 128.45, 96.12, 8..."


----------------------------------------------------------------------------------------------------------------------------------
Reviews:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
review_id,object,99224,0,0.0,98410,"[7bc2406110b926393aa56f80a40eba40, 80e641a11e5..."
order_id,object,99224,0,0.0,98673,"[73fc7af87114b39712e6da79b0a377eb, a548910a1c6..."
review_score,int64,99224,0,0.0,5,"[4, 5, 1, 3, 2]"
review_comment_title,object,11568,87656,88.34,4527,"[nan, recomendo, Super recomendo, Não chegou m..."
review_comment_message,object,40977,58247,58.7,36159,"[nan, Recebi bem antes do prazo estipulado., P..."
review_creation_date,object,99224,0,0.0,636,"[2018-01-18 00:00:00, 2018-03-10 00:00:00, 201..."
review_answer_timestamp,object,99224,0,0.0,98248,"[2018-01-18 21:46:59, 2018-03-11 03:05:13, 201..."


----------------------------------------------------------------------------------------------------------------------------------
Products:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
product_id,object,32951,0,0.0,32951,"[1e9e8ef04dbcff4541ed26657ea517e5, 3aa071139cb..."
product_category_name,object,32341,610,1.85,73,"[perfumaria, artes, esporte_lazer, bebes, util..."
product_name_lenght,float64,32341,610,1.85,66,"[40.0, 44.0, 46.0, 27.0, 37.0, 60.0, 56.0, 57...."
product_description_lenght,float64,32341,610,1.85,2960,"[287.0, 276.0, 250.0, 261.0, 402.0, 745.0, 127..."
product_photos_qty,float64,32341,610,1.85,19,"[1.0, 4.0, 2.0, 3.0, 5.0, 9.0, 6.0, nan, 7.0, ..."
product_weight_g,float64,32949,2,0.01,2204,"[225.0, 1000.0, 154.0, 371.0, 625.0, 200.0, 18..."
product_length_cm,float64,32949,2,0.01,99,"[16.0, 30.0, 18.0, 26.0, 20.0, 38.0, 70.0, 40...."
product_height_cm,float64,32949,2,0.01,102,"[10.0, 18.0, 9.0, 4.0, 17.0, 5.0, 24.0, 8.0, 1..."
product_width_cm,float64,32949,2,0.01,95,"[14.0, 20.0, 15.0, 26.0, 13.0, 11.0, 44.0, 40...."


----------------------------------------------------------------------------------------------------------------------------------
Sellers:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
seller_id,object,3095,0,0.0,3095,"[3442f8959a84dea7ee197c632cb2df15, d1b65fc7deb..."
seller_zip_code_prefix,int64,3095,0,0.0,2246,"[13023, 13844, 20031, 4195, 12914, 20920, 5532..."
seller_city,object,3095,0,0.0,611,"[campinas, mogi guacu, rio de janeiro, sao pau..."
seller_state,object,3095,0,0.0,23,"[SP, RJ, PE, PR, GO, SC, BA, DF, RS, MG, RN, M..."


----------------------------------------------------------------------------------------------------------------------------------
Translation:


Unnamed: 0,dtype,total,missing_n,missing_%,uniques_n,uniques
product_category_name,object,71,0,0.0,71,"[beleza_saude, informatica_acessorios, automot..."
product_category_name_english,object,71,0,0.0,71,"[health_beauty, computers_accessories, auto, b..."


----------------------------------------------------------------------------------------------------------------------------------


➤ Quick statistical overview of all numeric columns in each raw table to spot any unusual values or patterns.

In [25]:
# Descriptive statistics for the numeric columns of every DataFrame in the dictionary
def describe_numeric_columns(df_dict=dataframes_raw):
    """
    Show `.describe().T` for the numeric columns of each DataFrame in a dictionary.

    Parameters:
    df_dict (dict): A dictionary where keys are table names and values are pandas DataFrames.
        Defaults to the `dataframes_raw` dictionary as it existed when this function was defined.

    Notes:
    - When the numeric selection of a DataFrame is empty, a short message is printed instead.
    - Tables are rendered with the notebook's display(); nothing is returned.
    """
    for table_name, frame in df_dict.items():
        print(f"{table_name.capitalize()}:")

        numeric_part = frame.select_dtypes(include="number")
        if not numeric_part.empty:
            # Transposed so that each numeric column becomes one row of statistics
            display(numeric_part.describe().T)
        else:
            print("No numeric columns to describe.")

        print("-" * 130)


describe_numeric_columns()

Customers:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_zip_code_prefix,99441.0,35137.47,29797.94,1003.0,11347.0,24416.0,58900.0,99990.0


----------------------------------------------------------------------------------------------------------------------------------
Geolocation:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
geolocation_zip_code_prefix,1000163.0,36574.17,30549.34,1001.0,11075.0,26530.0,63504.0,99990.0
geolocation_lat,1000163.0,-21.18,5.72,-36.61,-23.6,-22.92,-19.98,45.07
geolocation_lng,1000163.0,-46.39,4.27,-101.47,-48.57,-46.64,-43.77,121.11


----------------------------------------------------------------------------------------------------------------------------------
Orders:
No numeric columns to describe.
----------------------------------------------------------------------------------------------------------------------------------
Items:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
order_item_id,112650.0,1.2,0.71,1.0,1.0,1.0,1.0,21.0
price,112650.0,120.65,183.63,0.85,39.9,74.99,134.9,6735.0
freight_value,112650.0,19.99,15.81,0.0,13.08,16.26,21.15,409.68


----------------------------------------------------------------------------------------------------------------------------------
Payments:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
payment_sequential,103886.0,1.09,0.71,1.0,1.0,1.0,1.0,29.0
payment_installments,103886.0,2.85,2.69,0.0,1.0,1.0,4.0,24.0
payment_value,103886.0,154.1,217.49,0.0,56.79,100.0,171.84,13664.08


----------------------------------------------------------------------------------------------------------------------------------
Reviews:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
review_score,99224.0,4.09,1.35,1.0,4.0,5.0,5.0,5.0


----------------------------------------------------------------------------------------------------------------------------------
Products:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
product_name_lenght,32341.0,48.48,10.25,5.0,42.0,51.0,57.0,76.0
product_description_lenght,32341.0,771.5,635.12,4.0,339.0,595.0,972.0,3992.0
product_photos_qty,32341.0,2.19,1.74,1.0,1.0,1.0,3.0,20.0
product_weight_g,32949.0,2276.47,4282.04,0.0,300.0,700.0,1900.0,40425.0
product_length_cm,32949.0,30.82,16.91,7.0,18.0,25.0,38.0,105.0
product_height_cm,32949.0,16.94,13.64,2.0,8.0,13.0,21.0,105.0
product_width_cm,32949.0,23.2,12.08,6.0,15.0,20.0,30.0,118.0


----------------------------------------------------------------------------------------------------------------------------------
Sellers:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seller_zip_code_prefix,3095.0,32291.06,32713.45,1001.0,7093.5,14940.0,64552.5,99730.0


----------------------------------------------------------------------------------------------------------------------------------
Translation:
No numeric columns to describe.
----------------------------------------------------------------------------------------------------------------------------------


In [26]:
# Generate dbdiagram.io-compatible table definitions from DataFrames
def generate_er_schema(df_dict=dataframes_raw):
    """
    Generate table definitions in dbdiagram.io format from a dictionary of DataFrames.

    For each DataFrame:
    - Converts pandas dtypes to SQL-style types (int, varchar, decimal, timestamp, etc.)
    - Outputs formatted table definitions ready to paste into dbdiagram.io

    The dtype is classified by family rather than by its exact name, so e.g.
    int32, nullable Int64, float32 and timezone-aware or non-nanosecond datetimes
    are mapped to int / decimal / timestamp instead of silently falling back to varchar.

    Parameters:
    df_dict (dict): A dictionary where keys are table names and values are pandas DataFrames.
        Defaults to the `dataframes_raw` dictionary as it existed when this function was defined.

    Returns:
    None (prints output to console)
    """

    # Ordered (predicate, SQL type) pairs; the first predicate that matches wins.
    type_checks = (
        (pd.api.types.is_bool_dtype, "boolean"),
        (pd.api.types.is_integer_dtype, "int"),
        (pd.api.types.is_float_dtype, "decimal"),
        (pd.api.types.is_datetime64_any_dtype, "timestamp"),
    )

    for name, df in df_dict.items():
        print(f"Table {name} {{")
        for col, dtype in df.dtypes.items():
            # Anything not recognised above (object/string, category, ...) falls back to varchar
            sql_type = next(
                (sql for matches, sql in type_checks if matches(dtype)),
                "varchar",
            )
            print(f"  {col} {sql_type}")
        print("}\n")

generate_er_schema(dataframes_raw)

Table customers {
  customer_id varchar
  customer_unique_id varchar
  customer_zip_code_prefix int
  customer_city varchar
  customer_state varchar
}

Table geolocation {
  geolocation_zip_code_prefix int
  geolocation_lat decimal
  geolocation_lng decimal
  geolocation_city varchar
  geolocation_state varchar
}

Table orders {
  order_id varchar
  customer_id varchar
  order_status varchar
  order_purchase_timestamp varchar
  order_approved_at varchar
  order_delivered_carrier_date varchar
  order_delivered_customer_date varchar
  order_estimated_delivery_date varchar
}

Table items {
  order_id varchar
  order_item_id int
  product_id varchar
  seller_id varchar
  shipping_limit_date varchar
  price decimal
  freight_value decimal
}

Table payments {
  order_id varchar
  payment_sequential int
  payment_type varchar
  payment_installments int
  payment_value decimal
}

Table reviews {
  review_id varchar
  order_id varchar
  review_score int
  review_comment_title varchar
  review_c

## **Data Cleaning**

➤ Copy raw DataFrames into a new working dictionary to preserve the original data before cleaning.

In [27]:
# Create a new dictionary with copies of all raw DataFrames
def copy_raw_dataframes(raw_dict, exclude=None):
    """
    Creates copies of raw DataFrames to preserve the original data before any cleaning steps.

    Parameters:
        raw_dict (dict): Dictionary containing raw DataFrames.
        exclude (str | list | None): Table name, or list of table names, to leave out
            of the result. None (the default) or an empty list copies everything.

    Returns:
        dict: A new dictionary with copies of the DataFrames, in the same key order
            as `raw_dict` minus the excluded names.
    """
    # A bare string must be treated as ONE table name. Without this, `name in exclude`
    # would be a substring test (e.g. "order" in "orders") and drop unintended tables.
    if isinstance(exclude, str):
        exclude = [exclude]
    excluded = exclude or []

    return {
        name: df.copy()
        for name, df in raw_dict.items()
        if name not in excluded
    }


# Build the working dictionary from copies, so the cleaning steps below do not touch `dataframes_raw`.
dataframes = copy_raw_dataframes(dataframes_raw, exclude=["geolocation"])  # exclude 'geolocation', which is not used in the analysis

➤ Dropping unnecessary columns.

In [28]:
# Columns to remove per table, based on project scope.
# A value is either a single column name (plain string) or a list of column names.
dropping_columns_dict = dict(
    customers="customer_zip_code_prefix",
    reviews=["review_comment_title", "review_comment_message"],
    products=[
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    ],
    sellers="seller_zip_code_prefix",
)

In [None]:
# Remove the configured columns from each DataFrame, driven by a name -> columns mapping
def drop_columns(df_dict, drop_dict):
    """
    Drops specified columns from DataFrames within a dictionary.

    Parameters:
        df_dict (dict): Dictionary of DataFrames to be modified.
        drop_dict (dict): Maps a DataFrame name to the column(s) to remove,
                          given either as a single string or as a list of strings.
                          Names that do not occur in df_dict are ignored.

    Modifies:
        The DataFrames in df_dict are changed in place. Columns that are not
        present in a DataFrame are skipped, so calling this repeatedly is harmless.

    Returns:
        None
    """
    for table_name, frame in df_dict.items():
        if table_name in drop_dict:
            requested = drop_dict[table_name]
            # Normalise to a list so a single column name and a collection are handled alike
            labels = [requested] if isinstance(requested, str) else list(requested)
            # errors="ignore" skips labels that are not columns of this DataFrame
            frame.drop(columns=labels, inplace=True, errors="ignore")

# Apply the drop mapping to the working copies; the DataFrames held in `dataframes` are modified in place.
drop_columns(dataframes, dropping_columns_dict)