In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Set display options for better readability in output
# Optional: uncomment the next line to show full cell contents instead of truncating wide columns
# pd.set_option("display.max_colwidth", None)
# Render floats with thousands separators and two decimal places
pd.set_option("display.float_format", '{:,.2f}'.format)

## **Data Overview**

### Load and explore all datasets

➤ Load all raw CSV files into individual DataFrames and store them in a dictionary for easier handling and quick access during exploration.

In [None]:
# Read every CSV file of the Brazilian E-Commerce dataset into its own DataFrame
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)

# Short table name -> raw CSV file name (insertion order is the load order)
csv_by_table = {
    "customers": "olist_customers_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "items": "olist_order_items_dataset.csv",
    "payments": "olist_order_payments_dataset.csv",
    "reviews": "olist_order_reviews_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "translation": "product_category_name_translation.csv",
}

# Keep all DataFrames in one dictionary for easier looping and inspection
dataframes_raw = {
    table: pd.read_csv(data_path + csv_name)
    for table, csv_name in csv_by_table.items()
}

# Individual names bound to the very same DataFrame objects held in the dictionary
customers_raw = dataframes_raw["customers"]
geolocation_raw = dataframes_raw["geolocation"]
orders_raw = dataframes_raw["orders"]
items_raw = dataframes_raw["items"]
payments_raw = dataframes_raw["payments"]
reviews_raw = dataframes_raw["reviews"]
products_raw = dataframes_raw["products"]
sellers_raw = dataframes_raw["sellers"]
translation_raw = dataframes_raw["translation"]


The following tables are included in the Brazilian E-Commerce dataset:

- `customers`: customer information  
- `geolocation`: geographical coordinates by zip code prefix  
- `orders`: order details including status and timestamps  
- `items`: product-level details for each order  
- `payments`: payment methods, amounts and installment information 
- `reviews`: customer reviews and ratings  
- `products`: product attributes including category and dimensions
- `sellers`: seller information  
- `translation`: Portuguese-to-English product category mapping 

*Note: The CSV files keep their original names (e.g. `olist_customers_dataset.csv`); in this notebook each table is simply referred to by a shorter identifier such as `customers` for ease of use.*

➤  Summary of all tables using `.shape`, column names, and duplicate counts.

In [None]:
# Define a function to summarize a dictionary of DataFrames
def summarize_dataframes(df_dict=None):
    """
    Summarize each DataFrame of a dictionary as one row of a summary DataFrame.

    Parameters:
        df_dict (dict, optional): Mapping of table name -> DataFrame. When omitted,
            the notebook-level `dataframes_raw` is looked up at call time, so the
            summary always reflects the most recently loaded data (a default bound
            in the signature would keep pointing at the dictionary that existed
            when this cell was executed).

    Returns:
        pd.DataFrame: One row per input DataFrame with the columns
            - name: the key name from the dictionary
            - rows: number of rows in the DataFrame
            - columns: number of columns
            - column_names: a list of column names
            - duplicates: number of duplicated rows
    """
    if df_dict is None:
        df_dict = dataframes_raw

    # Build one record per table
    summary = [
        {
            "name": name,
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": list(df.columns),
            "duplicates": df.duplicated().sum(),
        }
        for name, df in df_dict.items()
    ]

    # Passing the column list keeps the schema stable even for an empty dictionary
    return pd.DataFrame(
        summary,
        columns=["name", "rows", "columns", "column_names", "duplicates"],
    )


# Build and display the summary table for all raw DataFrames (the dictionary is passed explicitly)
summarize_dataframes(dataframes_raw)

➤  Quick sampling of 5 rows from each table for visual inspection.

In [None]:
# Display a reproducible random sample of up to 5 rows from each DataFrame for a quick visual inspection
for name, df in dataframes_raw.items():
    print(f'{name.capitalize()}:')
    # Cap the sample size at the table length so a table with fewer than 5 rows does not raise,
    # and fix the seed so that Restart & Run All shows the same rows every time
    display(df.sample(n=min(5, len(df)), random_state=42))
    print("-"*130)

➤ Column-wise overview including dtypes, missing values, and unique counts.

In [None]:
# Quick overview of column properties (dtypes, missing values, uniques) for all DataFrames
def overview(df_dict=None):
    """
    Creates and displays a column-wise overview for each DataFrame in a dictionary.

    Parameters:
        df_dict (dict, optional): A dictionary of DataFrames (e.g., {'orders': orders, ...}).
            When omitted, the notebook-level `dataframes_raw` is looked up at call
            time, so the overview always reflects the most recently loaded data.

    Displays:
        For each DataFrame, one row per column with:
            - dtype: data type
            - total: non-null count
            - missing_n: missing value count
            - missing_%: missing value percentage
            - uniques_n: number of unique values (missing values not counted)
            - uniques: array of the unique values

    Returns:
        None. The overview tables are rendered with display().
    """
    if df_dict is None:
        df_dict = dataframes_raw

    for name, df in df_dict.items():
        print(f'{name.capitalize()}:')
        # Compute the missing-value mask once and reuse it for count and percentage
        missing_mask = df.isna()
        summary = pd.DataFrame(
            {
                "dtype": df.dtypes,
                "total": df.count(),
                "missing_n": missing_mask.sum(),
                "missing_%": missing_mask.mean() * 100,
                "uniques_n": df.nunique(),
                # Plain list: matched to the column index by position
                "uniques": [df[col].unique() for col in df.columns],
            }
        )
        display(summary)
        print("-"*130)

# Display the column-wise overview for every raw DataFrame
overview(dataframes_raw)


➤ Quick statistical overview of all numeric columns in each raw table to spot any unusual values or patterns.

In [None]:
# Summarize basic statistics of all numeric columns for each DataFrame in the dictionary
def describe_numeric_columns(df_dict=None):
    """
    Displays a transposed summary of descriptive statistics (.describe().T)
    for all numeric columns in each DataFrame within the given dictionary.

    Parameters:
    df_dict (dict, optional): A dictionary where keys are table names and values are
        pandas DataFrames. When omitted, the notebook-level `dataframes_raw` is looked
        up at call time, so the statistics always reflect the most recently loaded data.

    Returns:
    None. Results are printed / rendered with display().

    Notes:
    - If a DataFrame has no numeric columns, a message is printed instead.
    - The output includes a visual summary using display() for easier inspection in notebooks.
    """
    if df_dict is None:
        df_dict = dataframes_raw

    for name, df in df_dict.items():
        print(f"{name.capitalize()}:")
        # Keep only the numeric columns; text and other dtypes are ignored
        numeric_df = df.select_dtypes(include="number")

        if numeric_df.empty:
            print("No numeric columns to describe.")
        else:
            # Transpose so that each numeric column becomes one row of statistics
            display(numeric_df.describe().T)

        print("-" * 130)


# Called without arguments, so the function's default dictionary (`dataframes_raw`) is used
describe_numeric_columns()

## **Data Cleaning**

➤ Copy raw DataFrames into a new working dictionary to preserve the original data before cleaning.

In [None]:
# Build a fresh dictionary that holds independent copies of the raw DataFrames
def copy_raw_dataframes(raw_dict, exclude=None):
    """
    Return copies of the raw DataFrames so that later cleaning steps leave the originals untouched.

    Parameters:
        raw_dict (dict): Dictionary mapping table names to raw DataFrames.
        exclude (list): Table names to leave out of the result; nothing is
            left out when this is None or empty.

    Returns:
        dict: A new dictionary (same key order as `raw_dict`) whose values are
            `.copy()` results of the DataFrames that were not excluded.
    """
    skipped = exclude if exclude else []
    return {
        table: frame.copy()
        for table, frame in raw_dict.items()
        if table not in skipped
    }


# Working copies used for cleaning; 'geolocation' is excluded because it is not used in the analysis
dataframes = copy_raw_dataframes(dataframes_raw, exclude=["geolocation"])

### Handling missing data

➤ Missing product-related information

In [None]:
# Take a separate copy of the 'products' table from the working dictionary, so the
# cleaning steps applied to `products` do not modify the DataFrame stored in `dataframes`
products = dataframes["products"].copy()

# Display the number of missing values in each column of the 'products' DataFrame
products.isna().sum()

In [None]:
# Count the number of rows where any important product-related column is missing
# NOTE(review): the 'lenght' spelling presumably mirrors the raw CSV column names — verify against the file header
products_missing_cols = [
    "product_category_name",
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty"
]

# Despite its name, this counts rows with AT LEAST ONE of the listed columns missing (.any(axis=1))
number_missing_all = products[products_missing_cols].isna().any(axis=1).sum()

# Print the number and percentage of affected rows
print(
    f"{number_missing_all} rows have missing values in key product-related columns "
    f"({number_missing_all / products.shape[0]:.1%} of the products table)."
)

In [None]:
# Drop rows where all key product-related columns are missing (category, name length, description length, and photo count).
# The result is reassigned instead of using inplace=True, so the step reads as an explicit transformation of `products`.
products = products.dropna(subset=products_missing_cols, how="all")

# Display the number of missing values remaining in each column
products.isna().sum()

Rows missing `product_category_name` (610 in total) are dropped, as this column is critical for analyzing product-level trends in customer satisfaction. Without it, meaningful grouping and interpretation are not possible.

These rows also lack values in other key descriptive columns — such as `product_name_lenght`, `product_description_lenght`, and `product_photos_qty` — which are relevant for understanding how product presentation may affect customer perception. Since these fields are simultaneously missing, and no reliable imputation method is available, we exclude these rows entirely.

Other missing fields, such as product dimensions and weight, are not directly relevant to our current analysis. However, to ensure the dataset remains reusable for future projects, we will impute the missing values for `product_weight_g`, `product_length_cm`, `product_height_cm`, and `product_width_cm` using median values from products in the same category and a similar price range.

In [None]:
# Define the columns that describe the product's physical dimensions and weight
size_cols = [
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# View rows where at least one of the size columns has missing values
products[products[size_cols].isna().any(axis=1)]

After dropping those rows, only one row in the `products` table still has missing values (`product_id` equal to `'09ff539a621711667c43eba6a3bd8466'`), and it is missing all four size-related columns. This product belongs to the `'bebe'` category. Although this is an isolated case, the following code is written in a generalized way so that it can be reused for other projects or datasets with more missing values.

In [None]:
# Merge median item price into the products table to enable price-based imputation
items = dataframes["items"].copy()
# One representative price per product: the median over all of its rows in the items table
price_by_product = items.groupby("product_id")["price"].median().reset_index()
# Left join keeps every product; a product without any row in `items` gets NaN as its price
products_price = products.merge(price_by_product, on="product_id", how="left")

In [None]:
# Create price bins (quartiles) within each product category, based on median price
# This helps identify similar products by both category and price level
# duplicates="drop" merges identical quantile edges, so a category may end up with fewer than 4 bins.
# NOTE(review): rows without a price, and categories where no bin can be formed, presumably get NaN here — confirm.
products_price["price_bin"] = products_price.groupby("product_category_name")[
    "price"
].transform(lambda x: pd.qcut(x, q=4, duplicates="drop"))

In [None]:
# Impute missing size values using the median for each (category, price_bin) group,
# falling back to the category-wide median where no price-bin median is available
for col in size_cols:
    # Primary estimate: median of the products in the same category and price bin
    group_medians = products_price.groupby(["product_category_name", "price_bin"])[
        col
    ].transform("median")
    # Fallback estimate: median of all products in the same category. Rows whose
    # 'price_bin' is NaN are left out of the grouping above (pandas excludes NaN group
    # keys by default) and would otherwise stay unimputed without any warning.
    category_medians = products_price.groupby("product_category_name")[col].transform(
        "median"
    )
    # Fill missing values with the most specific estimate that exists for the row
    products_price[col] = (
        products_price[col].fillna(group_medians).fillna(category_medians)
    )


In [None]:
# Print the number of missing values remaining in each column for verification
# (print() is used because only the last expression of a cell is displayed automatically).
# Note: 'price_bin' may have missing values if a product belongs to a category with only one product,
# since quartile-based binning (qcut) cannot be applied in such cases.
# NOTE(review): products that have no price after the left merge presumably also end up without a bin — confirm.
print(products_price.isna().sum())
# Verify that the one product with missing values was successfully imputed
products_price[products_price["product_id"] == '09ff539a621711667c43eba6a3bd8466']

In [None]:
# Remove the helper columns ('price', 'price_bin') that were only needed for the imputation
# and store the cleaned/imputed result back in 'products'
products = products_price.drop(columns=["price", "price_bin"])