In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Configure pandas display options so notebook output is easier to read
# Optional: uncomment to show full column contents instead of truncating them
# pd.set_option("display.max_colwidth", None)
# Render floats with a thousands separator and two decimal places
pd.options.display.float_format = "{:,.2f}".format

## **Data Overview**

### Load and explore all datasets

➤ Load all raw CSV files into individual DataFrames and store them in a dictionary for easier handling and quick access during exploration.

In [None]:
# Read every CSV file of the Brazilian E-Commerce dataset from the raw data folder
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)

# Load each table once and keep it in a dictionary keyed by a short table name,
# which makes looping over and inspecting all tables straightforward
dataframes_raw = {
    "customers": pd.read_csv(f"{data_path}olist_customers_dataset.csv"),
    "geolocation": pd.read_csv(f"{data_path}olist_geolocation_dataset.csv"),
    "orders": pd.read_csv(f"{data_path}olist_orders_dataset.csv"),
    "items": pd.read_csv(f"{data_path}olist_order_items_dataset.csv"),
    "payments": pd.read_csv(f"{data_path}olist_order_payments_dataset.csv"),
    "reviews": pd.read_csv(f"{data_path}olist_order_reviews_dataset.csv"),
    "products": pd.read_csv(f"{data_path}olist_products_dataset.csv"),
    "sellers": pd.read_csv(f"{data_path}olist_sellers_dataset.csv"),
    "translation": pd.read_csv(f"{data_path}product_category_name_translation.csv"),
}

# Also expose each table under its own name (same objects as in the dictionary)
customers_raw = dataframes_raw["customers"]
geolocation_raw = dataframes_raw["geolocation"]
orders_raw = dataframes_raw["orders"]
items_raw = dataframes_raw["items"]
payments_raw = dataframes_raw["payments"]
reviews_raw = dataframes_raw["reviews"]
products_raw = dataframes_raw["products"]
sellers_raw = dataframes_raw["sellers"]
translation_raw = dataframes_raw["translation"]


The following tables are included in the Brazilian E-Commerce dataset:

- `customers`: customer information  
- `geolocation`: geographical coordinates by zip code prefix  
- `orders`: order details including status and timestamps  
- `items`: product-level details for each order  
- `payments`: payment methods, amounts and installment information 
- `reviews`: customer reviews and ratings  
- `products`: product attributes including category and dimensions
- `sellers`: seller information  
- `translation`: Portuguese-to-English product category mapping 

*Note: For ease of use, the tables are referred to by short identifiers such as `customers` instead of their original file names (e.g., `olist_customers_dataset.csv`).*

➤  Summary of all tables using `.shape`, column names, and duplicate counts.

In [None]:
# Define a function to summarize a dictionary of DataFrames
def summarize_dataframes(df_dict=None):
    """
    Takes a dictionary of DataFrames and returns a summary DataFrame
        with the following information for each:
        - name: the key name from the dictionary
        - rows: number of rows in the DataFrame
        - columns: number of columns
        - column_names: a list of column names
        - duplicates: number of duplicated rows

    Parameters:
        df_dict (dict): Dictionary mapping table names to DataFrames.
                        When omitted, the notebook-level `dataframes_raw` is used.

    Returns:
        pd.DataFrame: One row per input DataFrame. An empty dictionary yields
                      an empty summary that still has the five columns above.
    """
    # Resolve the default at call time: a default bound in the signature is
    # evaluated once at definition and would keep pointing at the old dictionary
    # after the loading cell is re-run
    if df_dict is None:
        df_dict = dataframes_raw

    summary_columns = ["name", "rows", "columns", "column_names", "duplicates"]

    # One summary record per DataFrame in the dictionary
    summary = [
        {
            "name": name,
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": list(df.columns),
            "duplicates": df.duplicated().sum(),
        }
        for name, df in df_dict.items()
    ]

    # Passing the column list keeps the layout stable even for empty input
    return pd.DataFrame(summary, columns=summary_columns)


# Build the summary (shape, column names, duplicate count) for all loaded raw DataFrames.
# The returned DataFrame is the last expression of the cell, so the notebook renders it.
summarize_dataframes(dataframes_raw)

➤  Quick sampling of 5 rows from each table for visual inspection.

In [None]:
# Display a random sample of up to 5 rows from each DataFrame for a quick visual inspection
for name, df in dataframes_raw.items():
    print(f'{name.capitalize()}:')
    # Cap the sample size at the table length: DataFrame.sample raises a ValueError
    # when more rows are requested than exist (sampling without replacement)
    display(df.sample(n=min(5, len(df))))
    print("-"*130)

➤ Column-wise overview including dtypes, missing values, and unique counts.

In [None]:
# Quick overview of column properties (dtypes, missing values, uniques) for all DataFrames
def overview(df_dict=None):
    """
    Creates and displays a column-wise overview for each DataFrame in a dictionary.

    Parameters:
        df_dict (dict): A dictionary of DataFrames (e.g., {'orders': orders, ...}).
                        When omitted, the notebook-level `dataframes_raw` is used.

    Displays:
        For each DataFrame, a table with one row per column containing:
            - dtype: data type
            - total: non-null count
            - missing_n: missing value count
            - missing_%: missing value percentage
            - uniques_n: number of unique values (as counted by DataFrame.nunique)
            - uniques: the unique values themselves (as returned by Series.unique)
    """
    # Resolve the default at call time: a default bound in the signature is
    # evaluated once at definition and would keep pointing at the old dictionary
    # after the loading cell is re-run
    if df_dict is None:
        df_dict = dataframes_raw

    for name, df in df_dict.items():
        print(f'{name.capitalize()}:')

        # Compute the missing-value mask once and reuse it for count and percentage
        missing_mask = df.isna()

        summary = pd.DataFrame(
            {
                "dtype": df.dtypes,
                "total": df.count(),
                "missing_n": missing_mask.sum(),
                "missing_%": missing_mask.mean() * 100,
                "uniques_n": df.nunique(),
                # Plain list in column order; it lines up with the column-name
                # index provided by the Series above
                "uniques": [df[col].unique() for col in df.columns],
            }
        )
        display(summary)
        print("-"*130)

# Print and display the column-wise overview for every raw DataFrame
overview(dataframes_raw)


➤ Quick statistical overview of all numeric columns in each raw table to spot any unusual values or patterns.

In [None]:
# Summarize basic statistics of all numeric columns for each DataFrame in the dictionary
def describe_numeric_columns(df_dict=None):
    """
    Displays a transposed summary of descriptive statistics (.describe().T)
    for all numeric columns in each DataFrame within the given dictionary.

    Parameters:
    df_dict (dict): A dictionary where keys are table names and values are pandas DataFrames.
                    When omitted, the notebook-level `dataframes_raw` is used.

    Notes:
    - If the numeric part of a DataFrame is empty (no numeric columns, or no rows),
      a message is printed instead.
    - The output includes a visual summary using display() for easier inspection in notebooks.
    """
    # Resolve the default at call time: a default bound in the signature is
    # evaluated once at definition and would keep pointing at the old dictionary
    # after the loading cell is re-run
    if df_dict is None:
        df_dict = dataframes_raw

    for name, df in df_dict.items():
        print(f"{name.capitalize()}:")
        numeric_df = df.select_dtypes(include="number")

        if numeric_df.empty:
            print("No numeric columns to describe.")
        else:
            # Transpose so each numeric column becomes one row of statistics
            display(numeric_df.describe().T)

        print("-" * 130)


# Called without arguments, so the function's default (the raw DataFrames dictionary) is used
describe_numeric_columns()

## **Data Cleaning**

➤ Copy raw DataFrames into a new working dictionary to preserve the original data before cleaning.

In [None]:
# Build a working dictionary holding independent copies of the raw DataFrames
def copy_raw_dataframes(raw_dict, exclude=None):
    """
    Copies every DataFrame of a dictionary so that later cleaning steps
    leave the original (raw) data untouched.

    Parameters:
        raw_dict (dict): Dictionary containing raw DataFrames.
        exclude (list): Table names that should not be copied;
                        None (or any empty value) excludes nothing.

    Returns:
        dict: A new dictionary with a copy of each non-excluded DataFrame,
              in the same order as in raw_dict.
    """
    skipped = exclude if exclude else []

    return {
        table: frame.copy()
        for table, frame in raw_dict.items()
        if table not in skipped
    }


# Working copies for the cleaning steps; 'geolocation' is excluded because it is
# not used in the analysis
dataframes = copy_raw_dataframes(dataframes_raw, exclude=["geolocation"])

➤ Dropping unnecessary columns.

In [None]:
# Columns to remove per table, chosen according to the project scope.
# A value is either a single column name or a list of column names.
dropping_columns_dict = {
    # single column
    "customers": "customer_zip_code_prefix",
    # review text columns
    "reviews": [
        "review_comment_title",
        "review_comment_message",
    ],
    # product weight and dimension columns
    "products": [
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    ],
    # single column
    "sellers": "seller_zip_code_prefix",
}

In [None]:
# Remove the configured columns from each DataFrame, driven by a name -> columns mapping
def drop_columns(df_dict, drop_dict):
    """
    Removes the listed columns from the DataFrames of a dictionary.

    Parameters:
        df_dict (dict): Dictionary of DataFrames to be modified.
        drop_dict (dict): Dictionary mapping DataFrame names to the columns
                          that should be dropped (single string or list of strings).

    Modifies:
        The DataFrames in df_dict are changed in-place. Tables without an entry
        in drop_dict, and listed columns that a table does not have, are skipped.
    """
    for table, frame in df_dict.items():
        if table in drop_dict:
            requested = drop_dict[table]

            # Normalise a single column name to a one-element list
            targets = [requested] if isinstance(requested, str) else requested

            # Keep only columns that are actually present (each name once)
            present = [col for col in dict.fromkeys(targets) if col in frame.columns]

            if present:
                frame.drop(columns=present, inplace=True)

# Remove the configured columns from the working copies (the DataFrames are modified in place)
drop_columns(dataframes, dropping_columns_dict)

➤ Converting data types.

In [None]:
# Target data types per table and column.
# Used later to optimize memory and ensure correct formats (e.g., datetime, category).
# Columns sharing a target dtype are grouped with dict.fromkeys.
dtype_conversion_dict = {
    "customers": dict.fromkeys(["customer_city", "customer_state"], "category"),
    "orders": {
        "order_status": "category",
        **dict.fromkeys(
            [
                "order_purchase_timestamp",
                "order_approved_at",
                "order_delivered_carrier_date",
                "order_delivered_customer_date",
                "order_estimated_delivery_date",
            ],
            "datetime64[ns]",
        ),
    },
    "items": {"shipping_limit_date": "datetime64[ns]"},
    "payments": {"payment_type": "category"},
    "reviews": {
        "review_score": "category",
        **dict.fromkeys(
            ["review_creation_date", "review_answer_timestamp"],
            "datetime64[ns]",
        ),
    },
    "sellers": dict.fromkeys(["seller_city", "seller_state"], "category"),
    "products": {"product_category_name": "category"},
    "translation": dict.fromkeys(
        ["product_category_name", "product_category_name_english"],
        "category",
    ),
}


In [None]:
# Cast the configured columns of each DataFrame to their target data types
def apply_dtypes_conversions(df_dict, conversion_dict):
    """
    Converts selected columns to new data types for every DataFrame in a dictionary.

    Parameters:
        df_dict (dict): A dictionary where keys are table names and values are pandas DataFrames.
        conversion_dict (dict): A nested dictionary {table name: {column: target dtype}}.

    Notes:
    - The DataFrames are modified in place; nothing is returned.
    - Tables missing from conversion_dict and columns missing from a table are skipped.
    - Useful for memory optimization (e.g., converting to 'category') and for parsing dates properly.
    """
    for table, frame in df_dict.items():
        if table in conversion_dict:
            # Restrict the mapping to columns this table actually has
            applicable = {
                column: target
                for column, target in conversion_dict[table].items()
                if column in frame.columns
            }

            for column, target in applicable.items():
                frame[column] = frame[column].astype(target)

# Apply the configured dtype conversions to the working copies (the DataFrames are modified in place)
apply_dtypes_conversions(dataframes, dtype_conversion_dict)