In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Pandas display configuration for more readable output
# Optional: uncomment to stop long text columns from being truncated
#pd.set_option("display.max_colwidth", None)
# Render floats with thousands separators and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Read every CSV file of the Brazilian E-Commerce dataset into its own DataFrame
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)

# Table name -> CSV file name, listed in the order the files are read
csv_by_table = {
    'customers': "olist_customers_dataset.csv",
    'geolocation': "olist_geolocation_dataset.csv",
    'orders': "olist_orders_dataset.csv",
    'items': "olist_order_items_dataset.csv",
    'payments': "olist_order_payments_dataset.csv",
    'reviews': "olist_order_reviews_dataset.csv",
    'products': "olist_products_dataset.csv",
    'sellers': "olist_sellers_dataset.csv",
    'translation': "product_category_name_translation.csv",
}

# All DataFrames keyed by table name, for easier looping and inspection
dataframes = {
    table: pd.read_csv(data_path + csv_name)
    for table, csv_name in csv_by_table.items()
}

# Also expose each table under its own top-level name
customers = dataframes['customers']
geolocation = dataframes['geolocation']
orders = dataframes['orders']
items = dataframes['items']
payments = dataframes['payments']
reviews = dataframes['reviews']
products = dataframes['products']
sellers = dataframes['sellers']
translation = dataframes['translation']

In [None]:
# Define a function to summarize a dictionary of DataFrames
def summarize_dataframes(df_dict=None):
    """
    Build a one-row-per-table summary of a dictionary of DataFrames.

    Parameters:
        df_dict (dict | None): Mapping of table name -> DataFrame. When omitted,
            the module-level `dataframes` dictionary is looked up at call time.

    Returns:
        pd.DataFrame: One row per dictionary entry with the columns
            - name: the key name from the dictionary
            - rows: number of rows in the DataFrame
            - columns: number of columns
            - column_names: a list of column names
            - duplicates: number of rows that repeat an earlier row
        An empty dictionary yields an empty frame that still has these columns.
    """
    # Resolve the default lazily. A `df_dict=dataframes` default would be bound
    # once at definition time, so the function could not be defined before
    # `dataframes` exists and would keep using the old dict after a rebind.
    if df_dict is None:
        df_dict = dataframes

    summary_columns = ["name", "rows", "columns", "column_names", "duplicates"]

    records = [
        {
            "name": name,
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": list(df.columns),
            "duplicates": df.duplicated().sum(),
        }
        for name, df in df_dict.items()
    ]

    # Passing `columns` keeps the schema stable even when there are no records
    return pd.DataFrame(records, columns=summary_columns)


# Summarize all loaded DataFrames; the returned table is this cell's output
summarize_dataframes(df_dict=dataframes)

In [None]:
# Display a random sample of up to 5 rows from each DataFrame for a quick visual inspection
SAMPLE_ROWS = 5
SAMPLE_SEED = 42  # fixed seed so a full re-run of the notebook shows the same rows

for name, df in dataframes.items():
    print(f'{name.capitalize()}:')
    # Cap the sample at the table length: DataFrame.sample raises ValueError
    # when asked for more rows than exist (sampling without replacement).
    display(df.sample(n=min(SAMPLE_ROWS, len(df)), random_state=SAMPLE_SEED))
    print("-"*130)

In [None]:
# Quick overview of column properties (dtypes, missing values, uniques) for all DataFrames
def overview(df_dict=None):
    """
    Creates and displays a column-wise overview for each DataFrame in a dictionary.

    Parameters:
        df_dict (dict | None): A dictionary of DataFrames (e.g., {'orders': orders, ...}).
            When omitted, the module-level `dataframes` dictionary is looked up
            at call time.

    Returns:
        None. For each DataFrame a header line, the overview table and a
        separator line are written to the cell output.

    Displays:
        For each DataFrame, one row per column with:
            - dtype: data type
            - total: non-null count
            - missing_n: missing value count
            - missing_%: missing value percentage (0-100)
            - uniques_n: number of unique values (missing values not counted)
            - uniques: array of the unique values (missing values included)
    """
    # Resolve the default lazily. A `df_dict=dataframes` default would be bound
    # once at definition time, so the function could not be defined before
    # `dataframes` exists and would keep using the old dict after a rebind.
    if df_dict is None:
        df_dict = dataframes

    for name, df in df_dict.items():
        print(f'{name.capitalize()}:')

        # Build the null mask once; both the count and the percentage reuse it
        missing_mask = df.isna()

        summary = pd.DataFrame(
            {
                "dtype": df.dtypes,
                "total": df.count(),
                "missing_n": missing_mask.sum(),
                "missing_%": missing_mask.mean() * 100,
                "uniques_n": df.nunique(),
                # Plain list: aligned with the other columns by position
                "uniques": [df[col].unique() for col in df.columns],
            }
        )
        display(summary)
        print("-"*130)

# Show the column-wise overview for every loaded DataFrame
overview(df_dict=dataframes)


In [None]:
# Generate dbdiagram.io-compatible table definitions from DataFrames
def _sql_type_for(dtype):
    """Map a pandas/NumPy dtype to a dbdiagram.io column type (varchar if unrecognized)."""
    dtype_checks = pd.api.types
    # Classify by dtype family instead of an exact dtype string, so int32,
    # float32, nullable Int64 and tz-aware or non-nanosecond datetimes are
    # typed too rather than falling through to varchar.
    if dtype_checks.is_bool_dtype(dtype):
        return "boolean"
    if dtype_checks.is_integer_dtype(dtype):
        return "int"
    if dtype_checks.is_float_dtype(dtype):
        return "decimal"
    if dtype_checks.is_datetime64_any_dtype(dtype):
        return "timestamp"
    return "varchar"  # fallback for object/string/category and anything unknown


def generate_er_schema(df_dict=None):
    """
    Generate table definitions in dbdiagram.io format from a dictionary of DataFrames.

    For each DataFrame:
    - Converts pandas dtypes to SQL-style types (int, decimal, boolean, timestamp;
      everything else becomes varchar)
    - Outputs formatted table definitions ready to paste into dbdiagram.io

    Parameters:
    df_dict (dict | None): A dictionary where keys are table names and values are
        pandas DataFrames. When omitted, the module-level `dataframes` dictionary
        is looked up at call time.

    Returns:
    None (prints output to console)
    """
    # Resolve the default lazily. A `df_dict=dataframes` default would be bound
    # once at definition time, so the function could not be defined before
    # `dataframes` exists and would keep using the old dict after a rebind.
    if df_dict is None:
        df_dict = dataframes

    for name, df in df_dict.items():
        print(f"Table {name} {{")
        # Iterating df.dtypes (not df[col]) also copes with duplicate column names
        for col, dtype in df.dtypes.items():
            print(f"  {col} {_sql_type_for(dtype)}")
        print("}\n")

# Print dbdiagram.io table definitions for every loaded DataFrame
generate_er_schema(df_dict=dataframes)

In [None]:
# Count repeated key values in the reviews table: first review_id, then order_id
for id_column in ("review_id", "order_id"):
    repeated = reviews.duplicated(subset=id_column)
    print(repeated.sum())