In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Configure how pandas renders DataFrames in cell output:
# - never truncate the text inside a column
# - show floats with a thousands separator and two decimals
pd.options.display.max_colwidth = None
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Read every table of the Brazilian E-Commerce dataset from the raw data folder
data_path = "../data/brazilian/raw/"
files = os.listdir(data_path)  # listing of the raw folder (not used further in this cell)

# Short table name -> CSV file name, in the order the tables are loaded
csv_names = {
    'customers': "olist_customers_dataset.csv",
    'geolocation': "olist_geolocation_dataset.csv",
    'orders': "olist_orders_dataset.csv",
    'items': "olist_order_items_dataset.csv",
    'payments': "olist_order_payments_dataset.csv",
    'reviews': "olist_order_reviews_dataset.csv",
    'products': "olist_products_dataset.csv",
    'sellers': "olist_sellers_dataset.csv",
    'translation': "product_category_name_translation.csv",
}

# One DataFrame per table, keyed by its short name, for easier looping and inspection
dataframes = {
    table: pd.read_csv(data_path + csv_name)
    for table, csv_name in csv_names.items()
}

# Expose each table under its own variable as well (same objects as in `dataframes`)
customers = dataframes['customers']
geolocation = dataframes['geolocation']
orders = dataframes['orders']
items = dataframes['items']
payments = dataframes['payments']
reviews = dataframes['reviews']
products = dataframes['products']
sellers = dataframes['sellers']
translation = dataframes['translation']

In [None]:
# Define a function to summarize a dictionary of DataFrames
def summarize_dataframes(df_dict=None):
    """
    Build a one-row-per-DataFrame overview of a dictionary of DataFrames.

    Parameters
    ----------
    df_dict : dict of {str: pandas.DataFrame}, optional
        Mapping of a display name to a DataFrame. When omitted (None), the
        notebook-level ``dataframes`` dictionary is looked up at call time,
        so the summary always reflects the current contents of that name.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``df_dict`` with the columns:
        - name: the key name from the dictionary
        - rows: number of rows in the DataFrame
        - columns: number of columns
        - column_names: a list of column names
        - duplicates: number of fully duplicated rows
        An empty mapping yields an empty frame that still has these columns.

    Raises
    ------
    NameError
        If ``df_dict`` is omitted and no global ``dataframes`` exists.
    """
    # Resolve the default lazily: a `df_dict=dataframes` default would be
    # evaluated once, when the function is defined, and would go stale if the
    # global name is rebound later in the notebook.
    if df_dict is None:
        df_dict = dataframes

    summary_columns = ["name", "rows", "columns", "column_names", "duplicates"]

    # One record per DataFrame in the dictionary
    summary = [
        {
            "name": name,
            "rows": df.shape[0],
            "columns": df.shape[1],
            "column_names": list(df.columns),
            "duplicates": df.duplicated().sum(),
        }
        for name, df in df_dict.items()
    ]

    # Passing the column list keeps the schema stable even when there are no records
    return pd.DataFrame(summary, columns=summary_columns)

# Summarize every loaded DataFrame; as the cell's last expression the result is rendered as a table
summarize_dataframes(df_dict=dataframes)

In [None]:
# Display a random sample of up to 5 rows from each DataFrame for a quick visual inspection
SAMPLE_SIZE = 5   # maximum number of rows shown per DataFrame
SAMPLE_SEED = 42  # fixed seed so re-running the notebook shows the same rows

for name, df in dataframes.items():
    print(f'{name.capitalize()}:')
    # Cap the sample at the frame's length: sampling without replacement
    # raises ValueError when more rows are requested than the frame has.
    display(df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED))
    print("-"*130)