In [9]:
#Package to download datasets from kaggle
#%pip install kagglehub
import kagglehub
import pandas as pd
import os

# Fetch the latest published version of the Olist e-commerce dataset.
# `path` is the local directory holding the CSV files; later cells read from it.
dataset_handle = "olistbr/brazilian-ecommerce"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)



Path to dataset files: C:\Users\simon\.cache\kagglehub\datasets\olistbr\brazilian-ecommerce\versions\2


In [10]:
def data_profile(df):
    """
    Print a quick profile of a pandas DataFrame (lightweight EDA).

    The report covers, in order: shape, column names, dtypes, descriptive
    statistics, missing values per column, and the number of duplicated rows.
    Every multi-line pandas table is printed below its label rather than on
    the same line, so the column alignment produced by pandas is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to profile. It is only read, never modified.

    Returns
    -------
    None
        The profile is written to standard output.
    """

    # Shape of the DataFrame: (number of rows, number of columns)
    print(f"Data shape: {df.shape}")

    # List of column names
    print(f"\nData columns: {df.columns.tolist()}")

    # Data type of each column (int, float, object, datetime, etc.)
    print(f"\nData types:\n{df.dtypes}")

    # Descriptive statistics with describe()'s default arguments: numeric
    # columns get count, mean, std, min, quartiles and max; a frame with no
    # numeric columns gets count/unique/top/freq instead.
    # describe() raises ValueError on a frame without columns, so that case
    # is reported explicitly instead of aborting the whole profile.
    if df.shape[1] == 0:
        print("\nData description:\n(no columns to describe)")
    else:
        print(f"\nData description:\n{df.describe()}")

    # Number of missing values per column (data quality check)
    print(f"\nData missing values:\n{df.isnull().sum()}")

    # Number of fully duplicated rows (duplicates can bias the analysis)
    print(f"\nData duplicates: {df.duplicated().sum()}")


### 1. Orders Table

#### 1.1 Data Profiling & Understanding

- Reviewed the dataset structure, including shape, columns, data types, and sample records
- Assessed data quality by checking missing values and the distribution of order statuses
- Identified and documented key columns relevant to business and analytical objectives


In [None]:
# Read the raw orders table from the downloaded dataset directory
orders_csv = os.path.join(path, "olist_orders_dataset.csv")
orders = pd.read_csv(orders_csv)

# Print shape, columns, dtypes, summary statistics, missing values and duplicates
data_profile(orders)

#### 1.2 Orders Data Cleaning

- Selected only the columns required for order lifecycle and delivery analysis:
  - `order_id`
  - `customer_id`
  - `order_status`
  - `order_purchase_timestamp`
  - `order_delivered_customer_date`
  - `order_estimated_delivery_date`

- Converted the three retained timestamp columns to `datetime` for time-based analysis (malformed values are coerced to `NaT`)

- Standardized `order_status` values to lowercase to ensure consistency

- Removed early-stage order statuses, treated as noise because they are not yet relevant for delivery analysis:
  - `created` (5 rows)
  - `approved` (2 rows)

- Kept missing `order_delivered_customer_date` values unchanged to handle them later during delivery performance analysis

- Saved the cleaned dataset to:
  - `data_cleaned/orders_clean.csv`


In [None]:
# ------------------------------------------------------------------------------
# 1. Column selection
# Keep only the columns required for order lifecycle and delivery analysis.
# .copy() makes the result an independent frame, so the column assignments in
# steps 2 and 3 do not write into a selection of the raw table (this is what
# triggers pandas' SettingWithCopyWarning).
# ------------------------------------------------------------------------------
orders = orders[[
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]].copy()

# ------------------------------------------------------------------------------
# 2. Date parsing
# Convert timestamp columns to pandas datetime for time-based analysis
# Invalid or malformed dates are coerced to NaT
# ------------------------------------------------------------------------------
date_cols = [
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]

for col in date_cols:
    orders[col] = pd.to_datetime(orders[col], errors="coerce")

# ------------------------------------------------------------------------------
# 3. Categorical normalization
# Standardize order_status values to lowercase to avoid case-related issues
# ------------------------------------------------------------------------------
orders["order_status"] = orders["order_status"].str.lower()

# ------------------------------------------------------------------------------
# 4. Business rule filtering
# Remove orders that are still in early lifecycle stages
# (not yet relevant for delivery or performance analysis)
# Nothing is assigned into `orders` after this filter, so no copy is needed here.
# ------------------------------------------------------------------------------
orders = orders[~orders["order_status"].isin(["created", "approved"])]

# ------------------------------------------------------------------------------
# 5. Export cleaned dataset
# Save the cleaned orders table for downstream analysis and dashboarding
# The output directory is created on first run; the row index is not written.
# ------------------------------------------------------------------------------
os.makedirs("data_cleaned", exist_ok=True)
orders.to_csv("data_cleaned/orders_clean.csv", index=False)

print("✅ orders_clean.csv saved!")
print("Final shape:", orders.shape)
print(orders.head())


### 2. Customers Table

#### 2.1 Data Profiling & Understanding

- Explored the dataset structure, including shape, columns, data types, and sample records
- Assessed data quality by checking for missing values
- Identified key columns aligned with business and analytical objectives:
  - `customer_id`: primary key used to join with the orders table
  - `customer_state`: used for regional and geographic analysis
- Documented non-essential columns not required for the current analysis scope:
  - `customer_unique_id`
  - `customer_zip_code_prefix`
  - `customer_city`


In [None]:
# Read the raw customers table from the downloaded dataset directory
customers_csv = os.path.join(path, "olist_customers_dataset.csv")
customers = pd.read_csv(customers_csv)

# Print shape, columns, dtypes, summary statistics, missing values and duplicates
data_profile(customers)

#### 2.2 Customers Data Cleaning

- Retained only the columns required for the analysis:
  - `customer_id`
  - `customer_state`
- Removed non-essential columns not aligned with current business objectives:
  - `customer_unique_id`
  - `customer_zip_code_prefix`
  - `customer_city`
- Verified that no missing values remain in the cleaned dataset
- Saved the cleaned dataset to:
  - `data_cleaned/customers_clean.csv`


In [20]:
# ------------------------------------------------------------------------------
# Step 1 - Column selection
# Only the join key and the state are kept; every other column is dropped
# ------------------------------------------------------------------------------
customer_cols = ["customer_id", "customer_state"]
customers = customers[customer_cols]

# ------------------------------------------------------------------------------
# Step 2 - Export
# Create the output folder if it is missing, then write the table without
# the row index
# ------------------------------------------------------------------------------
os.makedirs("data_cleaned", exist_ok=True)
customers.to_csv("data_cleaned/customers_clean.csv", index=False)

# ------------------------------------------------------------------------------
# Step 3 - Sanity checks
# Print the final shape and the first rows for a quick visual validation
# ------------------------------------------------------------------------------
print("✅ customers_clean.csv saved!")
print("Final shape:", customers.shape)
print(customers.head())


✅ customers_clean.csv saved!
Final shape: (99441, 2)
                        customer_id customer_state
0  06b8999e2fba1a1fbc88172c00ba8bc7             SP
1  18955e83d337fd6b2def6b18a428ac77             SP
2  4e7b3e00288586ebd08712fdd0374a03             SP
3  b2b6027bc5c5109e529d4dc6358b12c3             SP
4  4f2d8ab171c80ec8364f7c12e35b23ad             SP
