# **DATA CLEANING AND TRANSFORMATION**

In [1]:
# Initializing Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName("E-Commerce Data Cleaning & Transformation") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

print("Spark initialized for Data Cleaning & Transformation")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/27 22:31:44 WARN Utils: Your hostname, Roberts-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.43 instead (on interface en0)
25/11/27 22:31:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/27 22:31:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/27 22:31:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark initialized for Data Cleaning & Transformation


In [2]:
# Loading all the datasets
customers = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_customers_dataset.csv", header=True, inferSchema=True)
orders = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_orders_dataset.csv", header=True, inferSchema=True)
order_items = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_order_items_dataset.csv", header=True, inferSchema=True)
payments = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_order_payments_dataset.csv", header=True, inferSchema=True)
reviews = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_order_reviews_dataset.csv", header=True, inferSchema=True)
products = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_products_dataset.csv", header=True, inferSchema=True)
sellers = spark.read.csv("/Users/robert/Desktop/ecommerce-pipeline/data/raw/olist_sellers_dataset.csv", header=True, inferSchema=True)

print("All datasets loaded")
print(f"  Customers: {customers.count():,}")
print(f"  Orders: {orders.count():,}")
print(f"  Order Items: {order_items.count():,}")
print(f"  Payments: {payments.count():,}")
print(f"  Reviews: {reviews.count():,}")
print(f"  Products: {products.count():,}")
print(f"  Sellers: {sellers.count():,}")

All datasets loaded
  Customers: 99,441
  Orders: 99,441
  Order Items: 112,650
  Payments: 103,886
  Reviews: 104,162
  Products: 32,951
  Sellers: 3,095


In [3]:
# Data Quality Assessment
print("\n" + "="*70)
print("DATA QUALITY ASSESSMENT")
print("="*70 + "\n")

def assess_data_quality(df, dataset_name):
    """Assess data quality for a dataset"""
    total_rows = df.count()
    print(f"\n{dataset_name}:")
    print(f"  Total rows: {total_rows:,}")
    
    # Check null values for each column
    null_counts = df.select([
        count(when(col(c).isNull(), c)).alias(c) for c in df.columns
    ]).collect()[0].asDict()
    
    print("  Null values:")
    for col_name, null_count in null_counts.items():
        if null_count > 0:
            pct = (null_count / total_rows) * 100
            print(f"    {col_name}: {null_count:,} ({pct:.2f}%)")
    
    if all(count == 0 for count in null_counts.values()):
        print("    No null values found")
    
    return null_counts

# Assess each dataset
customers_quality = assess_data_quality(customers, "CUSTOMERS")
orders_quality = assess_data_quality(orders, "ORDERS")
items_quality = assess_data_quality(order_items, "ORDER ITEMS")
payments_quality = assess_data_quality(payments, "PAYMENTS")
reviews_quality = assess_data_quality(reviews, "REVIEWS")

print("\n" + "="*70)


DATA QUALITY ASSESSMENT


CUSTOMERS:
  Total rows: 99,441
  Null values:
    No null values found

ORDERS:
  Total rows: 99,441
  Null values:
    order_approved_at: 160 (0.16%)
    order_delivered_carrier_date: 1,783 (1.79%)
    order_delivered_customer_date: 2,965 (2.98%)

ORDER ITEMS:
  Total rows: 112,650
  Null values:
    No null values found

PAYMENTS:
  Total rows: 103,886
  Null values:
    No null values found

REVIEWS:
  Total rows: 104,162
  Null values:
    review_id: 1 (0.00%)
    order_id: 2,236 (2.15%)
    review_score: 2,380 (2.28%)
    review_comment_title: 92,157 (88.47%)
    review_comment_message: 63,079 (60.56%)
    review_creation_date: 8,764 (8.41%)
    review_answer_timestamp: 8,785 (8.43%)



In [4]:
# Cleaning orders datasets
print("\n" + "="*70)
print("CLEANING ORDERS DATASET")
print("="*70 + "\n")

# Filter out orders with invalid statuses or missing critical timestamps
orders_clean = orders.filter(
    col("order_purchase_timestamp").isNotNull()
)

# Filter for delivered orders only (for accurate analysis)
orders_delivered = orders_clean.filter(
    col("order_status") == "delivered"
)

print(f"Original orders: {orders.count():,}")
print(f"After removing null timestamps: {orders_clean.count():,}")
print(f"Delivered orders only: {orders_delivered.count():,}")

# Calculate delivery performance metrics
orders_with_metrics = orders_delivered.withColumn(
    "estimated_delivery_days",
    datediff(col("order_estimated_delivery_date"), col("order_purchase_timestamp"))
).withColumn(
    "actual_delivery_days",
    datediff(col("order_delivered_customer_date"), col("order_purchase_timestamp"))
).withColumn(
    "delivery_performance",
    when(col("order_delivered_customer_date") <= col("order_estimated_delivery_date"), "on_time")
    .when(col("order_delivered_customer_date") > col("order_estimated_delivery_date"), "late")
    .otherwise("unknown")
)

print("\nDelivery Performance:")
orders_with_metrics.groupBy("delivery_performance").count().show()

print("Orders dataset cleaned")


CLEANING ORDERS DATASET

Original orders: 99,441
After removing null timestamps: 99,441
Delivered orders only: 96,478

Delivery Performance:
+--------------------+-----+
|delivery_performance|count|
+--------------------+-----+
|             unknown|    8|
|             on_time|88644|
|                late| 7826|
+--------------------+-----+

Orders dataset cleaned


In [5]:
# Cleaning and enriching order items
print("\n" + "="*70)
print("CLEANING ORDER ITEMS DATASET")
print("="*70 + "\n")

# Remove items with null or zero prices
items_clean = order_items.filter(
    (col("price").isNotNull()) & (col("price") > 0)
)

# Calculate total item value (price + freight)
items_enriched = items_clean.withColumn(
    "total_item_value",
    col("price") + col("freight_value")
)

print(f"Original items: {order_items.count():,}")
print(f"After removing invalid prices: {items_clean.count():,}")
print(f"Items removed: {order_items.count() - items_clean.count():,}")

# Show price statistics
print("\nPrice Statistics:")
items_enriched.select(
    round(avg("price"), 2).alias("avg_price"),
    round(min("price"), 2).alias("min_price"),
    round(max("price"), 2).alias("max_price"),
    round(avg("freight_value"), 2).alias("avg_freight"),
    round(avg("total_item_value"), 2).alias("avg_total_value")
).show()

print("Order items cleaned and enriched")


CLEANING ORDER ITEMS DATASET

Original items: 112,650
After removing invalid prices: 112,650
Items removed: 0

Price Statistics:
+---------+---------+---------+-----------+---------------+
|avg_price|min_price|max_price|avg_freight|avg_total_value|
+---------+---------+---------+-----------+---------------+
|   120.65|     0.85|   6735.0|      19.99|         140.64|
+---------+---------+---------+-----------+---------------+

Order items cleaned and enriched


In [6]:
# Aggregating payments by order
print("\n" + "="*70)
print("AGGREGATING PAYMENTS")
print("="*70 + "\n")

# Aggregate total payment per order (some orders have multiple payment methods)
payments_aggregated = payments.groupBy("order_id").agg(
    sum("payment_value").alias("total_payment"),
    count("*").alias("payment_count"),
    collect_list("payment_type").alias("payment_types"),
    max("payment_installments").alias("max_installments")
)

print(f"Original payment records: {payments.count():,}")
print(f"Unique orders with payments: {payments_aggregated.count():,}")

print("\nPayment Methods Distribution:")
payments.groupBy("payment_type").agg(
    count("*").alias("count"),
    round(avg("payment_value"), 2).alias("avg_value")
).orderBy(desc("count")).show()

print("Payments aggregated by order")


AGGREGATING PAYMENTS

Original payment records: 103,886
Unique orders with payments: 99,440

Payment Methods Distribution:
+------------+-----+---------+
|payment_type|count|avg_value|
+------------+-----+---------+
| credit_card|76795|   163.32|
|      boleto|19784|   145.03|
|     voucher| 5775|     65.7|
|  debit_card| 1529|   142.57|
| not_defined|    3|      0.0|
+------------+-----+---------+

Payments aggregated by order


In [7]:
# Building master datasets by joining all tables
print("\n" + "="*70)
print("BUILDING MASTER ANALYTICAL DATASET")
print("="*70 + "\n")

# Step 1: Join orders with customers
master_df = orders_with_metrics.join(
    customers,
    on="customer_id",
    how="left"
)

print(f"Step 1: Orders + Customers = {master_df.count():,} records")

# Step 2: Aggregate order items per order
order_aggregated = items_enriched.groupBy("order_id").agg(
    count("*").alias("items_count"),
    sum("price").alias("total_price"),
    sum("freight_value").alias("total_freight"),
    sum("total_item_value").alias("order_value"),
    round(avg("price"), 2).alias("avg_item_price"),
    collect_set("product_id").alias("product_ids"),
    collect_set("seller_id").alias("seller_ids")
)

# Join aggregated items
master_df = master_df.join(
    order_aggregated,
    on="order_id",
    how="left"
)

print(f"Step 2: + Order Items = {master_df.count():,} records")

# Step 3: Join payments
master_df = master_df.join(
    payments_aggregated,
    on="order_id",
    how="left"
)

print(f"Step 3: + Payments = {master_df.count():,} records")

# Step 4: Join reviews
reviews_agg = reviews.groupBy("order_id").agg(
    max("review_score").alias("review_score"),
    count("*").alias("review_count")
)

master_df = master_df.join(
    reviews_agg,
    on="order_id",
    how="left"
)

print(f"Step 4: + Reviews = {master_df.count():,} records")

print("\nMaster dataset created!")
print(f"  Total columns: {len(master_df.columns)}")
print(f"  Total records: {master_df.count():,}")


BUILDING MASTER ANALYTICAL DATASET

Step 1: Orders + Customers = 96,478 records
Step 2: + Order Items = 96,478 records
Step 3: + Payments = 96,478 records
Step 4: + Reviews = 96,478 records

Master dataset created!
  Total columns: 28
  Total records: 96,478


In [8]:
# Performing Feature Engineering - Temporal features
print("\n" + "="*70)
print("FEATURE ENGINEERING: TEMPORAL FEATURES")
print("="*70 + "\n")

# Extract temporal features
master_df = master_df.withColumn(
    "order_year",
    year("order_purchase_timestamp")
).withColumn(
    "order_month",
    month("order_purchase_timestamp")
).withColumn(
    "order_day_of_week",
    dayofweek("order_purchase_timestamp")
).withColumn(
    "order_hour",
    hour("order_purchase_timestamp")
).withColumn(
    "order_quarter",
    quarter("order_purchase_timestamp")
).withColumn(
    "is_weekend",
    when(col("order_day_of_week").isin([1, 7]), 1).otherwise(0)
).withColumn(
    "time_of_day",
    when(col("order_hour").between(6, 11), "morning")
    .when(col("order_hour").between(12, 17), "afternoon")
    .when(col("order_hour").between(18, 21), "evening")
    .otherwise("night")
)

print("Temporal features created:")
print("  - order_year, order_month, order_quarter")
print("  - order_day_of_week, order_hour")
print("  - is_weekend, time_of_day")

# Show distribution
print("\nOrders by Time of Day:")
master_df.groupBy("time_of_day").count().orderBy(desc("count")).show()

print("\nOrders by Day of Week:")
master_df.groupBy("order_day_of_week").count().orderBy("order_day_of_week").show()


FEATURE ENGINEERING: TEMPORAL FEATURES

Temporal features created:
  - order_year, order_month, order_quarter
  - order_day_of_week, order_hour
  - is_weekend, time_of_day

Orders by Time of Day:
+-----------+-----+
|time_of_day|count|
+-----------+-----+
|  afternoon|37183|
|    evening|23433|
|    morning|21594|
|      night|14268|
+-----------+-----+


Orders by Day of Week:
+-----------------+-----+
|order_day_of_week|count|
+-----------------+-----+
|                1|11635|
|                2|15701|
|                3|15503|
|                4|15076|
|                5|14323|
|                6|13685|
|                7|10555|
+-----------------+-----+



In [9]:
# Feature Engineering - Customer Segmentation Prep
print("\n" + "="*70)
print("FEATURE ENGINEERING: CUSTOMER METRICS")
print("="*70 + "\n")

# Calculate customer-level metrics
customer_metrics = master_df.groupBy("customer_unique_id").agg(
    count("order_id").alias("total_orders"),
    sum("order_value").alias("total_revenue"),
    round(avg("order_value"), 2).alias("avg_order_value"),
    max("order_purchase_timestamp").alias("last_order_date"),
    min("order_purchase_timestamp").alias("first_order_date"),
    round(avg("review_score"), 2).alias("avg_review_score"),
    sum("items_count").alias("total_items_purchased")
)

# Calculate recency (days since last order) - using max date in dataset as reference
max_date = master_df.select(max("order_purchase_timestamp")).collect()[0][0]

customer_metrics = customer_metrics.withColumn(
    "recency_days",
    datediff(lit(max_date), col("last_order_date"))
).withColumn(
    "customer_lifetime_days",
    datediff(col("last_order_date"), col("first_order_date"))
).withColumn(
    "is_repeat_customer",
    when(col("total_orders") > 1, 1).otherwise(0)
)

print(f"Customer metrics calculated for {customer_metrics.count():,} unique customers")

print("\nCustomer Behavior Summary:")
customer_metrics.select(
    round(avg("total_orders"), 2).alias("avg_orders_per_customer"),
    round(avg("total_revenue"), 2).alias("avg_customer_ltv"),
    round(avg("recency_days"), 2).alias("avg_recency_days"),
    sum("is_repeat_customer").alias("repeat_customers")
).show()

print("\nRepeat Customer Analysis:")
customer_metrics.groupBy("is_repeat_customer").count().show()


FEATURE ENGINEERING: CUSTOMER METRICS

Customer metrics calculated for 93,358 unique customers

Customer Behavior Summary:
+-----------------------+----------------+----------------+----------------+
|avg_orders_per_customer|avg_customer_ltv|avg_recency_days|repeat_customers|
+-----------------------+----------------+----------------+----------------+
|                   1.03|          165.17|          237.48|            2801|
+-----------------------+----------------+----------------+----------------+


Repeat Customer Analysis:
+------------------+-----+
|is_repeat_customer|count|
+------------------+-----+
|                 1| 2801|
|                 0|90557|
+------------------+-----+



In [10]:
# Performing RFM Segmentation
print("\n" + "="*70)
print("RFM SEGMENTATION")
print("="*70 + "\n")

# Calculate RFM scores (1-5 scale, 5 is best)
# Use ntile for quintile-based scoring

# Recency: Lower is better (more recent purchase)
# Frequency: Higher is better (more orders)
# Monetary: Higher is better (more revenue)

customer_rfm = customer_metrics.withColumn(
    "recency_score",
    6 - ntile(5).over(Window.orderBy(col("recency_days")))  # Invert so recent = high score
).withColumn(
    "frequency_score",
    ntile(5).over(Window.orderBy(col("total_orders")))
).withColumn(
    "monetary_score",
    ntile(5).over(Window.orderBy(col("total_revenue")))
).withColumn(
    "rfm_score",
    concat(col("recency_score"), col("frequency_score"), col("monetary_score"))
).withColumn(
    "rfm_total",
    col("recency_score") + col("frequency_score") + col("monetary_score")
)

# Create customer segments based on RFM
customer_rfm = customer_rfm.withColumn(
    "customer_segment",
    when(col("rfm_total") >= 13, "Champions")
    .when((col("rfm_total") >= 10) & (col("recency_score") >= 4), "Loyal Customers")
    .when((col("rfm_total") >= 10) & (col("recency_score") < 4), "At Risk")
    .when((col("rfm_total") >= 7) & (col("recency_score") >= 3), "Potential Loyalists")
    .when((col("rfm_total") >= 7) & (col("recency_score") < 3), "Cannot Lose Them")
    .when((col("rfm_total") >= 5) & (col("frequency_score") <= 2), "New Customers")
    .otherwise("Lost")
)

print("RFM segmentation complete")

print("\nCustomer Segments Distribution:")
segment_distribution = customer_rfm.groupBy("customer_segment").agg(
    count("*").alias("customer_count"),
    round(avg("total_revenue"), 2).alias("avg_revenue"),
    round(avg("total_orders"), 2).alias("avg_orders")
).orderBy(desc("customer_count"))

segment_distribution.show(truncate=False)

# Calculate segment value
total_customers = customer_rfm.count()
segment_distribution = segment_distribution.withColumn(
    "percentage",
    round((col("customer_count") / total_customers) * 100, 2)
)

print("\nSegment Percentage:")
segment_distribution.select("customer_segment", "customer_count", "percentage").show(truncate=False)


RFM SEGMENTATION

RFM segmentation complete

Customer Segments Distribution:


25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 2

+-------------------+--------------+-----------+----------+
|customer_segment   |customer_count|avg_revenue|avg_orders|
+-------------------+--------------+-----------+----------+
|Potential Loyalists|32864         |74.08      |1.0       |
|Cannot Lose Them   |22678         |77.29      |1.01      |
|At Risk            |20724         |304.59     |1.06      |
|Loyal Customers    |14439         |301.12     |1.01      |
|Champions          |1397          |373.67     |2.16      |
|Lost               |847           |40.01      |1.0       |
|New Customers      |409           |40.22      |1.0       |
+-------------------+--------------+-----------+----------+


Segment Percentage:


25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 2

+-------------------+--------------+----------+
|customer_segment   |customer_count|percentage|
+-------------------+--------------+----------+
|Potential Loyalists|32864         |35.2      |
|Cannot Lose Them   |22678         |24.29     |
|At Risk            |20724         |22.2      |
|Loyal Customers    |14439         |15.47     |
|Champions          |1397          |1.5       |
|Lost               |847           |0.91      |
|New Customers      |409           |0.44      |
+-------------------+--------------+----------+



25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [11]:
# Saving the cleaned & transformed Data
print("\n" + "="*70)
print("SAVING PROCESSED DATA")
print("="*70 + "\n")

# Save master dataset
output_path = "../data/processed"

master_df.write.mode("overwrite").parquet(f"{output_path}/master_orders_dataset.parquet")
print(f"Master dataset saved: {master_df.count():,} records")

customer_rfm.write.mode("overwrite").parquet(f"{output_path}/customer_rfm_segments.parquet")
print(f"Customer RFM segments saved: {customer_rfm.count():,} records")

print("\nAll processed data saved to ../data/processed/")
print("  - master_orders_dataset.parquet")
print("  - customer_rfm_segments.parquet")


SAVING PROCESSED DATA



25/11/27 22:32:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Master dataset saved: 96,478 records


25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 2

Customer RFM segments saved: 93,358 records

All processed data saved to ../data/processed/
  - master_orders_dataset.parquet
  - customer_rfm_segments.parquet


In [12]:
# Data Quality Report
from pyspark.sql.functions import countDistinct

print("\n" + "="*70)
print("FINAL DATA QUALITY REPORT")
print("="*70 + "\n")

print("MASTER DATASET SUMMARY:")
print(f"  Total Orders: {master_df.count():,}")
print(f"  Total Columns: {len(master_df.columns)}")
print(f"  Date Range: {master_df.select(min('order_purchase_timestamp'), max('order_purchase_timestamp')).collect()[0]}")

print("\nKEY METRICS:")
master_summary = master_df.select(
    count("*").alias("total_orders"),
    countDistinct("customer_unique_id").alias("unique_customers"),
    round(sum("order_value"), 2).alias("total_revenue"),
    round(avg("order_value"), 2).alias("avg_order_value"),
    round(avg("items_count"), 2).alias("avg_items_per_order"),
    round(avg("review_score"), 2).alias("avg_review_score")
).collect()[0]

for field, value in master_summary.asDict().items():
    if isinstance(value, (int, float)):
        print(f"  {field}: {value:,.2f}")
    else:
        print(f"  {field}: {value}")

print("\nCUSTOMER SEGMENTATION:")
print(f"  Total Unique Customers: {customer_rfm.count():,}")
print(f"  Segments Created: {customer_rfm.select('customer_segment').distinct().count()}")

print("\nDELIVERY PERFORMANCE:")
delivery_stats = master_df.groupBy("delivery_performance").count()
delivery_stats.show()

print("\nORDERS BY YEAR:")
master_df.groupBy("order_year").count().orderBy("order_year").show()


FINAL DATA QUALITY REPORT

MASTER DATASET SUMMARY:
  Total Orders: 96,478
  Total Columns: 35
  Date Range: Row(min(order_purchase_timestamp)=datetime.datetime(2016, 9, 15, 12, 16, 38), max(order_purchase_timestamp)=datetime.datetime(2018, 8, 29, 15, 0, 37))

KEY METRICS:
  total_orders: 96,478.00
  unique_customers: 93,358.00
  total_revenue: 15,419,773.75
  avg_order_value: 159.83
  avg_items_per_order: 1.14
  avg_review_score: 4.16

CUSTOMER SEGMENTATION:
  Total Unique Customers: 93,358


25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 22:32:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/27 2

  Segments Created: 7

DELIVERY PERFORMANCE:
+--------------------+-----+
|delivery_performance|count|
+--------------------+-----+
|             unknown|    8|
|             on_time|88644|
|                late| 7826|
+--------------------+-----+


ORDERS BY YEAR:
+----------+-----+
|order_year|count|
+----------+-----+
|      2016|  267|
|      2017|43428|
|      2018|52783|
+----------+-----+

