## Amazon Sales Analysis - Exploratory Data Analysis

Summary statistics for the cleaned sales data and identification of the top-performing categories, regions, and payment methods.

In [0]:
# Configuration
# Location of the cleaned sales table, split into its catalog and schema parts
# so either can be changed independently.
CATALOG = "db_ecom_project"
SCHEMA = "amazon_sales_schema"
# Fully qualified table name in catalog.schema.table form.
CLEANED_TABLE = f"{CATALOG}.{SCHEMA}.amazon_sales_cleaned"

# The bar-chart emoji (U+1F4CA) is written as an escape: the literal character
# had been corrupted into mojibake by a UTF-8 -> cp1252 round trip, and an
# ASCII-only source line cannot be damaged that way again.
print(f"\U0001F4CA Analyzing: {CLEANED_TABLE}")

### 1. Load Data

In [0]:
# Load the cleaned table, report how many rows it holds and preview five rows.
df = spark.table(CLEANED_TABLE)

row_total = df.count()
print(f"Total records: {row_total:,}")

preview_rows = df.limit(5)
display(preview_rows)

### 2. Dataset Overview

In [0]:
from pyspark.sql.functions import count, mean, sum as spark_sum

# Headline figures, all produced by one df.agg call over every row.
# Each row is treated as one order: count("*") is labelled Total_Orders.
overview_aggregations = [
    count("*").alias("Total_Orders"),
    spark_sum("total_revenue").alias("Total_Revenue"),
    spark_sum("quantity_sold").alias("Total_Units_Sold"),
    mean("total_revenue").alias("Avg_Order_Value"),
    mean("rating").alias("Avg_Rating"),
]
basic_metrics = df.agg(*overview_aggregations)

display(basic_metrics)

In [0]:
from pyspark.sql.functions import max as spark_max, min as spark_min

# Smallest and largest order_date values, i.e. the period the data spans.
first_order_date = spark_min("order_date").alias("Start_Date")
last_order_date = spark_max("order_date").alias("End_Date")
date_range = df.agg(first_order_date, last_order_date)

display(date_range)

In [0]:
from pyspark.sql.functions import countDistinct

# Output column label -> source column whose distinct values are counted.
# Dict insertion order fixes the column order of the result.
distinct_targets = {
    "Unique_Products": "product_id",
    "Unique_Categories": "product_category",
    "Unique_Regions": "customer_region",
    "Payment_Methods": "payment_method",
}
unique_counts = df.agg(
    *[
        countDistinct(source_column).alias(label)
        for label, source_column in distinct_targets.items()
    ]
)

display(unique_counts)

### 3. Statistical Summary

In [0]:
# Columns to profile; summary() is called without arguments, so Spark's
# default set of statistics is returned for each of them.
summary_columns = [
    "price",
    "discount_percent",
    "quantity_sold",
    "total_revenue",
    "rating",
]
stats = df.select(*summary_columns).summary()
display(stats)

### 4. Top Performers

In [0]:
from pyspark.sql.functions import col

# Per-category revenue total, mean revenue per row and row count,
# ranked from the highest total revenue downwards.
top_categories = (
    df.groupBy("product_category")
    .agg(
        spark_sum("total_revenue").alias("Total_Revenue"),
        mean("total_revenue").alias("Avg_Order_Value"),
        count("*").alias("Order_Count"),
    )
    .orderBy(col("Total_Revenue").desc())
)

display(top_categories)

In [0]:
# Per-region revenue total, mean rating and row count,
# ranked from the highest total revenue downwards.
region_rollup = df.groupBy("customer_region").agg(
    spark_sum("total_revenue").alias("Total_Revenue"),
    mean("rating").alias("Avg_Rating"),
    count("*").alias("Order_Count"),
)
top_regions = region_rollup.orderBy(col("Total_Revenue").desc())

display(top_regions)

In [0]:
# Revenue total and row count for each payment method,
# ranked from the highest total revenue downwards.
payment_aggregations = (
    spark_sum("total_revenue").alias("Total_Revenue"),
    count("*").alias("Order_Count"),
)
payment_analysis = (
    df.groupBy("payment_method")
    .agg(*payment_aggregations)
    .orderBy(col("Total_Revenue").desc())
)

display(payment_analysis)