In [1]:
# List the applications YARN currently reports (run through IPython's `!` shell escape).
!yarn application -list

2025-07-20 19:41:39,142 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at cluster-3609-m.us-central1-c.c.fluid-shadow-459720-q0.internal./10.128.0.12:8032
2025-07-20 19:41:39,618 INFO client.AHSProxy: Connecting to Application History server at cluster-3609-m.us-central1-c.c.fluid-shadow-459720-q0.internal./10.128.0.12:10200
Total number of applications (application-types: [], states: [SUBMITTED, ACCEPTED, RUNNING] and tags: []):1
                Application-Id	    Application-Name	    Application-Type	      User	     Queue	             State	       Final-State	       Progress	                       Tracking-URL
application_1753034603172_0005	        PySparkShell	               SPARK	      root	   default	           RUNNING	         UNDEFINED	            10%	http://cluster-3609-m.us-central1-c.c.fluid-shadow-459720-q0.internal:44969


In [2]:
# To kill a stale application, uncomment the next line (keep the leading `!` shell escape):
# !yarn application -kill application_1753034603172_0003

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

In [4]:
# 🔧 SOLUTION 1: Check if Spark session exists, create if needed
def get_spark_session():
    """Return the active SparkSession, building one when none is active.

    An already-active session is returned unchanged. The application name and
    the BigQuery connector package below are only passed to the builder when
    no active session is found.

    Returns:
        SparkSession: the active session, or the one produced by
        ``SparkSession.builder...getOrCreate()``.

    Raises:
        Exception: whatever ``getOrCreate()`` raises when the session cannot
            be built. The failure is surfaced once instead of re-running the
            identical builder chain a second time.
    """
    try:
        spark = SparkSession.getActiveSession()
    except Exception:
        # Best-effort lookup: if probing for an active session fails, fall
        # through and build one. Catching ``Exception`` rather than using a
        # bare ``except`` lets KeyboardInterrupt/SystemExit propagate.
        spark = None

    if spark is None:
        spark = (
            SparkSession.builder
            .appName("Module-3")
            .config(
                "spark.jars.packages",
                "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.23.2",
            )
            .getOrCreate()
        )
    return spark

In [5]:
# Bind the session returned by get_spark_session() to `spark`, the name the
# later cells use (spark.read..., spark.sparkContext, spark.stop()).
spark = get_spark_session()

In [6]:
# Report which session the notebook is attached to, one line per attribute.
# The "Active" flag is read through the private `_jsc` handle of the context.
status_lines = (
    "✅ Spark Session Status:",
    f"   App Name: {spark.sparkContext.appName}",
    f"   Master: {spark.sparkContext.master}",
    f"   Spark Version: {spark.version}",
    f"   Active: {not spark.sparkContext._jsc.sc().isStopped()}",
)
for status_line in status_lines:
    print(status_line)

✅ Spark Session Status:
   App Name: PySparkShell
   Master: yarn
   Spark Version: 3.5.3
   Active: True


In [7]:
# Base GCS path of the Parquet datasets read below. The trailing slash matters:
# the read cell appends each dataset name directly to this string.
gcs_bucket_path = "gs://retail-order-data-bucket/output/"

In [8]:
# Location of every cleaned Parquet dataset, keyed by a short table label.
dataset_paths = {
    "customers": f"{gcs_bucket_path}customers_clean_df.parquet",
    "geolocation": f"{gcs_bucket_path}geolocation_clean_df.parquet/",
    "order_items": f"{gcs_bucket_path}order_items_clean_df.parquet/",
    "order_payments": f"{gcs_bucket_path}order_payments_clean_df.parquet/",
    "orders": f"{gcs_bucket_path}orders_clean_df.parquet/",
    "sellers": f"{gcs_bucket_path}sellers_clean_df.parquet/",
    "order_reviews": f"{gcs_bucket_path}orders_reviews_clean_df.parquet/",
    "products": f"{gcs_bucket_path}products_clean_df.parquet/",
}

# Load each dataset into its own DataFrame, in the same order as listed above.
customers_df = spark.read.parquet(dataset_paths["customers"])
geolocation_df = spark.read.parquet(dataset_paths["geolocation"])
order_items_df = spark.read.parquet(dataset_paths["order_items"])
order_payments_df = spark.read.parquet(dataset_paths["order_payments"])
orders_df = spark.read.parquet(dataset_paths["orders"])
sellers_df = spark.read.parquet(dataset_paths["sellers"])
order_reviews_df = spark.read.parquet(dataset_paths["order_reviews"])
products_df = spark.read.parquet(dataset_paths["products"])

                                                                                

## 🚀 ADVANCED WINDOW FUNCTIONS

### 7. Ranking & Percentiles

In [9]:
# Q7.1: Rank customers by their total spending within each state
# Attach every order item to the customer (and customer state) that placed the order.
customer_orders = customers_df.join(orders_df, "customer_id", "inner")
customer_order_items = customer_orders.join(order_items_df, "order_id", "inner")

# Total spend per customer. `sum` is pyspark.sql.functions.sum (star import),
# not the Python builtin.
customer_spend = customer_order_items.groupBy("customer_id", "customer_state").agg(
    sum("total_value").alias("tot")
)

# rank() gives tied totals the same rank and leaves a gap after the tie.
spend_rank_window = Window.partitionBy("customer_state").orderBy(col("tot").desc())

# Deliberately not named `cos`: that name is the cosine column function
# provided by the star import of pyspark.sql.functions and must stay usable.
customer_spend_ranked = customer_spend.withColumn(
    "ranking", rank().over(spend_rank_window)
)

customer_spend_ranked.show()



                                                                                

+--------------------+--------------+-------+-------+
|         customer_id|customer_state|    tot|ranking|
+--------------------+--------------+-------+-------+
|25dcca1d4dd5e5ae8...|            AM|1853.75|      1|
|3a486addcf71802e8...|            AM|1384.74|      2|
|19faaa8953bbd5166...|            AM|1259.04|      3|
|75356ef9427992199...|            AM| 725.69|      4|
|af0e505d980484f4c...|            AM| 638.66|      5|
|7afffca41e83ce5da...|            AM| 637.09|      6|
|1434c23ad06a9986f...|            AM| 628.18|      7|
|53e9e25ac4476a44a...|            AM| 604.76|      8|
|f21af0b30cca19e2f...|            AM| 592.49|      9|
|b9374bfb8d4215e1f...|            AM| 545.08|     10|
|b67cda5c14480b995...|            AM| 517.20|     11|
|389efb68674e38efb...|            AM| 488.16|     12|
|9eec05db89d3d8863...|            AM| 447.09|     13|
|b482b6eb1e1daa52f...|            AM| 420.13|     14|
|6b170d60eabcac609...|            AM| 371.19|     15|
|2009e285643a9f9aa...|      

In [11]:
# Q7.2: Find the top 3 products by revenue in each category
# Revenue per product, carried together with the product's category name.
product_revenue = (
    products_df
    .join(order_items_df, "product_id", "inner")
    .groupBy("product_id", "product_category_name")
    .agg(sum("total_value").alias("tot"))
)

# Rank products inside each category, highest revenue first.
window_spec = Window.partitionBy("product_category_name").orderBy(col("tot").desc())
ranked_products = product_revenue.withColumn("ranking", rank().over(window_spec))

# Keep only ranks 1 to 3 of every category.
top_3_products_by_category = ranked_products.filter(col("ranking") <= 3)

# Display the result
top_3_products_by_category.show()



+--------------------+---------------------+--------+-------+
|          product_id|product_category_name|     tot|ranking|
+--------------------+---------------------+--------+-------+
|5a848e4ab52fd5445...|                 NULL|27286.05|      1|
|eed5cbd74fac3bd79...|                 NULL|10223.82|      2|
|b1d207586fca400a2...|                 NULL| 7689.86|      3|
|11250b0d4b709fee9...| agro industria e ...| 9710.48|      1|
|423a6644f0aa529e8...| agro industria e ...| 8609.73|      2|
|672e757f331900b9d...| agro industria e ...| 7355.34|      3|
|73326828aa5efe1ba...|            alimentos| 5092.34|      1|
|89321f94e35fc6d79...|            alimentos| 4883.44|      2|
|ed2067a9c1f795530...|            alimentos| 4027.24|      3|
|992197904e1d4f0bf...|    alimentos bebidas| 2859.77|      1|
|90f97298579cd2041...|    alimentos bebidas| 1605.26|      2|
|84f5c4f480ad6c999...|    alimentos bebidas| 1400.43|      3|
|4fe644d766c7566db...|                artes|12852.22|      1|
|1bdf5e6

                                                                                

In [12]:
# Q7.3: Calculate the 25th, 50th, and 75th percentile of order values by month
# Order value = sum of the item totals that belong to one order.
# NOTE(review): month() yields the month-of-year (1-12), so the same month of
# different years is pooled together; confirm that is the intended grouping.
order_items_joined = orders_df.join(order_items_df, "order_id", "inner")
order_values = order_items_joined.groupBy(
    "order_id", month("order_purchase_timestamp").alias("order_month")
).agg(
    sum("total_value").alias("total_order_value")
)

# The percentiles are plain grouped aggregates, so no Window specification is
# involved. The third argument (1000) is percentile_approx's accuracy setting.
monthly_percentiles = order_values.groupBy("order_month").agg(
    percentile_approx(col("total_order_value"), lit(0.25), 1000).alias("p25_order_value"),
    percentile_approx(col("total_order_value"), lit(0.50), 1000).alias("p50_order_value"),  # Median
    percentile_approx(col("total_order_value"), lit(0.75), 1000).alias("p75_order_value")
).orderBy("order_month")

monthly_percentiles.show()





+-----------+---------------+---------------+---------------+
|order_month|p25_order_value|p50_order_value|p75_order_value|
+-----------+---------------+---------------+---------------+
|          1|          60.59|         104.87|         175.32|
|          2|          58.62|         102.96|         170.43|
|          3|          60.79|         105.50|         179.04|
|          4|          62.77|         108.21|         184.27|
|          5|          63.13|         106.30|         176.83|
|          6|          62.22|         107.44|         176.12|
|          7|          62.31|         105.32|         177.24|
|          8|          61.29|         100.44|         171.77|
|          9|          64.11|         104.37|         181.78|
|         10|          63.11|         106.30|         181.55|
|         11|          61.77|         103.55|         175.56|
|         12|          63.73|         106.17|         176.38|
+-----------+---------------+---------------+---------------+



                                                                                

In [18]:
#--- 1. Pre-process Geolocation Data ---
# Average lat/lng for each zip code prefix to get a unique coordinate set.
# The groupBy yields one row per geolocation_zip_code_prefix, so the zip-prefix
# joins further down match at most one geolocation row per customer/seller.
# `avg` is pyspark.sql.functions.avg (star import).
avg_geolocation_df = geolocation_df.groupBy("geolocation_zip_code_prefix").agg(
    avg("geolocation_lat").alias("avg_lat"),
    avg("geolocation_lng").alias("avg_lng")
)

# --- 2. Join Core Order, Customer, Payment, and Review Data using INNER JOIN ---
# Only orders with matches in all these core tables will be kept.
# Each input gets a short alias ("o", "c", "op", "orv") so that same-named
# columns such as order_id can be addressed unambiguously later on.
# NOTE(review): payments, reviews and (below) items are all joined on order_id;
# if an order has several rows in more than one of these tables the result
# holds every combination of them. Confirm before summing values downstream.
combined_df = orders_df.alias("o") \
    .join(customers_df.alias("c"), col("o.customer_id") == col("c.customer_id"), "inner") \
    .join(order_payments_df.alias("op"), col("o.order_id") == col("op.order_id"), "inner") \
    .join(order_reviews_df.alias("orv"), col("o.order_id") == col("orv.order_id"), "inner")

# --- 3. Join Order Items, Products, and Sellers Data using INNER JOIN ---
# Only records with matches in these will be kept.
# Products and sellers are reached through the order item ("oi") row.
combined_df = combined_df \
    .join(order_items_df.alias("oi"), col("o.order_id") == col("oi.order_id"), "inner") \
    .join(products_df.alias("p"), col("oi.product_id") == col("p.product_id"), "inner") \
    .join(sellers_df.alias("s"), col("oi.seller_id") == col("s.seller_id"), "inner")

# --- 4. Join Geolocation Data for Customers using INNER JOIN ---
# Only records where customer zip code has a geolocation match will be kept.
# avg_lat/avg_lng are renamed right after the join so that the second join of
# the same averaged table (step 5) does not collide with them.
combined_df = combined_df.join(
    avg_geolocation_df.alias("geo_cust"),
    col("c.customer_zip_code_prefix") == col("geo_cust.geolocation_zip_code_prefix"),
    "inner"
).withColumnRenamed("avg_lat", "customer_lat") \
 .withColumnRenamed("avg_lng", "customer_lng")

# --- 5. Join Geolocation Data for Sellers using INNER JOIN ---
# Only records where seller zip code has a geolocation match will be kept.
# Same averaged table as step 4, under a different alias ("geo_seller").
combined_df = combined_df.join(
    avg_geolocation_df.alias("geo_seller"),
    col("s.seller_zip_code_prefix") == col("geo_seller.geolocation_zip_code_prefix"),
    "inner"
).withColumnRenamed("avg_lat", "seller_lat") \
 .withColumnRenamed("avg_lng", "seller_lng")

# --- 6. Select and Rename Columns for Final Output ---
# This step is crucial to avoid duplicate column names and make the schema clean
# Columns are picked through the join aliases (o, c, oi, p, s, op, orv); the
# join keys that exist on both sides (order_id, customer_id, product_id,
# seller_id) are taken from one side only, so each appears once.
final_combined_df = combined_df.select(
    # Order Details
    col("o.order_id"),
    col("o.order_status"),
    col("o.order_purchase_timestamp"),
    col("o.order_approved_at"),
    col("o.order_delivered_carrier_date"),
    col("o.order_delivered_customer_date"),
    col("o.order_estimated_delivery_date"),

    # Customer Details
    col("c.customer_id"),
    col("c.customer_unique_id"),
    col("c.customer_zip_code_prefix"),
    col("c.customer_city"),
    col("c.customer_state"),
    col("customer_lat"), # from geo_cust join (renamed avg_lat)
    col("customer_lng"), # from geo_cust join (renamed avg_lng)

    # Order Item Details
    col("oi.order_item_id"),
    col("oi.product_id"),
    col("oi.price").alias("item_price"), # Renamed to avoid conflict with payment_value
    col("oi.freight_value").alias("item_freight_value"),
    col("oi.total_value").alias("item_total_value"), # total_value from order_items
    col("oi.shipping_limit_date"),
    col("oi.shipping_date"),
    col("oi.shipping_year"),
    col("oi.shipping_month"),

    # Product Details
    # NOTE(review): "lenght" is spelled this way in the column names read from
    # the products dataset; presumably inherited from upstream, verify before renaming.
    col("p.product_category_name"),
    col("p.product_name_lenght"),
    col("p.product_description_lenght"),
    col("p.product_photos_qty"),
    col("p.product_weight_g"),
    col("p.product_length_cm"),
    col("p.product_height_cm"),
    col("p.product_width_cm"),
    col("p.product_weight_kg"),
    col("p.product_category_clean"),

    # Seller Details
    col("s.seller_id"),
    col("s.seller_zip_code_prefix"),
    col("s.seller_city"),
    col("s.seller_state"),
    col("seller_lat"), # from geo_seller join (renamed avg_lat)
    col("seller_lng"), # from geo_seller join (renamed avg_lng)

    # Payment Details
    col("op.payment_sequential"),
    col("op.payment_type"),
    col("op.payment_installments"),
    col("op.payment_value").alias("order_payment_value"), # Renamed for clarity
    col("op.installment_value"),

    # Review Details
    col("orv.review_id"),
    col("orv.review_score"),
    col("orv.review_comment_title"),
    col("orv.review_comment_message"),
    col("orv.review_creation_date"),
    col("orv.review_answer_timestamp")
)

In [19]:
# Temporary GCS bucket handed to the BigQuery connector's "temporaryGcsBucket" option.
temp_gcs_bucket = "temp-buck-111"  # just the bucket name, without gs://

# Destination table for the BigQuery write cell. It is defined here because the
# write cell reads `bigquery_table`, and no other cell in the notebook assigns
# it; without this line a fresh Restart & Run All hits a NameError there.
bigquery_table = "fluid-shadow-459720-q0.olist_dataset.ecomm_data"

In [20]:
# Overwrite the destination BigQuery table with the combined DataFrame and
# report the outcome.
# NOTE(review): a failure is only printed, not re-raised, so the cells after
# this one still run when the write did not succeed.
try:
    bigquery_writer = (
        final_combined_df.write
        .format("bigquery")
        .option("temporaryGcsBucket", temp_gcs_bucket)
        .option("table", bigquery_table)
        .mode("overwrite")
    )
    bigquery_writer.save()
except Exception as e:
    print(f"❌ Error writing to BigQuery: {e}")
else:
    print(f"✅ Successfully wrote combined data to BigQuery table: {bigquery_table}")

                                                                                

✅ Successfully wrote combined data to BigQuery table: fluid-shadow-459720-q0.olist_dataset.ecomm_data


In [22]:
# Stop the Spark session; this is the notebook's last step, so nothing after it needs `spark`.
spark.stop()