In [0]:
%run ../config/pipeline_config

In [0]:
# Business Objectives
# 1. Total searches
# 2. Total expansion signals
# 3. Signal ratio
# 4. Apply threshold
# 5. Rank cities
# 6. Suggestion based on ranks and searches
# 7. Select Top N

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the silver table that feeds the gold aggregation.
# SILVER_TABLE_FQN is not defined in this notebook; presumably it comes from
# the %run'd pipeline_config - TODO confirm.
print("\nSTEP 1: Reading data from silver table")
df_silver = spark.table(SILVER_TABLE_FQN)

# count() triggers a Spark job; the result is only used for the log line below.
silver_count = df_silver.count()
print(f"Records in Silver: {silver_count:,}")


In [0]:
# Aggregate city-level metrics: one output row per distinct `city`.
print("\nSTEP 2: Aggregating city-level metrics...")

# 1 when the row's error_type is one of EXPANSION_ERROR_TYPES, otherwise 0.
# EXPANSION_ERROR_TYPES is not defined in this notebook; presumably it comes
# from the %run'd pipeline_config - TODO confirm.
expansion_flag = when(col("error_type").isin(EXPANSION_ERROR_TYPES), 1).otherwise(0)

# NOTE: `count` and `sum` are the pyspark.sql.functions versions pulled in by
# the star import, not the Python builtins.
df_gold_base = df_silver.groupBy("city").agg(
    count("*").alias("Total_Searches"),
    sum(expansion_flag).alias("expansion_signal_count"),
)

In [0]:
# Order cities by total searches, then by expansion signal count, both
# descending, so the busiest cities appear first when the frame is inspected.
sort_keys = [
    col("Total_Searches").desc(),
    col("expansion_signal_count").desc(),
]
df_gold_base = df_gold_base.orderBy(*sort_keys)

In [0]:
# Inspect the sorted city-level aggregates (total searches and expansion
# signal count). `display` is not imported in this notebook; presumably it is
# the notebook runtime's built-in renderer - TODO confirm.
display(df_gold_base)

In [0]:
# Signal ratio: expansion signals as a percentage of each city's total
# searches, rounded to two decimal places.
# NOTE: `round` is the pyspark.sql.functions column function brought in by the
# star import, not the Python builtin.
ratio_pct = (col("expansion_signal_count") / col("Total_Searches")) * 100
df_gold_base = df_gold_base.withColumn("signal_ratio_pct", round(ratio_pct, 2))

In [0]:
# Inspect the aggregates again, now including the signal_ratio_pct column.
# `display` is not imported in this notebook; presumably it is the notebook
# runtime's built-in renderer - TODO confirm.
display(df_gold_base)

In [0]:
# Minimum threshold: keep only cities whose total search count is at least
# MIN_SEARCHES_FOR_SIGNAL (not defined in this notebook; presumably supplied
# by the %run'd pipeline_config - TODO confirm).
print("\nStep 3: Applying minimum search threshold...")

# Reference the column with the exact casing it was created with
# ("Total_Searches"). A lowercase reference only resolves while
# spark.sql.caseSensitive is false and raises an AnalysisException otherwise.
df_gold_filtered = df_gold_base.filter(
    col("Total_Searches") >= MIN_SEARCHES_FOR_SIGNAL
)

In [0]:
# Rank the cities that passed the threshold.
print("\nStep 4: Ranking cities...")

# Window with an ordering but no partitioning, so ranks are assigned across
# the whole frame: highest expansion_signal_count first, then highest
# signal_ratio_pct.
ranking_window = Window.orderBy(
    col("expansion_signal_count").desc(), col("signal_ratio_pct").desc()
)

# dense_rank gives rows that tie on both ordering keys the same rank and
# leaves no gaps in the rank sequence.
city_rank = dense_rank().over(ranking_window)
df_gold_ranked = df_gold_filtered.withColumn("rank", city_rank)

In [0]:
# Top N: keep rows whose rank is within TOP_N_CITIES (not defined in this
# notebook; presumably supplied by the %run'd pipeline_config - TODO confirm).
# Because `rank` comes from dense_rank, tied cities share a rank, so this can
# keep more than TOP_N_CITIES rows.
within_top_n = col("rank") <= TOP_N_CITIES
df_gold_final = df_gold_ranked.filter(within_top_n)

In [0]:
# Preview the final Top-N result.
# Pass the DataFrame itself: DataFrame.show() prints a text table and returns
# None, so display(df.show()) ends up calling display(None).
display(df_gold_final)

In [0]:
# Persist the final frame as the gold Delta table.
# GOLD_TABLE_FQN and WRITE_MODE are not defined in this notebook; presumably
# they come from the %run'd pipeline_config - TODO confirm.
print(f"\n Writing into Gold table...{GOLD_TABLE_FQN}")

gold_writer = (
    df_gold_final.write
    .format("delta")
    .mode(WRITE_MODE)
    .option("overwriteSchema", "true")
)
gold_writer.saveAsTable(GOLD_TABLE_FQN)

print("Gold table created successfully!")

In [0]:
# Validation: read the gold table back and print it ordered by rank, with
# column values shown untruncated.
print("\n validation data in gold table output...")
df_verify = spark.table(GOLD_TABLE_FQN)

verify_by_rank = df_verify.orderBy("rank")
verify_by_rank.show(truncate=False)
