In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window,avg,max,count,sum,expr
from pyspark.sql.types import *

spark = (
    SparkSession.builder.appName("Incident-Escalations-Metrics").config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .config("spark.sql.shuffle.partitions", "6") \
    .getOrCreate())
spark.sparkContext.setLogLevel("WARN")
spark

In [2]:
escalation_schema = StructType([
    StructField("incident_id", StringType()),
    StructField("service", StringType()),
    StructField("type", StringType()),
    StructField("severity", StringType()),
    StructField("p95_latency", IntegerType(), True),
    StructField("breach_count", IntegerType(), True),
    StructField("window_start", LongType()),
    StructField("window_end", LongType()),
    StructField("escalation_reason", StringType(), True)
])

In [9]:
raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker2:29094")
    .option("subscribe", "incident_escalations")
    .option("startingOffsets", "latest")
    .load()
)

parsed = (
    raw.selectExpr("CAST(value AS STRING) AS json")
    .select(from_json(col("json"), escalation_schema).alias("data"))
    .select("data.*")
)

alerts = parsed.withColumn(
    "event_time",
    (col("window_end").cast("timestamp"))
)

# ---------------- Windowed Aggregation ----------------
agg_metrics = (
    alerts
    .withWatermark("event_time", "2 minutes")
    .groupBy(
        col("service"),
        window(col("event_time"), "1 minute")
    )
    .agg(
        count("*").alias("total_escalations"),
        count(expr("CASE WHEN type='LATENCY_SLO_BREACH' THEN 1 END")).alias("latency_breaches"),
        count(expr("CASE WHEN type='ERROR_RATE_SPIKE' THEN 1 END")).alias("error_spikes"),
        avg("p95_latency").alias("avg_p95_latency"),
        max("breach_count").alias("max_breach_count"),
        (sum(expr("CASE WHEN severity='CRITICAL' THEN 1 ELSE 0 END")) / count("*")).alias("critical_ratio")
    )
    .select(
        col("service"),
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "total_escalations",
        "latency_breaches",
        "error_spikes",
        "avg_p95_latency",
        "max_breach_count",
        "critical_ratio"
    )
)

def print_non_empty(batch_df, batch_id):
    if batch_df.count() > 0:
        print(f"\n========= BATCH {batch_id} =========")
        batch_df.show(truncate=False)

query = (
    agg_metrics
    .writeStream
    .outputMode("update")
    .trigger(processingTime="2 minutes")
    .foreachBatch(print_non_empty)
    .start()
)

query.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/venv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/venv/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 


+---------------+-------------------+-------------------+-----------------+----------------+------------+---------------+----------------+--------------+
|service        |window_start       |window_end         |total_escalations|latency_breaches|error_spikes|avg_p95_latency|max_breach_count|critical_ratio|
+---------------+-------------------+-------------------+-----------------+----------------+------------+---------------+----------------+--------------+
|order-service  |2026-01-06 06:00:00|2026-01-06 06:01:00|2                |1               |1           |652.5          |2               |0.0           |
|payment-service|2026-01-06 06:00:00|2026-01-06 06:01:00|1                |0               |1           |659.0          |1               |0.0           |
|auth-service   |2026-01-06 06:00:00|2026-01-06 06:01:00|1                |0               |1           |813.0          |3               |0.0           |
+---------------+-------------------+-------------------+-----------------+