In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from pyspark.sql.functions import *

In [0]:
# Input CSV locations, given as file: URIs under /Workspace/Shared.
# traffic_path is read into traffic_df; registry_path is read into vehicle_registry_df.
traffic_path = "file:/Workspace/Shared/traffic_logs.csv"
registry_path = "file:/Workspace/Shared/vehicle_registry.csv"


In [0]:
# Explicit schema used when reading the traffic log CSV. Every column is
# nullable; the entry/exit times are timestamps, speed and toll are doubles,
# and all remaining columns are strings.
manual_schema = (
    StructType()
    .add("LogID", StringType(), True)
    .add("VehicleID", StringType(), True)
    .add("EntryPoint", StringType(), True)
    .add("ExitPoint", StringType(), True)
    .add("EntryTime", TimestampType(), True)
    .add("ExitTime", TimestampType(), True)
    .add("VehicleType", StringType(), True)
    .add("SpeedKMH", DoubleType(), True)
    .add("TollPaid", DoubleType(), True)
)

In [0]:
# Read the traffic log CSV with the explicit schema (the first row is a header),
# then print a preview of the loaded rows.
traffic_df = spark.read.csv(traffic_path, schema=manual_schema, header=True)
traffic_df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.0|    70.0|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+



In [0]:
# Derived per-trip columns:
#   TripDurationMinutes - seconds between EntryTime and ExitTime, divided by 60
#   IsOverspeed         - true when SpeedKMH is strictly greater than 60
trip_seconds = unix_timestamp("ExitTime") - unix_timestamp("EntryTime")
traffic_df = (
    traffic_df
    .withColumn("TripDurationMinutes", trip_seconds / 60)
    .withColumn("IsOverspeed", col("SpeedKMH") > 60)
)


In [0]:
# Summary aggregations. These only define DataFrames; nothing is displayed here.

# Mean SpeedKMH for each vehicle type.
avg_speed = (
    traffic_df
    .groupBy("VehicleType")
    .agg(avg("SpeedKMH").alias("AvgSpeed"))
)

# Total toll collected per entry gate.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
total_toll_per_gate = (
    traffic_df
    .groupBy("EntryPoint")
    .agg(sum("TollPaid").alias("TotalToll"))
)

# Number of trips per exit gate, busiest gate first.
most_used_exit = (
    traffic_df
    .groupBy("ExitPoint")
    .count()
    .orderBy(col("count").desc())
)


In [0]:
from pyspark.sql.window import Window

# Window over each vehicle type, fastest trip first (used for SpeedRank).
speed_rank_window = (
    Window
    .partitionBy("VehicleType")
    .orderBy(col("SpeedKMH").desc())
)

# Window over each vehicle's trips in entry-time order (used for LastExitTime).
vehicle_window = (
    Window
    .partitionBy("VehicleID")
    .orderBy("EntryTime")
)

# SpeedRank: rank() of the trip's speed within its vehicle type.
# LastExitTime: ExitTime of the same vehicle's previous trip; lag() yields null
# when there is no previous row in the partition.
traffic_df = (
    traffic_df
    .withColumn("SpeedRank", rank().over(speed_rank_window))
    .withColumn("LastExitTime", lag("ExitTime").over(vehicle_window))
)



In [0]:

# IdleTimeMinutes: minutes between the vehicle's previous exit (LastExitTime)
# and the start of this trip. The result is null wherever LastExitTime is null.
idle_seconds = unix_timestamp("EntryTime") - unix_timestamp("LastExitTime")
traffic_df = traffic_df.withColumn("IdleTimeMinutes", idle_seconds / 60)


In [0]:
# Candidate anomalies, each kept as its own filtered DataFrame.

# Fast and very short: speed above 70 on a trip shorter than 10 minutes.
anomaly_speed = traffic_df.where(
    (col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)
)

# Long but cheap: trip longer than 30 minutes with a toll below 40.
anomaly_toll = traffic_df.where(
    (col("TripDurationMinutes") > 30) & (col("TollPaid") < 40)
)

# Example backtracking logic: the entry gate name compares greater than the
# exit gate name. Both columns are strings, so this is a string comparison.
anomaly_backtrack = traffic_df.where("EntryPoint > ExitPoint")


In [0]:
# Explicit schema for the vehicle registry CSV: four nullable string columns.
registry_schema = (
    StructType()
    .add("VehicleID", StringType(), True)
    .add("OwnerName", StringType(), True)
    .add("Model", StringType(), True)
    .add("RegisteredCity", StringType(), True)
)

# Load the registry (the first row is a header).
vehicle_registry_df = spark.read.csv(
    registry_path,
    schema=registry_schema,
    header=True,
)

# Left join keeps every trip; registry columns are null for vehicles that have
# no matching VehicleID in the registry.
enriched_df = traffic_df.join(vehicle_registry_df, "VehicleID", "left")

# Number of trips per registered city.
trips_by_city = enriched_df.groupBy("RegisteredCity").count()


In [0]:
# Persist the enriched trips in Delta format, replacing whatever is already
# stored at this path.
delta_path = "/mnt/data/traffic_delta"
enriched_df.write.save(delta_path, format="delta", mode="overwrite")

# Open a DeltaTable handle on the same path for the update / delete / history
# operations that follow.
from delta.tables import DeltaTable
delta_table = DeltaTable.forPath(spark, delta_path)

In [0]:
# Set TollPaid to 40 on every Bike row of the Delta table. This changes the
# stored table only; the in-memory enriched_df is not modified.
delta_table.update(
    condition="VehicleType = 'Bike'",
    set={"TollPaid": "40"},
)

In [0]:
# Delete rows from the Delta table whose trip lasted more than 60 minutes
# (duration recomputed from the stored entry/exit timestamps).
delta_table.delete(
    condition="(unix_timestamp(ExitTime) - unix_timestamp(EntryTime))/60 > 60"
)

In [0]:
# Print the commit history recorded for the Delta table.
delta_table.history().show()

# Time travel: read the table contents as of version 0.
version_0_df = (
    spark.read
    .format("delta")
    .options(versionAsOf=0)
    .load(delta_path)
)


+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2025-06-19 04:40:29|7868838587549447|azuser3557_mml.lo...|   DELETE|{predicate -> ["(...|NULL|{1222209826929585}|0619-042535-5t46f450|          2|WriteSerializable|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      2|2025-06-19 04:40:21

In [0]:
# Label each trip by its duration:
#   under 15 minutes     -> "Short"
#   15 to 30 minutes     -> "Medium"
#   more than 30 minutes -> "Long"
# "Long" is an explicit condition rather than an otherwise() fallback. A
# comparison against a null duration (null entry or exit time) is not true, so
# a fallback would silently label such trips "Long". With no otherwise(), rows
# that match no branch get a null TripType instead.
enriched_df = enriched_df.withColumn("TripType",
    when(col("TripDurationMinutes") < 15, "Short")
    .when(col("TripDurationMinutes") <= 30, "Medium")
    .when(col("TripDurationMinutes") > 30, "Long")
)

# Trips per vehicle per calendar day (date taken from EntryTime). A vehicle
# with more than 3 trips on one day is flagged "Frequent", otherwise "Normal".
trip_daily = enriched_df.withColumn("TripDate", to_date("EntryTime")) \
    .groupBy("VehicleID", "TripDate") \
    .agg(count("*").alias("DailyTripCount")) \
    .withColumn("TripFlag", when(col("DailyTripCount") > 3, "Frequent").otherwise("Normal"))


In [0]:
# Output locations, given as file: URIs under /Workspace/Shared.
# enriched_df is written to output_parquet as Parquet and to output_csv as CSV.
output_parquet = "file:/Workspace/Shared/final_output_parquet"
output_csv = "file:/Workspace/Shared/final_output_csv"


In [0]:
# Parquet copy, partitioned by VehicleType; replaces any existing output.
(
    enriched_df.write
    .mode("overwrite")
    .partitionBy("VehicleType")
    .parquet(output_parquet)
)

# CSV copy with a header row; replaces any existing output.
enriched_df.write.csv(output_csv, mode="overwrite", header=True)


In [0]:
# Register the enriched trips as a temp view so they can be queried with SQL.
enriched_df.createOrReplaceTempView("traffic_view")

# Total toll per (vehicle type, exit gate) pair. The query has no ORDER BY, so
# the row order of the printed result is not guaranteed.
summary_sql = spark.sql(
    """
SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
FROM traffic_view
GROUP BY VehicleType, ExitPoint
"""
)
summary_sql.show()

+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|        Car|    GateD|     50.0|
|      Truck|    GateC|    100.0|
|       Bike|    GateD|     30.0|
|        Bus|    GateA|     70.0|
|        Car|    GateC|     50.0|
+-----------+---------+---------+

