In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TrafficMonitoringSystem").getOrCreate()

df = spark.read.csv("file:/Workspace/Shared/traffic_logs.csv", header=True, inferSchema=True)
df.printSchema()


root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType

manual_schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", IntegerType(), True),
    StructField("TollPaid", IntegerType(), True)
])

df_manual = spark.read.csv("file:/Workspace/Shared/traffic_logs.csv", header=True, schema=manual_schema)
df_manual.printSchema()


root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)



In [0]:
# Derived Column Creation

# Calculate TripDurationMinutes = ExitTime - EntryTime

from pyspark.sql.functions import col, unix_timestamp, round

df=df.withColumn("TripDurationMinutes", round((unix_timestamp(col("ExitTime")) - unix_timestamp(col("EntryTime"))) / 60, 2))
df.display()

# Add IsOverspeed = SpeedKMH > 60

from pyspark.sql.functions import when

df.withColumn("IsOverspeed",when(col("SpeedKMH") > 60, True).otherwise(False)).show()


LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes
L001,V001,GateA,GateC,2024-05-01T08:01:00Z,2024-05-01T08:20:00Z,Car,60,50,19.0
L002,V002,GateB,GateC,2024-05-01T08:10:00Z,2024-05-01T08:45:00Z,Truck,45,100,35.0
L003,V003,GateA,GateD,2024-05-01T09:00:00Z,2024-05-01T09:18:00Z,Bike,55,30,18.0
L004,V004,GateC,GateD,2024-05-01T09:15:00Z,2024-05-01T09:35:00Z,Car,80,50,20.0
L005,V005,GateB,GateA,2024-05-01T10:05:00Z,2024-05-01T10:40:00Z,Bus,40,70,35.0


+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|       true|
| L005|     V

In [0]:
# Vehicle Behavior Aggregations

# Average speed per VehicleType
df.groupBy("VehicleType").avg("SpeedKMH").withColumnRenamed("avg(SpeedKMH)", "AvgSpeed").show()

# Total toll collected per gate (EntryPoint)
df.groupBy("EntryPoint").sum("TollPaid").withColumnRenamed("sum(TollPaid)", "TotalTollCollected").show()

# Most used ExitPoint
from pyspark.sql.functions import count

df.groupBy("ExitPoint").agg(count("*").alias("ExitUsageCount")) .orderBy(col("ExitUsageCount").desc()).show(1)



+-----------+--------+
|VehicleType|AvgSpeed|
+-----------+--------+
|       Bike|    55.0|
|        Car|    70.0|
|      Truck|    45.0|
|        Bus|    40.0|
+-----------+--------+

+----------+------------------+
|EntryPoint|TotalTollCollected|
+----------+------------------+
|     GateA|                80|
|     GateB|               170|
|     GateC|                50|
+----------+------------------+

+---------+--------------+
|ExitPoint|ExitUsageCount|
+---------+--------------+
|    GateD|             2|
+---------+--------------+
only showing top 1 row



In [0]:
# Window Functions

# Rank vehicles by speed within VehicleType

from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col

speed_window = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())

df_rank = df.withColumn("SpeedRank", rank().over(speed_window))

df_rank.select("LogID", "VehicleID", "VehicleType", "SpeedKMH", "SpeedRank").show()

# Find last exit time for each vehicle using lag()
from pyspark.sql.functions import lag

vehicle_window = Window.partitionBy("VehicleID").orderBy("ExitTime")

df_lag = df.withColumn("PreviousExitTime", lag("ExitTime").over(vehicle_window))

df_lag.select("LogID", "VehicleID", "ExitTime", "PreviousExitTime").show()




+-----+---------+-----------+--------+---------+
|LogID|VehicleID|VehicleType|SpeedKMH|SpeedRank|
+-----+---------+-----------+--------+---------+
| L003|     V003|       Bike|      55|        1|
| L005|     V005|        Bus|      40|        1|
| L004|     V004|        Car|      80|        1|
| L001|     V001|        Car|      60|        2|
| L002|     V002|      Truck|      45|        1|
+-----+---------+-----------+--------+---------+

+-----+---------+-------------------+----------------+
|LogID|VehicleID|           ExitTime|PreviousExitTime|
+-----+---------+-------------------+----------------+
| L001|     V001|2024-05-01 08:20:00|            NULL|
| L002|     V002|2024-05-01 08:45:00|            NULL|
| L003|     V003|2024-05-01 09:18:00|            NULL|
| L004|     V004|2024-05-01 09:35:00|            NULL|
| L005|     V005|2024-05-01 10:40:00|            NULL|
+-----+---------+-------------------+----------------+



In [0]:
# Session Segmentation
# Group by VehicleID to simulate route sessions
# Find duration between subsequent trips (idle time)

from pyspark.sql.window import Window
from pyspark.sql.functions import lag, unix_timestamp, round, col

# Partition by VehicleID and order by EntryTime to simulate trip sequence
session_window = Window.partitionBy("VehicleID").orderBy("EntryTime")

df_sessions = df.withColumn("PreviousExitTime", lag("ExitTime").over(session_window))

df_sessions = df_sessions.withColumn("IdleTime",
    round((unix_timestamp("EntryTime") - unix_timestamp("PreviousExitTime")) / 60, 2)
)

df_sessions.select(
    "LogID", "VehicleID", "EntryTime", "ExitTime", "PreviousExitTime", "IdleTime"
).orderBy("VehicleID", "EntryTime").show(truncate=False)


+-----+---------+-------------------+-------------------+----------------+--------+
|LogID|VehicleID|EntryTime          |ExitTime           |PreviousExitTime|IdleTime|
+-----+---------+-------------------+-------------------+----------------+--------+
|L001 |V001     |2024-05-01 08:01:00|2024-05-01 08:20:00|NULL            |NULL    |
|L002 |V002     |2024-05-01 08:10:00|2024-05-01 08:45:00|NULL            |NULL    |
|L003 |V003     |2024-05-01 09:00:00|2024-05-01 09:18:00|NULL            |NULL    |
|L004 |V004     |2024-05-01 09:15:00|2024-05-01 09:35:00|NULL            |NULL    |
|L005 |V005     |2024-05-01 10:05:00|2024-05-01 10:40:00|NULL            |NULL    |
+-----+---------+-------------------+-------------------+----------------+--------+



In [0]:
# Anomaly Detection
# Identify vehicles with speed > 70 and TripDuration < 10 minutes

df.filter((col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10))
df.select("LogID", "VehicleID", "SpeedKMH", "TripDurationMinutes").show()

# Vehicles that paid less toll for longer trips
from pyspark.sql.functions import avg, sum

toll_duration_df = df.groupBy("VehicleID").agg(
    avg("TripDurationMinutes").alias("AvgDuration"),
    sum("TollPaid").alias("TotalToll")
)
toll_duration_df.show()
suspicious_toll = toll_duration_df.filter((col("AvgDuration") > 30) & (col("TotalToll") < 50))
suspicious_toll.show()

# Suspicious backtracking (ExitPoint earlier than EntryPoint)
from pyspark.sql.functions import asc

backtrack = df.filter(col("ExitPoint") < col("EntryPoint"))
backtrack.select("LogID", "VehicleID", "EntryPoint", "ExitPoint").show()


+-----+---------+--------+-------------------+
|LogID|VehicleID|SpeedKMH|TripDurationMinutes|
+-----+---------+--------+-------------------+
| L001|     V001|      60|               19.0|
| L002|     V002|      45|               35.0|
| L003|     V003|      55|               18.0|
| L004|     V004|      80|               20.0|
| L005|     V005|      40|               35.0|
+-----+---------+--------+-------------------+

+---------+-----------+---------+
|VehicleID|AvgDuration|TotalToll|
+---------+-----------+---------+
|     V004|       20.0|       50|
|     V005|       35.0|       70|
|     V001|       19.0|       50|
|     V003|       18.0|       30|
|     V002|       35.0|      100|
+---------+-----------+---------+

+---------+-----------+---------+
|VehicleID|AvgDuration|TotalToll|
+---------+-----------+---------+
+---------+-----------+---------+

+-----+---------+----------+---------+
|LogID|VehicleID|EntryPoint|ExitPoint|
+-----+---------+----------+---------+
| L005|     V00

In [0]:
# 7. Join with Metadata
# Prepare a small vehicle_registry.csv :
# VehicleID,OwnerName,Model,RegisteredCity
# V001,Anil,Hyundai i20,Delhi
# V002,Rakesh,Tata Truck,Chennai
# V003,Sana,Yamaha R15,Mumbai
# V004,Neha,Honda City,Bangalore
# V005,Zoya,Volvo Bus,Pune
# Join and group trips by RegisteredCity
# Load vehicle registry CSV

registry_df = spark.read.csv("file:/Workspace/Shared/vehicle_registry.csv", header=True, inferSchema=True)

registry_df.printSchema()
registry_df.show()

# Perform an inner join on VehicleID
joined_df = df.join(registry_df, on="VehicleID", how="inner")

joined_df.select("LogID", "VehicleID", "OwnerName", "RegisteredCity", "EntryPoint", "ExitPoint").show()
from pyspark.sql.functions import count

joined_df.groupBy("RegisteredCity").agg(count("LogID").alias("TotalTrips")).orderBy("TotalTrips", ascending=False).show()



root
 |-- VehicleID: string (nullable = true)
 |-- OwnerName: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- RegisteredCity: string (nullable = true)

+---------+---------+-----------+--------------+
|VehicleID|OwnerName|      Model|RegisteredCity|
+---------+---------+-----------+--------------+
|     V001|     Anil|Hyundai i20|         Delhi|
|     V002|   Rakesh| Tata Truck|       Chennai|
|     V003|     Sana| Yamaha R15|        Mumbai|
|     V004|     Neha| Honda City|     Bangalore|
|     V005|     Zoya|  Volvo Bus|          Pune|
+---------+---------+-----------+--------------+

+-----+---------+---------+--------------+----------+---------+
|LogID|VehicleID|OwnerName|RegisteredCity|EntryPoint|ExitPoint|
+-----+---------+---------+--------------+----------+---------+
| L001|     V001|     Anil|         Delhi|     GateA|    GateC|
| L002|     V002|   Rakesh|       Chennai|     GateB|    GateC|
| L003|     V003|     Sana|        Mumbai|     GateA|    GateD|
| L

In [0]:
# 8. Delta Lake Features

# Save traffic_logs as Delta Table
df.write.format("delta").mode("overwrite").saveAsTable("traffic_logs")

# Apply MERGE INTO to update toll rates for all Bikes
from delta.tables import DeltaTable

# Load the Delta table
delta_table = DeltaTable.forName(spark, "traffic_logs")

#Apply MERGE INTO to update toll rates for all Bikes
bike_updates = df.filter(df.VehicleType == "Bike")
delta_table.alias("target").merge(
    source=bike_updates.alias("source"),
    condition="target.LogID = source.LogID"
).whenMatchedUpdate(set={
    "TollPaid": "40" }).execute()
    
# Delete trips longer than 60 minutes
delta_table.delete("TripDurationMinutes > 60")

# Use DESCRIBE HISTORY and VERSION AS OF
spark.sql("DESCRIBE HISTORY traffic_logs").show(truncate=False)

# Read previous version
spark.read.format("delta").option("versionAsOf", 0).table("traffic_logs").display()

spark.read.format('delta').table('traffic_logs').display()



+-------+-------------------+----------------+----------------------------------+---------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,TripType,TripDate
L001,V001,GateA,GateC,2024-05-01T08:01:00Z,2024-05-01T08:20:00Z,Car,60,50,19.0,Medium,2024-05-01
L002,V002,GateB,GateC,2024-05-01T08:10:00Z,2024-05-01T08:45:00Z,Truck,45,100,35.0,Long,2024-05-01
L003,V003,GateA,GateD,2024-05-01T09:00:00Z,2024-05-01T09:18:00Z,Bike,55,30,18.0,Medium,2024-05-01
L004,V004,GateC,GateD,2024-05-01T09:15:00Z,2024-05-01T09:35:00Z,Car,80,50,20.0,Medium,2024-05-01
L005,V005,GateB,GateA,2024-05-01T10:05:00Z,2024-05-01T10:40:00Z,Bus,40,70,35.0,Long,2024-05-01


LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,TripType,TripDate
L001,V001,GateA,GateC,2024-05-01T08:01:00Z,2024-05-01T08:20:00Z,Car,60,50,19.0,Medium,2024-05-01
L002,V002,GateB,GateC,2024-05-01T08:10:00Z,2024-05-01T08:45:00Z,Truck,45,100,35.0,Long,2024-05-01
L004,V004,GateC,GateD,2024-05-01T09:15:00Z,2024-05-01T09:35:00Z,Car,80,50,20.0,Medium,2024-05-01
L005,V005,GateB,GateA,2024-05-01T10:05:00Z,2024-05-01T10:40:00Z,Bus,40,70,35.0,Long,2024-05-01
L003,V003,GateA,GateD,2024-05-01T09:00:00Z,2024-05-01T09:18:00Z,Bike,55,40,18.0,Medium,2024-05-01


In [0]:
# Advanced Conditions
# when/otherwise : Tag trip type as:
# "Short" <15min
# "Medium" 15-30min
# "Long" >30min
from pyspark.sql.functions import when

df = df.withColumn("TripType", when(col("TripDurationMinutes") < 15, "Short")
                   .when((col("TripDurationMinutes") >= 15) & (col("TripDurationMinutes") <= 30), "Medium")
                   .otherwise("Long"))

df.select("LogID", "VehicleID", "TripDurationMinutes", "TripType").show()

# Flag vehicles with more than 3 trips in a day
from pyspark.sql.functions import to_date, count, col

df = df.withColumn("TripDate", to_date("EntryTime"))
trip_counts = df.groupBy("VehicleID", "TripDate").agg(count("*").alias("TripCount"))
trip_counts.filter(col("TripCount") > 3).show()



+-----+---------+-------------------+--------+
|LogID|VehicleID|TripDurationMinutes|TripType|
+-----+---------+-------------------+--------+
| L001|     V001|               19.0|  Medium|
| L002|     V002|               35.0|    Long|
| L003|     V003|               18.0|  Medium|
| L004|     V004|               20.0|  Medium|
| L005|     V005|               35.0|    Long|
+-----+---------+-------------------+--------+

+---------+--------+---------+
|VehicleID|TripDate|TripCount|
+---------+--------+---------+
+---------+--------+---------+



In [0]:
# Export & Reporting
# Write final enriched DataFrame to:

# Parquet partitioned by VehicleType
df.write.partitionBy("VehicleType").mode("overwrite").parquet("/mnt/data/traffic_logs_parquet/")

# CSV for dashboards
df.write.option("header", True).mode("overwrite").csv("/mnt/data/traffic_logs_dashboard_csv/")

# Create summary SQL View: total toll by VehicleType + ExitPoint
df.createOrReplaceTempView("traffic_logs_view")

spark.sql("""
    SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
    FROM traffic_logs_view
    GROUP BY VehicleType, ExitPoint
    ORDER BY VehicleType, ExitPoint
""").show()


+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|       Bike|    GateD|       30|
|        Bus|    GateA|       70|
|        Car|    GateC|       50|
|        Car|    GateD|       50|
|      Truck|    GateC|      100|
+-----------+---------+---------+

