In [16]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()

In [17]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

#PHASE 1 – Ingestion

In [18]:
traffic_csv=spark.read \
        .option("header",True)\
        .option("inferSchema",False) \
        .csv("traffic_data_large.csv")
traffic_csv.show(3)

+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
|sensor_id|location| road_name|vehicle_count|avg_speed|temperature|          timestamp|  status|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
|     S105| Chennai|       OMR|      invalid|     NULL|         39|12/01/2026 06:00:00|INACTIVE|
|     S113| Chennai|Mount Road|          103|     73.5|         36|2026-01-12 06:00:05|  ACTIVE|
|     S228|   Delhi|   Janpath|           16|     20.0|         35|2026-01-12 06:00:10|  ACTIVE|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
only showing top 3 rows


In [19]:
traffic_csv.printSchema()
traffic_csv.count()

root
 |-- sensor_id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- road_name: string (nullable = true)
 |-- vehicle_count: string (nullable = true)
 |-- avg_speed: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- status: string (nullable = true)



500000

#PHASE 2 – Cleaning

In [20]:
traffic_csv = traffic_csv.withColumn(
    "location",
    F.initcap(F.trim(F.col("location")))
)
traffic_csv = traffic_csv.withColumn(
    "road_name",
    F.initcap(F.trim(F.col("road_name")))
)
traffic_csv = traffic_csv.withColumn(
    "status",
     F.initcap(F.trim(F.col("status")))
)

1. Trim all string columns.

In [21]:
traffic_csv.show(3)

+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
|sensor_id|location| road_name|vehicle_count|avg_speed|temperature|          timestamp|  status|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
|     S105| Chennai|       Omr|      invalid|     NULL|         39|12/01/2026 06:00:00|Inactive|
|     S113| Chennai|Mount Road|          103|     73.5|         36|2026-01-12 06:00:05|  Active|
|     S228|   Delhi|   Janpath|           16|     20.0|         35|2026-01-12 06:00:10|  Active|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+
only showing top 3 rows


2. Clean vehicle_count:
Replace invalid and empty with null
Cast to IntegerType

In [22]:
traffic_csv = traffic_csv.withColumn(
    "vehicle_count",
    F.when(F.col("vehicle_count").rlike("^[0-9]+$"), F.col("vehicle_count").cast("int"))
     .otherwise(None)
)

3. Clean avg_speed:
Replace empty with null
Cast to DoubleType

In [23]:
traffic_csv.filter(F.col("vehicle_count").isNull()).count()

49873

In [24]:
traffic_csv=traffic_csv.withColumn(
        "avg_speed",
        F.when(
            F.regexp_replace(F.col("avg_speed"), "[^0-9.]", "") == "",
            None
        ).otherwise(
            F.regexp_replace(F.col("avg_speed"), "[^0-9.]", "").cast(DoubleType())
        )
)

Parse timestamp into:

event_time

with TimestampType supporting:
yyyy-MM-dd HH:mm:ss
dd/MM/yyyy HH:mm:ss
yyyy/MM/dd HH:mm:ss

In [25]:

from pyspark.sql.functions import to_timestamp, coalesce, col

traffic_csv =traffic_csv.withColumn(
    "event_time",
    coalesce(
        to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss"),
        to_timestamp("timestamp", "dd/MM/yyyy HH:mm:ss"),
        to_timestamp("timestamp", "yyyy/MM/dd HH:mm:ss"),
    )
)


In [26]:
spark.conf.set("spark.sql.ansi.enabled", "false")

In [27]:
traffic_csv.show(3)

+---------+--------+----------+-------------+---------+-----------+-------------------+--------+-------------------+
|sensor_id|location| road_name|vehicle_count|avg_speed|temperature|          timestamp|  status|         event_time|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+-------------------+
|     S105| Chennai|       Omr|         NULL|     NULL|         39|12/01/2026 06:00:00|Inactive|2026-01-12 06:00:00|
|     S113| Chennai|Mount Road|          103|     73.5|         36|2026-01-12 06:00:05|  Active|2026-01-12 06:00:05|
|     S228|   Delhi|   Janpath|           16|     20.0|         35|2026-01-12 06:00:10|  Active|2026-01-12 06:00:10|
+---------+--------+----------+-------------+---------+-----------+-------------------+--------+-------------------+
only showing top 3 rows


PHASE 3 – Validation

Count invalid vehicle_count rows.
Count invalid timestamp rows.

In [28]:

invalid_counts = traffic_csv.select(
    F.sum(col("vehicle_count").isNull().cast("int")).alias("invalid_vehicle_count"),
    F.sum(col("timestamp").isNull().cast("int")).alias("invalid_time_stamp"),
    F.sum(col("event_time").isNull().cast("int")).alias("invalid_time"),

)
print("Invalid Value Counts:")
invalid_counts.show()

Invalid Value Counts:
+---------------------+------------------+------------+
|invalid_vehicle_count|invalid_time_stamp|invalid_time|
+---------------------+------------------+------------+
|                49873|                 0|        4853|
+---------------------+------------------+------------+



3. Remove rows where:

status != "ACTIVE"

In [29]:
traffic_active=traffic_csv.filter(F.col("status")=="Active")

In [30]:
traffic_csv.count()


500000

In [31]:
traffic_active.count()

475000

1. Average speed per location.

In [32]:
avg_speed_per_loc=(
traffic_active.groupby("location")
                  .agg(F.avg(F.col("avg_speed")).alias("avg_speed_per_loc"))
                  .orderBy(F.col("avg_speed_per_loc").desc())
)
avg_speed_per_loc.show()

+---------+------------------+
| location| avg_speed_per_loc|
+---------+------------------+
|    Delhi|47.615486368277594|
|  Chennai|47.603255828145606|
|Hyderabad|47.536721714580516|
|   Mumbai| 47.47210939162663|
|Bangalore|47.455237585125005|
|  Kolkata| 47.43352094200893|
|     Pune|47.421643780726455|
+---------+------------------+



2. Total vehicle count per road.

In [33]:
vehicle_count_per_road=(
    traffic_active.groupBy("road_name")
    .agg(F.sum(F.col("vehicle_count")).alias("vehicle_count_per_road"))
    .orderBy(F.col("vehicle_count_per_road").desc()))
vehicle_count_per_road.show()

+---------------+----------------------+
|      road_name|vehicle_count_per_road|
+---------------+----------------------+
|  Outer Ring Rd|               1339365|
| Hitech City Rd|               1338486|
|           Nh48|               1335420|
|      Howrah Rd|               1334512|
|Western Express|               1334351|
|       Gst Road|               1333073|
|      Em Bypass|               1331117|
|     Mount Road|               1329511|
|  Gachibowli Rd|               1328605|
|      Ring Road|               1327408|
|Eastern Express|               1325865|
|    Madhapur Rd|               1324233|
|        Fc Road|               1322292|
|  University Rd|               1322004|
|  Whitefield Rd|               1320360|
|            Omr|               1317171|
|      Link Road|               1316848|
|    Park Street|               1310784|
|        Janpath|               1303498|
|        Mg Road|               1303485|
+---------------+----------------------+
only showing top

Peak traffic time per location.

In [34]:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

df_with_hour = traffic_active.withColumn(
    "hour_of_day",
    F.hour("event_time")
)

traffic_by_hour = df_with_hour.groupBy(
    "location", "hour_of_day"
).agg(
    F.sum("vehicle_count").alias("total_vehicle_count")
)

# Step 3: Window to rank hours by traffic per location
window_spec = Window.partitionBy("location").orderBy(
    F.desc("total_vehicle_count")
)

peak_traffic_per_location = traffic_by_hour.withColumn(
    "rank",
    F.row_number().over(window_spec)
).filter(
    F.col("rank") == 1
).drop("rank")

peak_traffic_per_location.show(truncate=False)


+---------+-----------+-------------------+
|location |hour_of_day|total_vehicle_count|
+---------+-----------+-------------------+
|Bangalore|4          |168298             |
|Chennai  |6          |169857             |
|Delhi    |22         |170234             |
|Hyderabad|3          |169636             |
|Kolkata  |0          |173778             |
|Mumbai   |17         |169548             |
|Pune     |1          |171539             |
+---------+-----------+-------------------+



In [35]:

from pyspark.sql import functions as F

congested_roads = traffic_active.groupBy(
    "location", "road_name"
).agg(
    F.avg("avg_speed").alias("avg_road_speed")
).orderBy(
    F.asc("avg_road_speed")
)

congested_roads.show(truncate=False)


+---------+---------------+------------------+
|location |road_name      |avg_road_speed    |
+---------+---------------+------------------+
|Kolkata  |Em Bypass      |47.305273508964476|
|Mumbai   |Link Road      |47.3713610932772  |
|Bangalore|Whitefield Rd  |47.40783038021416 |
|Pune     |Fc Road        |47.41019017359676 |
|Kolkata  |Howrah Rd      |47.41786602740956 |
|Bangalore|Mg Road        |47.42467508440872 |
|Pune     |University Rd  |47.42538415393851 |
|Pune     |Nagar Rd       |47.429443229917716|
|Chennai  |Gst Road       |47.44407903702021 |
|Hyderabad|Gachibowli Rd  |47.455523521874355|
|Mumbai   |Eastern Express|47.48790560471994 |
|Hyderabad|Hitech City Rd |47.51352853463453 |
|Bangalore|Outer Ring Rd  |47.53179303464533 |
|Chennai  |Omr            |47.548242457791424|
|Delhi    |Ring Road      |47.55594236047568 |
|Mumbai   |Western Express|47.55637391185471 |
|Kolkata  |Park Street    |47.579433971003745|
|Delhi    |Nh48           |47.613086790393126|
|Hyderabad|Ma

PHASE 5 – Window Functions
1. Rank roads by congestion (lowest speed).
2. For each location, rank roads by vehicle_count.
3. Identify top 3 congested roads per location.

In [36]:

road_speed_df = traffic_active.groupBy(
    "location", "road_name"
).agg(
    F.avg("avg_speed").alias("avg_road_speed")
)

congestion_rank_window = Window.orderBy(
    F.asc("avg_road_speed")
)

ranked_congestion = road_speed_df.withColumn(
    "congestion_rank",
    F.dense_rank().over(congestion_rank_window)
)

ranked_congestion.show(truncate=False)


+---------+---------------+------------------+---------------+
|location |road_name      |avg_road_speed    |congestion_rank|
+---------+---------------+------------------+---------------+
|Kolkata  |Em Bypass      |47.305273508964476|1              |
|Mumbai   |Link Road      |47.3713610932772  |2              |
|Bangalore|Whitefield Rd  |47.40783038021416 |3              |
|Pune     |Fc Road        |47.41019017359676 |4              |
|Kolkata  |Howrah Rd      |47.41786602740956 |5              |
|Bangalore|Mg Road        |47.42467508440872 |6              |
|Pune     |University Rd  |47.42538415393851 |7              |
|Pune     |Nagar Rd       |47.429443229917716|8              |
|Chennai  |Gst Road       |47.44407903702021 |9              |
|Hyderabad|Gachibowli Rd  |47.455523521874355|10             |
|Mumbai   |Eastern Express|47.48790560471994 |11             |
|Hyderabad|Hitech City Rd |47.51352853463453 |12             |
|Bangalore|Outer Ring Rd  |47.53179303464533 |13       

In [37]:

road_vehicle_df = traffic_active.groupBy(
    "location", "road_name"
).agg(
    F.sum("vehicle_count").alias("total_vehicle_count")
)

vehicle_rank_window = Window.partitionBy(
    "location"
).orderBy(
    F.desc("total_vehicle_count")
)

ranked_by_vehicles = road_vehicle_df.withColumn(
    "vehicle_rank",
    F.dense_rank().over(vehicle_rank_window)
)

ranked_by_vehicles.show(truncate=False)


+---------+---------------+-------------------+------------+
|location |road_name      |total_vehicle_count|vehicle_rank|
+---------+---------------+-------------------+------------+
|Bangalore|Outer Ring Rd  |1339365            |1           |
|Bangalore|Whitefield Rd  |1320360            |2           |
|Bangalore|Mg Road        |1303485            |3           |
|Chennai  |Gst Road       |1333073            |1           |
|Chennai  |Mount Road     |1329511            |2           |
|Chennai  |Omr            |1317171            |3           |
|Delhi    |Nh48           |1335420            |1           |
|Delhi    |Ring Road      |1327408            |2           |
|Delhi    |Janpath        |1303498            |3           |
|Hyderabad|Hitech City Rd |1338486            |1           |
|Hyderabad|Gachibowli Rd  |1328605            |2           |
|Hyderabad|Madhapur Rd    |1324233            |3           |
|Kolkata  |Howrah Rd      |1334512            |1           |
|Kolkata  |Em Bypass    

In [38]:

road_speed_df = traffic_active.groupBy(
    "location", "road_name"
).agg(
    F.avg("avg_speed").alias("avg_road_speed")
)

congestion_window = Window.partitionBy(
    "location"
).orderBy(
    F.asc("avg_road_speed")
)

top3_congested_roads = road_speed_df.withColumn(
    "congestion_rank",
    F.row_number().over(congestion_window)
).filter(
    F.col("congestion_rank") <= 3
)
top3_congested_roads.show(truncate=False)

+---------+---------------+------------------+---------------+
|location |road_name      |avg_road_speed    |congestion_rank|
+---------+---------------+------------------+---------------+
|Bangalore|Whitefield Rd  |47.40783038021416 |1              |
|Bangalore|Mg Road        |47.42467508440872 |2              |
|Bangalore|Outer Ring Rd  |47.53179303464533 |3              |
|Chennai  |Gst Road       |47.44407903702021 |1              |
|Chennai  |Omr            |47.548242457791424|2              |
|Chennai  |Mount Road     |47.8182051164706  |3              |
|Delhi    |Ring Road      |47.55594236047568 |1              |
|Delhi    |Nh48           |47.613086790393126|2              |
|Delhi    |Janpath        |47.67768541905838 |3              |
|Hyderabad|Gachibowli Rd  |47.455523521874355|1              |
|Hyderabad|Hitech City Rd |47.51352853463453 |2              |
|Hyderabad|Madhapur Rd    |47.64118452897408 |3              |
|Kolkata  |Em Bypass      |47.305273508964476|1        

#PHASE 6

In [53]:

traffic_window = Window.partitionBy(
    "location", "road_name"
).orderBy("timestamp")

speed_df = traffic_active.withColumn(
    "prev_avg_speed",
    F.lag("avg_speed").over(traffic_window)
)

speed_df = speed_df.withColumn(
    "speed_drop_pct",
    ((F.col("prev_avg_speed") - F.col("avg_speed"))
     / F.col("prev_avg_speed")) * 100
)

speed_anomalies = speed_df.filter(
    (F.col("speed_drop_pct") >= 30) &
    F.col("prev_avg_speed").isNotNull()
)

speed_anomalies.select(
    "location",
    "road_name",
    "event_time",
    "prev_avg_speed",
    "avg_speed",
    "speed_drop_pct"
).show(truncate=False)



+---------+---------+-------------------+--------------+---------+------------------+
|location |road_name|event_time         |prev_avg_speed|avg_speed|speed_drop_pct    |
+---------+---------+-------------------+--------------+---------+------------------+
|Bangalore|Mg Road  |2026-02-01 02:45:15|79.4          |28.5     |64.1057934508816  |
|Bangalore|Mg Road  |2026-02-01 07:47:30|67.8          |33.3     |50.88495575221239 |
|Bangalore|Mg Road  |2026-02-01 10:25:05|62.6          |37.0     |40.89456869009585 |
|Bangalore|Mg Road  |2026-02-01 18:41:05|73.2          |37.8     |48.360655737704924|
|Bangalore|Mg Road  |2026-02-01 18:54:00|78.0          |37.1     |52.43589743589744 |
|Bangalore|Mg Road  |2026-02-01 20:06:20|37.1          |24.9     |32.88409703504044 |
|Bangalore|Mg Road  |2026-02-02 00:27:15|77.8          |21.5     |72.3650385604113  |
|Bangalore|Mg Road  |2026-02-02 03:07:25|75.4          |35.5     |52.917771883289134|
|Bangalore|Mg Road  |2026-02-02 07:02:30|79.4         

In [54]:

vehicle_df = traffic_active.withColumn(
    "prev_vehicle_count",
    F.lag("vehicle_count").over(traffic_window)
)

vehicle_df = vehicle_df.withColumn(
    "vehicle_spike_pct",
    ((F.col("vehicle_count") - F.col("prev_vehicle_count"))
     / F.col("prev_vehicle_count")) * 100
)

vehicle_anomalies = vehicle_df.filter(
    (F.col("vehicle_spike_pct") >= 40) &
    F.col("prev_vehicle_count").isNotNull()
)

vehicle_anomalies.select(
    "location",
    "road_name",
    "timestamp",
    "prev_vehicle_count",
    "vehicle_count",
    "vehicle_spike_pct"
).show(truncate=False)



+---------+---------+-------------------+------------------+-------------+------------------+
|location |road_name|timestamp          |prev_vehicle_count|vehicle_count|vehicle_spike_pct |
+---------+---------+-------------------+------------------+-------------+------------------+
|Bangalore|Mg Road  |01/02/2026 06:22:15|10                |102          |919.9999999999999 |
|Bangalore|Mg Road  |01/02/2026 10:07:00|22                |53           |140.9090909090909 |
|Bangalore|Mg Road  |01/02/2026 10:25:05|29                |74           |155.17241379310346|
|Bangalore|Mg Road  |01/02/2026 13:20:45|11                |101          |818.1818181818181 |
|Bangalore|Mg Road  |01/02/2026 16:19:00|17                |46           |170.58823529411765|
|Bangalore|Mg Road  |01/02/2026 18:48:50|73                |106          |45.20547945205479 |
|Bangalore|Mg Road  |01/02/2026 21:00:35|32                |104          |225.0             |
|Bangalore|Mg Road  |01/02/2026 22:23:15|69                |

phase 7

In [39]:


print(f"Number of partitions: {traffic_active.rdd.getNumPartitions()}")

Number of partitions: 2


In [42]:
top3_congested_roads.explain(True)

== Parsed Logical Plan ==
'Filter '`<=`('congestion_rank, 3)
+- Project [location#230, road_name#231, avg_road_speed#585, congestion_rank#596]
   +- Project [location#230, road_name#231, avg_road_speed#585, congestion_rank#596, congestion_rank#596]
      +- Window [row_number() windowspecdefinition(location#230, avg_road_speed#585 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS congestion_rank#596], [location#230], [avg_road_speed#585 ASC NULLS FIRST]
         +- Project [location#230, road_name#231, avg_road_speed#585]
            +- Aggregate [location#230, road_name#231], [location#230, road_name#231, avg(avg_speed#280) AS avg_road_speed#585]
               +- Filter (status#232 = Active)
                  +- Project [sensor_id#176, location#230, road_name#231, vehicle_count#266, avg_speed#280, temperature#181, timestamp#182, status#232, coalesce(to_timestamp(timestamp#182, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), to_

In [43]:
traffic_active=traffic_active.repartition("location")

In [56]:
traffic_active.cache()
traffic_active.count()

475000

In [47]:
top3_congested_roads.explain(True)

== Parsed Logical Plan ==
'Filter '`<=`('congestion_rank, 3)
+- Project [location#230, road_name#231, avg_road_speed#585, congestion_rank#596]
   +- Project [location#230, road_name#231, avg_road_speed#585, congestion_rank#596, congestion_rank#596]
      +- Window [row_number() windowspecdefinition(location#230, avg_road_speed#585 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS congestion_rank#596], [location#230], [avg_road_speed#585 ASC NULLS FIRST]
         +- Project [location#230, road_name#231, avg_road_speed#585]
            +- Aggregate [location#230, road_name#231], [location#230, road_name#231, avg(avg_speed#280) AS avg_road_speed#585]
               +- Filter (status#232 = Active)
                  +- Project [sensor_id#176, location#230, road_name#231, vehicle_count#266, avg_speed#280, temperature#181, timestamp#182, status#232, coalesce(to_timestamp(timestamp#182, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), to_

In [48]:
rdd_traffic=traffic_active.rdd

In [58]:

from pyspark.sql import functions as F
from pyspark.sql.types import LongType, IntegerType

vehicle_count_clean_df = (
    traffic_active
    .select(F.col("vehicle_count").cast(LongType()).alias("vehicle_count"))
    .filter(F.col("vehicle_count").isNotNull())
)

vehicle_count_rdd = vehicle_count_clean_df.rdd.map(lambda r: r["vehicle_count"])

total_vehicle_count = 0
if not vehicle_count_rdd.isEmpty():
    total_vehicle_count = vehicle_count_rdd.reduce(lambda a, b: a + b)

print(f"Total vehicle count = {total_vehicle_count}")


Total vehicle count = 27793397


In [60]:

per_loc_count_rdd = traffic_active.select("location").rdd.map(lambda r: (r["location"], 1))

counts_per_location = per_loc_count_rdd.reduceByKey(lambda a, b: a + b)

for loc, cnt in counts_per_location.take(20):
    print(loc, cnt)


Mumbai 68054
Delhi 68014
Kolkata 67978
Bangalore 67658
Pune 67212
Chennai 67919
Hyderabad 68165


Explain why DataFrames are better for this case.

Faster execution: DataFrames use Spark’s optimizer and vectorized execution, making aggregations much faster than RDD map‑reduce. \
Lower Python overhead: Operations run on the JVM, avoiding per‑row Python function calls. \
Cleaner & safer code: Declarative APIs (groupBy, sum, count) are shorter and less error‑prone. \
Built‑in optimizations: Automatic column pruning and predicate pushdown improve I/O and memory efficiency.

phase 10

In [49]:
traffic_active.write.mode('overwrite').partitionBy('location').parquet('/data/traffic_data_cleaned.parquet')


In [50]:
ranked_congestion.write.mode("overwrite").orc("/data/congestion.orc")

In [51]:

print("Reading back Parquet data...")
df_parquet = spark.read.parquet('/data/traffic_data_cleaned.parquet')
df_parquet.printSchema()
df_parquet.show(5, truncate=False)

print("Reading back ORC data...")
df_congestion_orc = spark.read.orc('/data/congestion.orc')
df_congestion_orc.printSchema()
df_congestion_orc.show(5, truncate=False)

Reading back Parquet data...
root
 |-- sensor_id: string (nullable = true)
 |-- road_name: string (nullable = true)
 |-- vehicle_count: integer (nullable = true)
 |-- avg_speed: double (nullable = true)
 |-- temperature: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- status: string (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- location: string (nullable = true)

+---------+--------------+-------------+---------+-----------+-------------------+------+-------------------+---------+
|sensor_id|road_name     |vehicle_count|avg_speed|temperature|timestamp          |status|event_time         |location |
+---------+--------------+-------------+---------+-----------+-------------------+------+-------------------+---------+
|S197     |Hitech City Rd|97           |44.4     |26         |2026-01-12 06:01:20|Active|2026-01-12 06:01:20|Hyderabad|
|S237     |Gachibowli Rd |90           |57.1     |39         |2026-01-12 06:01:30|Active|2026-01-12 06:01:30|