In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# NOTE(review): this cell relies on a `spark` session that already exists (e.g. one
# injected by the notebook runtime); on a plain kernel the session must be created first.
df = spark.sql("SELECT * FROM workspace.default.traffic_30")

# Spark's DataFrame.head() with no argument returns only the first Row rather than a
# tabular preview, so pull a small sample to pandas for a readable table instead.
df.limit(5).toPandas()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, avg
import matplotlib.pyplot as plt
import seaborn as sns

# Obtain (or reuse) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("TrafficAnalysis")
    .getOrCreate()
)

# Read the full traffic table
df = spark.sql("SELECT * FROM workspace.default.traffic_30")

# Delay percentage: extra in-traffic time relative to the baseline duration, times 100
extra_time = col("duration_in_traffic_value") - col("duration_value")
df = df.withColumn("delay_percentage", (extra_time / col("duration_value")) * 100)

# Mean delay percentage for each origin, collected to pandas for plotting
delay_by_origin = df.groupBy("origin").agg(
    avg("delay_percentage").alias("avg_delay_percentage")
)
avg_delay = delay_by_origin.toPandas()

# Bar chart of the per-origin averages
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='origin', y='avg_delay_percentage', data=avg_delay, palette='coolwarm', ax=ax)
ax.set_title("Overall Delay Risk Bar Chart (All Days & Times)")
ax.set_xlabel("Route (Origin)")
ax.set_ylabel("Average Delay Risk (%)")
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [0]:
# Select key routes
routes = ["Gachibowli, Hyderabad", "Madhapur, Hyderabad"]
df_selected = df.filter(col("origin").isin(routes))

# Average in-traffic duration per (time slot, origin).
# groupBy gives no ordering guarantee, and depending on the seaborn version string
# x values may be laid out in order of first appearance, so order the rows explicitly
# to keep the time axis stable.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
avg_route_delay = (
    df_selected.groupBy("time", "origin")
    .agg(avg("duration_in_traffic_value").alias("avg_duration"))
    .orderBy("time", "origin")
    .toPandas()
)

# Plot one line per selected origin
plt.figure(figsize=(10, 6))
sns.lineplot(x='time', y='avg_duration', hue='origin', data=avg_route_delay, marker='o')
plt.title("Overall Route Comparison Line Chart (All Time Slots)")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.legend(title="Route")
plt.xticks(rotation=45)
plt.show()


In [0]:
# Estimate CO₂ emissions as a fixed multiple of the in-traffic duration
# NOTE(review): the source and unit of the 0.5 factor are not shown here; the chart
# labels the result as grams - confirm.
emission_factor = 0.5
df = df.withColumn("co2_emission", col("duration_in_traffic_value") * emission_factor)

# Per-origin means of duration and estimated emissions, collected to pandas
per_origin = df.groupBy("origin").agg(
    avg("duration_in_traffic_value").alias("avg_duration"),
    avg("co2_emission").alias("avg_co2"),
)
avg_co2_delay = per_origin.toPandas()

# Bubble chart: marker area is driven by the average emission value
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    avg_co2_delay['avg_duration'],
    avg_co2_delay['avg_co2'],
    s=avg_co2_delay['avg_co2'],
    alpha=0.6,
)
ax.set_title("Overall CO₂ vs Delay Bubble Chart")
ax.set_xlabel("Avg Travel Time (seconds)")
ax.set_ylabel("Avg CO₂ Emission (grams)")
ax.grid(True)
plt.show()


In [0]:
# Assume delays above 2400 sec indicate rain impact
df = df.withColumn("rain_intensity", expr("CASE WHEN duration_in_traffic_value > 2400 THEN 1 ELSE 0 END"))

# Aggregate per time slot: the mean duration plus the share of rows flagged as rain.
# The shading mask must have exactly one entry per plotted slot; a mask built from the
# row-level frame has a different length than the aggregated x/y arrays.
# Rows are ordered because groupBy gives no ordering guarantee.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
avg_rain_impact = (
    df.groupBy("time")
    .agg(
        avg("duration_in_traffic_value").alias("avg_duration"),
        avg("rain_intensity").alias("rain_share"),
    )
    .orderBy("time")
    .toPandas()
)

# A slot counts as rain-affected when at least one of its rows exceeded the threshold
rain_mask = avg_rain_impact["rain_share"] > 0

# Plot
plt.figure(figsize=(10, 6))
sns.lineplot(x='time', y='avg_duration', data=avg_rain_impact, marker='o')
# fill_between only shades spans between neighbouring flagged slots
plt.fill_between(avg_rain_impact['time'], avg_rain_impact['avg_duration'], alpha=0.3, color='blue', where=rain_mask)
plt.title("Overall Rain Impact on Traffic")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.show()


In [0]:
# Assume delays above 2400 sec indicate rain impact
df = df.withColumn("rain_intensity", expr("CASE WHEN duration_in_traffic_value > 2400 THEN 1 ELSE 0 END"))

# Aggregate per time slot: the mean duration plus the share of rows flagged as rain.
# The shading mask must have exactly one entry per plotted slot; a mask built from the
# row-level frame has a different length than the aggregated x/y arrays.
# Rows are ordered because groupBy gives no ordering guarantee.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
avg_rain_impact = (
    df.groupBy("time")
    .agg(
        avg("duration_in_traffic_value").alias("avg_duration"),
        avg("rain_intensity").alias("rain_share"),
    )
    .orderBy("time")
    .toPandas()
)

# A slot counts as rain-affected when at least one of its rows exceeded the threshold
rain_mask = avg_rain_impact["rain_share"] > 0

# Plot
plt.figure(figsize=(10, 6))
sns.lineplot(x='time', y='avg_duration', data=avg_rain_impact, marker='o')
# fill_between only shades spans between neighbouring flagged slots
plt.fill_between(avg_rain_impact['time'], avg_rain_impact['avg_duration'], alpha=0.3, color='blue', where=rain_mask)
plt.title("Overall Rain Impact on Traffic")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.show()


In [0]:
# Aggregate average travel time per time slot.
# groupBy gives no ordering guarantee, so order the slots to get a stable x-axis.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
avg_departure_delay = (
    df.groupBy("time")
    .agg(avg("duration_in_traffic_value").alias("avg_duration"))
    .orderBy("time")
    .toPandas()
)

# Slot highlighted as the suggested departure time
suggested_slot = "16:30"
slot_order = avg_departure_delay["time"].tolist()
slot_labels = [str(slot) for slot in slot_order]

# Plot
plt.figure(figsize=(10, 10))
sns.barplot(x='time', y='avg_duration', data=avg_departure_delay, order=slot_order, palette='coolwarm')
# Bars sit at integer positions 0..n-1 following `slot_order`, so place the marker by
# index instead of relying on string-to-position conversion by the axis.
if suggested_slot in slot_labels:
    plt.axvline(x=slot_labels.index(suggested_slot), color='black', linestyle='--', label="Suggested Early Departure")
    plt.legend()
plt.title("Best Departure Time Analysis (Overall)")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.show()


In [0]:
# Activities with their approximate start times and the bar length drawn for each
# NOTE(review): values look like hours on a 24h clock - confirm.
activities = ["Morning Exercise", "Brunch", "Shopping", "Workshop", "Dinner"]
start_times = [7.5, 10.5, 13.5, 16.5, 19.5]  # Approx start times
durations = [2, 3, 2, 3, 2]

# Horizontal bars: each activity starts at its start time and extends by its duration
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(activities, durations, left=start_times, color='green')
ax.set_title("Overall Multi-Hop Travel Planner")
ax.set_xlabel("Time of Day")
ax.set_ylabel("Activity")
ax.grid(True)
plt.show()


In [0]:
import numpy as np

# Prepare weekly heatmap data (avg delay per time slot & day).
# DataFrame.pivot takes keyword arguments only on pandas >= 2.0, so name them
# explicitly; rows are dates, columns are time slots.
df_weekly = (
    df.groupBy("date", "time")
    .agg(avg("duration_in_traffic_value").alias("avg_delay"))
    .toPandas()
    .pivot(index="date", columns="time", values="avg_delay")
)

# Plot
plt.figure(figsize=(12, 6))
sns.heatmap(df_weekly, cmap="coolwarm", annot=False)
plt.title("Weekly Delay Heatmap (All Routes)")
plt.xlabel("Time of Day")
plt.ylabel("Day of Week")
plt.show()


In [0]:
# Mean in-traffic duration for every origin, collected to pandas for plotting
duration_by_origin = df.groupBy("origin").agg(
    avg("duration_in_traffic_value").alias("avg_duration")
)
avg_delay = duration_by_origin.toPandas()

# Horizontal bars keep the route names easy to read
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(y='origin', x='avg_duration', data=avg_delay, palette='Blues_r', ax=ax)
ax.set_title("Average Delay Per Route")
ax.set_xlabel("Avg Travel Time (seconds)")
ax.set_ylabel("Route")
plt.show()


In [0]:
# Estimate CO₂ emissions as a fixed multiple of the in-traffic duration
# NOTE(review): the source and unit of the 0.5 factor are not shown here; the chart
# labels the result as grams - confirm.
emission_factor = 0.5
df = df.withColumn("co2_emission", col("duration_in_traffic_value") * emission_factor)

# Mean estimated emission for each origin, collected to pandas
co2_by_origin = df.groupBy("origin").agg(avg("co2_emission").alias("avg_co2"))
avg_co2 = co2_by_origin.toPandas()

# Bar chart in a red palette to signal emissions
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='origin', y='avg_co2', data=avg_co2, palette='Reds_r', ax=ax)
ax.set_title("Avg CO₂ Emissions Per Route")
ax.set_ylabel("CO₂ Emission (grams)")
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [0]:
# Label every row 'Rainy' when its in-traffic duration exceeds 2400, otherwise 'Normal'.
# NOTE(review): the label is derived from the same column that is plotted on the
# y-axis, so the two boxes are separated at 2400 by construction.
rain_label = expr("CASE WHEN duration_in_traffic_value > 2400 THEN 'Rainy' ELSE 'Normal' END")
df = df.withColumn("rainy_condition", rain_label)

# Collect only the two columns the plot needs
df_pandas = df.select("rainy_condition", "duration_in_traffic_value").toPandas()

# Box plot of durations for each label
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(x='rainy_condition', y='duration_in_traffic_value', data=df_pandas, palette='coolwarm', ax=ax)
ax.set_title("Rain vs Normal Travel Delays")
ax.set_xlabel("Weather Condition")
ax.set_ylabel("Travel Time (seconds)")
plt.show()


In [0]:
# Average in-traffic duration per time slot.
# groupBy gives no ordering guarantee, so order the slots to get a stable x-axis.
# NOTE(review): assumes `time` holds strings that sort chronologically (e.g.
# zero-padded "HH:MM") - confirm against the table schema.
avg_departure_delay = (
    df.groupBy("time")
    .agg(avg("duration_in_traffic_value").alias("avg_duration"))
    .orderBy("time")
    .toPandas()
)

# Slot highlighted as the best departure time
best_slot = "16:30"
slot_labels = [str(slot) for slot in avg_departure_delay["time"]]

# Plot simple scatter plot
plt.figure(figsize=(8, 5))
sns.scatterplot(x='time', y='avg_duration', data=avg_departure_delay, color="blue")
# With string slots already in order, categories sit at positions 0..n-1, so place
# the marker by index and skip it when the slot is absent from the data.
if best_slot in slot_labels:
    plt.axvline(x=slot_labels.index(best_slot), color='black', linestyle='--', label="Best Departure")
    plt.legend()
plt.title("Best Departure Time (Lower is Better)")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.show()


In [0]:
# Average in-traffic duration per (date, time slot), reshaped to a date x time grid.
# DataFrame.pivot takes keyword arguments only on pandas >= 2.0, so name them explicitly.
df_weekly = (
    df.groupBy("date", "time")
    .agg(avg("duration_in_traffic_value").alias("avg_delay"))
    .toPandas()
    .pivot(index="date", columns="time", values="avg_delay")
)

plt.figure(figsize=(10, 5))
sns.heatmap(df_weekly, cmap="Blues", annot=False)
plt.title("Weekly Delay Heatmap")
plt.xlabel("Time of Day")
plt.ylabel("Day of Week")
plt.show()


In [0]:
# Travel Time Distribution Histogram
# Collect only the plotted column; converting the whole DataFrame would ship every
# column (including the derived ones added in earlier cells) to the driver.
travel_times = df.select("duration_in_traffic_value").toPandas()["duration_in_traffic_value"]

plt.figure(figsize=(8, 5))
sns.histplot(travel_times, bins=20, kde=True, color="blue")
plt.title("Distribution of Travel Times")
plt.xlabel("Travel Time (seconds)")
plt.ylabel("Frequency")
plt.show()


In [0]:
# Traffic Delay Box Plot per Route
# Collect only the two columns the plot uses instead of the whole DataFrame.
route_durations = df.select("origin", "duration_in_traffic_value").toPandas()

plt.figure(figsize=(8, 5))
sns.boxplot(x='origin', y='duration_in_traffic_value', data=route_durations, palette="coolwarm")
plt.title("Traffic Delay Box Plot per Route")
plt.xlabel("Route")
plt.ylabel("Travel Time (seconds)")
plt.xticks(rotation=45)
plt.show()


In [0]:
# Speed per row: distance divided by in-traffic duration
speed_expr = col("distance_value") / col("duration_in_traffic_value")
df_speed = df.withColumn("speed", speed_expr)

# Mean speed for each origin, collected to pandas
speed_by_origin = df_speed.groupBy("origin").agg(avg("speed").alias("avg_speed"))
avg_speed = speed_by_origin.toPandas()

# Bar chart of the per-origin average speed
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='origin', y='avg_speed', data=avg_speed, palette="Greens_r", ax=ax)
ax.set_title("Avg Travel Speed Per Route")
ax.set_xlabel("Route")
ax.set_ylabel("Speed (m/s)")
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [0]:
# Tag each row as 'Day' when its time falls between '06:00' and '18:00', else 'Night'
# NOTE(review): the BETWEEN comparison presumably operates on "HH:MM" strings - confirm
# the column type and zero-padding.
day_night_label = expr("CASE WHEN time BETWEEN '06:00' AND '18:00' THEN 'Day' ELSE 'Night' END")
df = df.withColumn("time_of_day", day_night_label)

# Mean in-traffic duration for each of the two groups
duration_by_period = df.groupBy("time_of_day").agg(
    avg("duration_in_traffic_value").alias("avg_duration")
)
df_day_night = duration_by_period.toPandas()

# Two-bar comparison
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x='time_of_day', y='avg_duration', data=df_day_night, palette="Set2", ax=ax)
ax.set_title("Day vs Night Travel Time Comparison")
ax.set_ylabel("Avg Travel Time (seconds)")
plt.show()


In [0]:
# Number of rows recorded for each origin, collected to pandas
rows_per_origin = df.groupBy("origin").count()
df_traffic = rows_per_origin.toPandas()

# Pie chart of each origin's share of the rows
fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    df_traffic["count"],
    labels=df_traffic["origin"],
    autopct="%1.1f%%",
    colors=sns.color_palette("pastel"),
)
ax.set_title("Traffic Volume Across Routes")
plt.show()


In [0]:
# Travel Time vs Distance Scatter Plot
# Collect once, and only the two plotted columns: calling toPandas() separately for x
# and y runs two full collects, and row order is not guaranteed to match between them.
distance_duration = df.select("distance_value", "duration_in_traffic_value").toPandas()

plt.figure(figsize=(8, 5))
sns.scatterplot(x="distance_value", y="duration_in_traffic_value", data=distance_duration, color="purple")
plt.title("Travel Time vs Distance")
plt.xlabel("Distance (meters)")
plt.ylabel("Travel Time (seconds)")
plt.show()


In [0]:
# Hourly Traffic Pattern Line Chart
# groupBy gives no ordering guarantee, and depending on the seaborn version string
# x values may be laid out in order of first appearance, so order the slots explicitly.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
df_hourly = (
    df.groupBy("time")
    .agg(avg("duration_in_traffic_value").alias("avg_duration"))
    .orderBy("time")
    .toPandas()
)

plt.figure(figsize=(9, 5))
sns.lineplot(x="time", y="avg_duration", data=df_hourly, marker="o", color="red")
plt.title("Hourly Traffic Pattern")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.xticks(rotation=45)
plt.show()



In [0]:
# Tag each row as 'Day' when its time falls between '06:00' and '18:00', else 'Night'
# NOTE(review): the BETWEEN comparison presumably operates on "HH:MM" strings - confirm
# the column type and zero-padding.
day_night_label = expr("CASE WHEN time BETWEEN '06:00' AND '18:00' THEN 'Day' ELSE 'Night' END")
df = df.withColumn("time_of_day", day_night_label)

# Collect only the grouping column and the plotted value
violin_columns = ["time_of_day", "duration_in_traffic_value"]
df_pandas = df.select(*violin_columns).toPandas()

# Violin plot of durations per group
# NOTE(review): split=True is passed without a hue variable; its effect may depend on
# the installed seaborn version - confirm the rendering is the intended one.
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(
    x="time_of_day",
    y="duration_in_traffic_value",
    data=df_pandas,
    palette="coolwarm",
    split=True,
    ax=ax,
)
ax.set_title("Day vs. Night Travel Time Violin Plot")
ax.set_xlabel("Time of Day")
ax.set_ylabel("Travel Time (seconds)")
plt.show()


In [0]:
# Hourly Traffic Trends Multi-Line Plot
# groupBy gives no ordering guarantee, and depending on the seaborn version string
# x values may be laid out in order of first appearance, so order the rows explicitly.
# NOTE(review): assumes `time` sorts chronologically (e.g. zero-padded "HH:MM"
# strings) - confirm against the table schema.
df_hourly = (
    df.groupBy("time", "origin")
    .agg(avg("duration_in_traffic_value").alias("avg_duration"))
    .orderBy("time", "origin")
    .toPandas()
)

plt.figure(figsize=(10, 6))
sns.lineplot(x="time", y="avg_duration", hue="origin", data=df_hourly, marker="o", palette="Dark2")
plt.title("Hourly Traffic Trends for Major Routes")
plt.xlabel("Time Slot")
plt.ylabel("Avg Travel Time (seconds)")
plt.legend(title="Route")
plt.xticks(rotation=45)
plt.show()


In [0]:
# Day vs Night Traffic Flow Heatmap
# Requires the `time_of_day` column added to `df` by an earlier cell.
# DataFrame.pivot takes keyword arguments only on pandas >= 2.0, so name them
# explicitly; rows are the day/night label, columns are the time slots.
df_heatmap = (
    df.groupBy("time_of_day", "time")
    .agg(avg("duration_in_traffic_value").alias("avg_delay"))
    .toPandas()
    .pivot(index="time_of_day", columns="time", values="avg_delay")
)

plt.figure(figsize=(10, 5))
sns.heatmap(df_heatmap, cmap="coolwarm", annot=False)
plt.title("Traffic Flow Heatmap – Day vs Night")
# Columns (x-axis) are time slots and rows (y-axis) are the day/night label
plt.xlabel("Time Slot")
plt.ylabel("Time of Day")
plt.show()


In [0]:
# Speed per row (distance divided by in-traffic duration), compared between the
# day/night groups; requires the `time_of_day` column added by an earlier cell
speed_expr = col("distance_value") / col("duration_in_traffic_value")
df_speed = df.withColumn("speed", speed_expr)

# Collect only the grouping column and the computed speed
df_speed_pandas = df_speed.select("time_of_day", "speed").toPandas()

# Box plot of speed per group
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x="time_of_day", y="speed", data=df_speed_pandas, palette="Set2", ax=ax)
ax.set_title("Speed Variation Between Day and Night")
ax.set_xlabel("Time of Day")
ax.set_ylabel("Avg Speed (m/s)")
plt.show()


In [0]:
# Traffic Congestion Tree Map
import squarify

# Mean in-traffic duration for every origin, collected to pandas
duration_by_origin = df.groupBy("origin").agg(
    avg("duration_in_traffic_value").alias("avg_duration")
)
df_congestion = duration_by_origin.toPandas()

# Tree map: rectangle area follows the average duration, labelled by origin
tile_colors = sns.color_palette("coolwarm")
fig, ax = plt.subplots(figsize=(8, 6))
squarify.plot(
    sizes=df_congestion["avg_duration"],
    label=df_congestion["origin"],
    color=tile_colors,
    ax=ax,
)
ax.set_title("Traffic Congestion Contribution Per Route")
plt.show()

