In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

# Make seaborn's "whitegrid" style the plotting default before any figure is drawn.
sns.set(style="whitegrid")


In [None]:
def parse_duration_minutes(duration_text):
    """Convert textual travel durations to whole minutes.

    Parameters
    ----------
    duration_text : pandas.Series of str
        Values such as "25 mins", "1 hour 5 mins" or "2 hours".

    Returns
    -------
    pandas.Series of int
        Total minutes per row. When a number is followed by an hour unit
        (a word starting with "h"/"H"), it is multiplied by 60 and the next
        number, if any, is added as minutes. Otherwise the first number in
        the string is used as-is, so plain "N mins" values are unchanged.

    Raises
    ------
    ValueError
        If a row contains no digits at all (the NaN cannot be cast to int).
    """
    hours = duration_text.str.extract(r'(\d+)\s*[hH]', expand=False).astype(float)
    minutes_after_hours = duration_text.str.extract(
        r'\d+\s*[hH]\D*(\d+)', expand=False
    ).astype(float)
    first_number = duration_text.str.extract(r'(\d+)', expand=False).astype(float)

    # Rows without an hour unit leave `hours` as NaN, which makes the whole sum
    # NaN; those rows fall back to the first number found in the string.
    total_minutes = (hours * 60 + minutes_after_hours.fillna(0)).fillna(first_number)
    return total_minutes.astype(int)


# The CSV has no header row, so column names are assigned explicitly.
df = pd.read_csv("traffic_data-2.csv", header=None)
df.columns = ["Timestamp", "Origin", "Destination", "TravelDuration"]

# Derive calendar features from the parsed timestamp.
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df["Hour"] = df["Timestamp"].dt.hour
df["Date"] = df["Timestamp"].dt.date
df["Weekday"] = df["Timestamp"].dt.day_name()

# Numeric duration in minutes; hour components are converted rather than
# being mistaken for minutes.
df["TravelDurationMinutes"] = parse_duration_minutes(df["TravelDuration"])


In [None]:
def time_category(hour):
    """Label an hour of the day as "Morning", "Midday" or "Evening".

    Hours in [6, 11) are "Morning" and hours in [11, 16) are "Midday".
    Every other value, including hours before 6, is labelled "Evening".
    """
    # (label, inclusive start hour, exclusive end hour)
    windows = (("Morning", 6, 11), ("Midday", 11, 16))
    for label, start, end in windows:
        if start <= hour < end:
            return label
    return "Evening"

# Bucket every row into Morning / Midday / Evening from its hour of day.
df["TimeCategory"] = df["Hour"].apply(time_category)

# Numeric weekday (Monday=0 ... Sunday=6) read directly from the timestamp.
# This is vectorised, unlike looking each day name up in a list row by row.
df["WeekdayNum"] = df["Timestamp"].dt.weekday

# Preview the last rows to confirm the derived columns.
df.tail()

In [None]:
# Histogram of travel durations (15 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df["TravelDurationMinutes"],
    bins=15,
    kde=True,
    color="steelblue",
    ax=ax,
)
ax.set_title("Distribution of Travel Duration")
ax.set_xlabel("Travel Duration (minutes)")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Mean travel duration for each hour of the day, drawn as a marked line.
hourly_avg = df.groupby("Hour")["TravelDurationMinutes"].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=hourly_avg.index, y=hourly_avg.values, marker="o", ax=ax)
ax.set_title("Average Travel Duration by Hour of Day")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Average Travel Duration (minutes)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Box plot of travel duration per weekday, with the x-axis in calendar order.
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="Weekday",
    y="TravelDurationMinutes",
    order=order,
    palette="Set2",
    ax=ax,
)
ax.set_title("Travel Duration by Day of the Week")
ax.set_xlabel("Day of the Week")
ax.set_ylabel("Travel Duration (minutes)")
# Tilt the day names by 45 degrees.
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between duration, hour of day and numeric weekday,
# shown as an annotated heatmap.
numeric_columns = ["TravelDurationMinutes", "Hour", "WeekdayNum"]
corr = df[numeric_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()


In [None]:
# Scatter of duration against hour of day with a fitted linear trend line in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x="Hour",
    y="TravelDurationMinutes",
    data=df,
    scatter_kws={'alpha': 0.6},
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title("Hour of Day vs. Travel Duration with Trendline")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Travel Duration (minutes)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# One-way ANOVA: Morning vs Midday vs Evening.
# H₀: the mean travel duration is the same in all three time-of-day groups.
morning = df.loc[df["TimeCategory"] == "Morning", "TravelDurationMinutes"]
midday = df.loc[df["TimeCategory"] == "Midday", "TravelDurationMinutes"]
evening = df.loc[df["TimeCategory"] == "Evening", "TravelDurationMinutes"]

anova_stat, anova_p = f_oneway(morning, midday, evening)

print("ANOVA Test Results:")
print(f"F-statistic: {anova_stat:.2f}")
print(f"P-value: {anova_p:.10f}")

# A NaN p-value means the test could not be computed (for example, one of the
# groups has no rows). NaN < 0.05 is False, so without this branch the cell
# would report "fail to reject" for a test that was never actually performed.
if pd.isna(anova_p):
    print("Result: ANOVA is undefined for this data (e.g. an empty time-of-day group); no conclusion can be drawn.")
elif anova_p < 0.05:
    print("Result: Reject H₀. Travel durations differ significantly based on time of day.")
else:
    print("Result: Fail to reject H₀. No significant difference found based on time of day.")
