In [None]:
"""
Traffic Prediction - Data Exploration
January 2023 BASt Data
Author: Tulsi
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append('..')
import config

# Plot defaults shared by every figure in this notebook: seaborn's
# "whitegrid" theme and a 14x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

# Title banner followed by a 60-character rule.
print("Traffic Volume Prediction - Data Exploration", "=" * 60, sep="\n")

In [None]:
# Load the clean CSV from the processed-data directory named in `config`
# and convert its 'timestamp' column to pandas datetimes.
data_file = Path(config.PROCESSED_DATA_DIR).joinpath("traffic_2023_01_clean.csv")
df = pd.read_csv(data_file)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Quick overview: dimensions, covered time span and column names.
print(
    f"Dataset Shape: {df.shape}",
    f"Date Range: {df['timestamp'].min()} to {df['timestamp'].max()}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Kept as the last expression so the notebook renders the preview table.
df.head(5)

In [None]:
# describe() summary of the PKW, LKW, Buses and Total columns.
print("Traffic Statistics:")
df.loc[:, ['PKW', 'LKW', 'Buses', 'Total']].describe()

In [None]:
# Hourly patterns: mean PKW value for each hour of the day.
# The derived 'hour' column is written onto df itself.
df['hour'] = df['timestamp'].dt.hour
hourly_avg = df['PKW'].groupby(df['hour']).mean()

fig, ax = plt.subplots(figsize=(14, 6))
# x = hour (the group index), y = the per-hour mean.
ax.plot(
    hourly_avg.index.to_numpy(),
    hourly_avg.to_numpy(),
    marker='o',
    linewidth=2,
    markersize=8,
)
ax.set_title('Average PKW Traffic by Hour of Day', fontsize=16, fontweight='bold')
ax.set_xlabel('Hour of Day', fontsize=12)
ax.set_ylabel('Average Traffic Volume', fontsize=12)
ax.grid(True, alpha=0.3)
# One tick per hour, 0 through 23.
ax.set_xticks(range(0, 24))
fig.tight_layout()
plt.show()

In [None]:
# Weekly patterns: mean PKW value per weekday name.
# The derived 'day_of_week' column is written onto df itself.
df['day_of_week'] = df['timestamp'].dt.day_name()

day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
# groupby sorts the day names alphabetically; reindex restores Monday-to-Sunday
# order (a day absent from the data would show up as NaN).
daily_avg = df['PKW'].groupby(df['day_of_week']).mean().reindex(day_order)

fig, ax = plt.subplots(figsize=(14, 6))
daily_avg.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Average PKW Traffic by Day of Week', fontsize=16, fontweight='bold')
ax.set_xlabel('Day of Week', fontsize=12)
ax.set_ylabel('Average Traffic Volume', fontsize=12)
# Tilt the weekday labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
# Horizontal grid lines only.
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Time series plot: PKW against timestamp, drawn in the row order of df.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df['timestamp'], df['PKW'], alpha=0.7, linewidth=1)
ax.set_title(
    'PKW Traffic Volume Over Time - January 2023',
    fontsize=16,
    fontweight='bold',
)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Traffic Volume (vehicles/hour)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap: pairwise correlations (DataFrame.corr defaults)
# between the PKW, LKW, Buses and Total columns.
corr = df.loc[:, ['PKW', 'LKW', 'Buses', 'Total']].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,       # write each coefficient into its cell ...
    fmt='.2f',        # ... formatted with two decimals
    cmap='coolwarm',
    center=0,         # centre the diverging colormap on zero
    square=True,
    linewidths=1,
)
ax.set_title('Correlation Matrix - Vehicle Types', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()