In [None]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 

import matplotlib.pyplot as plt 
import seaborn as sns


In [None]:
# Read the raw traffic records from disk (adjust the path if your file is named differently)
df = pd.read_csv(filepath_or_buffer="Uber.csv")

# Quick look at the first five rows
df.head(n=5)


In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# A notebook cell renders only its final expression, so the summary statistics
# have to be displayed explicitly or the result is silently thrown away.
# (`display` is provided by the IPython/Jupyter kernel without an import.)
display(df.describe())

# Number of missing values per column, shown as the cell's output.
df.isnull().sum()


In [None]:
# Drop rows that lack a critical field FIRST. Imputing before this step would
# replace a missing numeric 'junction' with the median junction value, which is
# meaningless for an identifier and would turn the drop into a no-op.
df = df.dropna(subset=['timestamp', 'junction'])

# Impute the remaining numeric gaps with each column's median. 'number' matches
# every numeric dtype rather than only int64/float64.
num_cols = df.select_dtypes(include='number').columns

# Passing a Series to fillna fills each column named in its index with that
# column's own value; columns not listed are left untouched.
df = df.fillna(df[num_cols].median())


In [None]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()


In [None]:
# Parse 'timestamp' into datetime values and cast 'vehicle_count' to integer,
# replacing both columns in a single step.
df = df.assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
    vehicle_count=lambda frame: frame['vehicle_count'].astype(int),
)


In [None]:
# Index by timestamp so pd.Grouper can bin rows by time. The guard keeps this
# cell re-runnable: once 'timestamp' is the index it is no longer a column, and
# an unconditional set_index would raise KeyError on a second execution.
if 'timestamp' in df.columns:
    df = df.set_index('timestamp')

# Total vehicle count per junction per hour. 'h' is the hourly frequency alias;
# the upper-case 'H' spelling is deprecated in recent pandas releases.
hourly_df = (
    df.groupby([pd.Grouper(freq='h'), 'junction'])['vehicle_count']
    .sum()
    .reset_index()
)

hourly_df.head()


In [None]:
# Standardise the hourly counts. The fitted scaler object is kept in `scaler`
# so the same transform can be reused later.
scaler = StandardScaler()
scaler.fit(hourly_df[['vehicle_count']])

hourly_df['vehicle_count_scaled'] = scaler.transform(hourly_df[['vehicle_count']])


In [None]:
# Calendar features derived from each row's timestamp
# (pandas' dayofweek numbers Monday as 0 through Sunday as 6).
hourly_df = hourly_df.assign(
    hour=lambda frame: frame['timestamp'].dt.hour,
    day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
    month=lambda frame: frame['timestamp'].dt.month,
)


In [None]:
# Binary flags added together:
#   is_weekend    - 1 when day_of_week is 5 or 6, otherwise 0
#   special_event - constant 0 placeholder until real event data is available
hourly_df = hourly_df.assign(
    is_weekend=lambda frame: frame['day_of_week'].isin([5, 6]).astype(int),
    special_event=0,
)


In [None]:
# Order rows chronologically within each junction so shift() looks back in time.
hourly_df = hourly_df.sort_values(by=['junction', 'timestamp'])

# Lag features: the count 1 row and 24 rows earlier within the same junction.
# NOTE(review): these are row offsets; they correspond to 1 hour / 24 hours only
# if every junction has a row for every hour. Confirm the hourly series has no gaps.
hourly_df['lag_1_hour'] = hourly_df.groupby('junction')['vehicle_count'].shift(1)
hourly_df['lag_24_hour'] = hourly_df.groupby('junction')['vehicle_count'].shift(24)

# Zero-fill only the lag columns (the first rows of each junction have no
# history). A frame-wide fillna(0) would also overwrite missing values in
# unrelated columns.
hourly_df[['lag_1_hour', 'lag_24_hour']] = (
    hourly_df[['lag_1_hour', 'lag_24_hour']].fillna(0)
)


In [None]:
# Correlate numeric columns only. The frame still contains the datetime
# 'timestamp' column, and DataFrame.corr() no longer drops non-numeric columns
# silently on pandas >= 2.0, so they are filtered out explicitly here.
# Note: 'special_event' is constant, so its row/column in the matrix is NaN.
plt.figure(figsize=(10, 6))
sns.heatmap(
    hourly_df.select_dtypes(include='number').corr(),
    cmap='coolwarm',
    annot=False,
)
plt.title("Feature Correlation Matrix")
plt.show()


In [None]:
# Predictor columns supplied to the model
features = [
    'hour',
    'day_of_week',
    'month',
    'is_weekend',
    'special_event',
    'lag_1_hour',
    'lag_24_hour',
]

# Design matrix and target vector
X, y = hourly_df[features], hourly_df['vehicle_count']

# Hold out 20% of the rows; the fixed seed makes the shuffle reproducible.
# NOTE(review): a random split on time-ordered data with lag features lets
# later observations inform earlier ones. Consider a chronological split if
# X_test / y_test are used for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training portion
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)


In [None]:
# Pair each feature name with the forest's importance score and rank the
# result from most to least important.
feature_importance = (
    pd.DataFrame({'Feature': features})
    .assign(Importance=rf.feature_importances_)
    .sort_values(by='Importance', ascending=False)
)

feature_importance


In [None]:
# Names of the features whose importance is strictly above 0.05,
# in the same order as the ranked importance table.
selected_features = feature_importance.loc[
    feature_importance['Importance'] > 0.05, 'Feature'
].tolist()

selected_features
