In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Load the traffic and weather tables as Spark DataFrames.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# object injected by the hosting runtime; confirm before running elsewhere.
traffic_sdf = spark.read.table("workspace.default.traffic_30")
weather_sdf = spark.read.table("workspace.default.weather_data_30")

In [0]:
# Show the distinct `origin` values present in the traffic table.
display(traffic_sdf.select('origin').distinct())

In [0]:
# Show the distinct `datetime` values present in the traffic table.
display(traffic_sdf.select('datetime').distinct())

In [0]:
# Convert both Spark DataFrames to pandas; all later cells work on these
# in-memory copies.
traffic_df = traffic_sdf.toPandas()
weather_df = weather_sdf.toPandas()

In [0]:
# Parse the timestamp columns so both frames can be joined on comparable keys.
# Traffic keeps its raw `time` column and gets the parsed value in `start_time`;
# the weather `time` column is parsed in place.
traffic_df['start_time'] = pd.to_datetime(traffic_df['time'])
weather_df['time'] = pd.to_datetime(weather_df['time'])

In [0]:
# Preview the first rows of the traffic data, including the new `start_time`.
traffic_df.head()

In [0]:
# Weather locations are joined against the traffic `origin` column further down,
# so they need the ", Hyderabad" suffix (presumably how `origin` is formatted —
# confirm against the traffic data). Append it only where it is missing so that
# re-running this cell cannot produce "X, Hyderabad, Hyderabad" and silently
# break the join. Missing locations stay missing, as before.
weather_df['location'] = weather_df['location'].where(
    weather_df['location'].str.endswith(', Hyderabad', na=False),
    weather_df['location'] + ', Hyderabad',
)

In [0]:
# Attach the weather reading for each traffic row's origin and start time.
# A left join keeps every traffic row; rows with no matching weather record
# end up with NaN in the weather columns. Column names present in both frames
# keep the traffic name and the weather copy is suffixed with "_origin".
merged_df = pd.merge(
    traffic_df,
    weather_df,
    how='left',
    left_on=['start_time', 'origin'],
    right_on=['time', 'location'],
    suffixes=('', '_origin'),
)

In [0]:
# Spot-check: list the weather rows recorded at one specific timestamp.
weather_df[weather_df['time'] == '2025-05-30 06:00:00']

In [0]:
# Preview the joined traffic + weather rows.
merged_df.head()

In [0]:
# List the columns available after the join (including any "_origin" suffixed ones).
merged_df.columns

# ML Model Pipeline

In [0]:
# Drop rows that lack the target or any of the weather features.
# Traffic rows that found no weather match in the left join have NaN in the
# weather columns and are removed here as well.
merged_df = merged_df.dropna(subset=[
    "duration_in_traffic_minutes", "temperature_2m", "relative_humidity_2m", 
    "precipitation", "wind_speed_combined"
])

In [0]:
# Check how many rows and columns remain after dropping incomplete rows.
merged_df.shape

In [0]:
# Feature engineering: derive calendar features from `start_time`.
# `hour` is the hour of day; `dayofweek` follows pandas' numbering (Monday = 0).
merged_df = merged_df.assign(
    hour=lambda frame: pd.to_datetime(frame["start_time"]).dt.hour,
    dayofweek=lambda frame: pd.to_datetime(frame["start_time"]).dt.dayofweek,
)

In [0]:
# Model inputs, in order: the route endpoints, the weather readings joined in
# for the origin, and the calendar features derived from `start_time`.
features = (
    ["origin", "destination"]
    + ["temperature_2m", "relative_humidity_2m", "precipitation", "wind_speed_combined"]
    + ["hour", "dayofweek"]
)

# Column the regressor learns to predict.
target = "duration_in_traffic_minutes"

In [0]:
# Split the joined frame into the feature matrix and the target vector.
X = merged_df[features]
y = merged_df[target]

In [0]:
# One-hot encode the two categorical columns; numeric columns pass through.
# drop_first=True drops the first level of each categorical, so that level is
# represented by all of its dummy columns being 0 and has no column of its own.
# The resulting column set and order is what the model is fitted on.
X_encoded = pd.get_dummies(X, columns=["origin", "destination"], drop_first=True)

## 2. Train/Test Split and Model Training

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training portion (seeded for repeatability).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out rows, scored in the evaluation cell.
y_pred = model.predict(X_test)

## 3. Evaluation Metrics

In [0]:
# Score the held-out predictions. MAE and RMSE are in the target's units
# (reported as minutes below); R² is unitless.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root brings the squared error back to the target's units
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f} mins")
print(f"RMSE: {rmse:.2f} mins")
print(f"R² Score: {r2:.2f}")

## 4. Real-Time Prediction Using Open-Meteo API

In [0]:
import requests
from datetime import datetime

In [0]:
def fetch_weather_data(location_name, latitude, longitude, datetime_obj, timeout=10):
    """Fetch the hourly Open-Meteo reading for one location and hour.

    Parameters
    ----------
    location_name : str
        Label used only in error messages.
    latitude, longitude : float
        Coordinates sent to the API.
    datetime_obj : datetime.datetime
        Moment of interest. Only the date and hour are used; minutes are
        truncated. The request asks for Asia/Kolkata time, so this is
        presumably expected to be a naive local time in that zone — confirm
        against callers.
    timeout : float, optional
        Seconds to wait for the HTTP response before `requests` raises a
        timeout error. Without it a stalled connection would block forever.

    Returns
    -------
    dict
        Keys ``temperature_2m``, ``relative_humidity_2m``, ``precipitation``
        and ``wind_speed_combined`` (taken from the API's ``windspeed_10m``).

    Raises
    ------
    ValueError
        If the API answers with a non-200 status, the requested hour is not in
        the response, or any of the requested readings is null.
    """
    date_str = datetime_obj.strftime("%Y-%m-%d")
    hour_str = datetime_obj.strftime("%H:00")

    url = (
        f"https://api.open-meteo.com/v1/forecast"
        f"?latitude={latitude}&longitude={longitude}"
        f"&hourly=temperature_2m,relative_humidity_2m,precipitation,windspeed_10m"
        f"&start_date={date_str}&end_date={date_str}&timezone=Asia%2FKolkata"
    )
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch weather for {location_name}: {response.status_code}")

    data = response.json()["hourly"]
    # The response lists one timestamp per hour; find the slot for the requested hour.
    timestamp = f"{date_str}T{hour_str}"
    try:
        idx = data["time"].index(timestamp)
    except ValueError:
        raise ValueError(f"No weather data available for {timestamp}") from None

    reading = {
        "temperature_2m": data["temperature_2m"][idx],
        "relative_humidity_2m": data["relative_humidity_2m"][idx],
        "precipitation": data["precipitation"][idx],
        "wind_speed_combined": data["windspeed_10m"][idx]
    }

    # A JSON null decodes to None; fail here with a clear message instead of
    # letting a None value flow into the model input.
    if any(value is None for value in reading.values()):
        raise ValueError(f"No weather data available for {timestamp}")

    return reading


In [0]:
def predict_eta(origin, destination, datetime_obj, location_coords):
    """Predict the travel duration for one trip using live weather at the origin.

    Parameters
    ----------
    origin, destination : str
        Location labels. ``origin`` must be a key of ``location_coords``. Both
        are turned into one-hot column names ``origin_<label>`` and
        ``destination_<label>``.
    datetime_obj : datetime.datetime
        Departure time; supplies the ``hour`` and ``dayofweek`` features and
        the hour for which weather is fetched.
    location_coords : dict
        Maps a location label to a ``(latitude, longitude)`` pair.

    Returns
    -------
    The single prediction produced by the notebook-level ``model``.

    Notes
    -----
    The input row is aligned to ``X_encoded.columns`` — the exact column set and
    order the model was fitted on. A one-hot key that is not a training column
    is dropped, which is correct for the level removed by ``drop_first=True``
    but also means an unknown or differently spelled label is silently treated
    as that baseline level.
    NOTE(review): the training ``origin`` labels appear to carry a
    ", Hyderabad" suffix — confirm callers pass labels spelled exactly as in
    the training data.
    """
    weather = fetch_weather_data(origin, *location_coords[origin], datetime_obj)

    features_input = {
        "temperature_2m": weather["temperature_2m"],
        "relative_humidity_2m": weather["relative_humidity_2m"],
        "precipitation": weather["precipitation"],
        "wind_speed_combined": weather["wind_speed_combined"],
        "hour": datetime_obj.hour,
        "dayofweek": datetime_obj.weekday(),
        f"origin_{origin}": 1,
        f"destination_{destination}": 1,
    }

    # Align to the training layout: same columns, same order, absent dummies = 0.
    # Without this the frame can contain columns the model never saw and in a
    # different order, which scikit-learn rejects (or, in older versions,
    # silently misreads positionally).
    input_df = pd.DataFrame([features_input]).reindex(
        columns=X_encoded.columns, fill_value=0
    )
    return model.predict(input_df)[0]

In [0]:
# (latitude, longitude) pairs keyed by location label; predict_eta looks up the
# origin here and passes the pair on to fetch_weather_data.
location_coords = {
    "Kondapur": (17.4637, 78.3659),
    "Miyapur": (17.5006, 78.3572),
    "Secunderabad": (17.4399, 78.4983),
    "Begumpet": (17.4375, 78.4483)
}

# Example prediction for a single trip.
# NOTE(review): weather locations get a ", Hyderabad" suffix before being joined
# on `origin`, so the training labels (and one-hot column names) probably look
# like "Kondapur, Hyderabad" rather than "Kondapur" — confirm the labels used
# here match the training data.
eta = predict_eta("Kondapur", "Begumpet", datetime(2025, 6, 5, 9, 0), location_coords)
print(f"Predicted ETA: {eta:.2f} minutes")

### Fetching `latitude` and `longitude` using Nominatim (OpenStreetMap)