In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Read the traffic and weather tables as Spark DataFrames.
# NOTE(review): `spark` is never defined in this notebook — presumably the
# session object injected by the runtime; confirm before running elsewhere.
traffic_sdf = spark.read.table("workspace.default.traffic_30")
weather_sdf = spark.read.table("workspace.default.weather_data_30")

In [0]:
# List the distinct values of the traffic `origin` column.
display(traffic_sdf.select('origin').distinct())

In [0]:
# List the distinct values of the traffic `datetime` column.
# NOTE(review): the pandas cells below parse a `time` column from this table,
# not `datetime` — confirm that both columns exist.
display(traffic_sdf.select('datetime').distinct())

In [0]:
# Convert both Spark DataFrames to pandas for the rest of the notebook.
# toPandas() collects every row onto the driver — assumes both tables fit in memory.
traffic_df = traffic_sdf.toPandas()
weather_df = weather_sdf.toPandas()

In [0]:
# Parse timestamps so the two frames can be joined on time:
# traffic gets a new `start_time` column (its original `time` column is kept),
# while the weather `time` column is overwritten with the parsed values.
traffic_df['start_time'] = pd.to_datetime(traffic_df['time'])
weather_df['time'] = pd.to_datetime(weather_df['time'])

In [0]:
# Preview the first traffic rows, including the new `start_time` column.
traffic_df.head()

In [0]:
# Align weather location labels with the traffic `origin` labels used as the
# merge key by appending ', Hyderabad'. Only labels that do not already carry
# the suffix are changed, so re-running this cell does not append it twice.
# Missing labels stay missing (na=False keeps the mask boolean).
weather_df['location'] = weather_df['location'].where(
    weather_df['location'].str.endswith(', Hyderabad', na=False),
    weather_df['location'] + ', Hyderabad',
)

In [0]:
# Attach to every traffic row the weather recorded at its origin and start time.
# A left join keeps all traffic rows; rows without a weather match get NaN in
# the weather columns. Weather columns whose names clash with traffic columns
# receive the `_origin` suffix, traffic columns keep their names.
merged_df = pd.merge(
    traffic_df,
    weather_df,
    how='left',
    left_on=['start_time', 'origin'],
    right_on=['time', 'location'],
    suffixes=('', '_origin'),
)

In [0]:
# Spot-check: show the weather rows whose `time` equals 2025-05-30 06:00:00.
weather_df[weather_df['time'] == '2025-05-30 06:00:00']

In [0]:
# Preview the first rows of the traffic/weather join.
merged_df.head()

In [0]:
# Show the column names available after the join (including any `_origin`-suffixed ones).
merged_df.columns

# ML Model Pipeline

In [0]:
# Drop rows with missing critical values: the target and the four weather
# features. Because the merge was a left join, this also removes traffic rows
# that found no matching weather record.
# Note: `merged_df` is rebound here, so the unfiltered join is no longer
# available afterwards; the step itself is safe to re-run.
merged_df = merged_df.dropna(subset=[
    "duration_in_traffic_minutes", "temperature_2m", "relative_humidity_2m", 
    "precipitation", "wind_speed_combined"
])

In [0]:
# (rows, columns) remaining after dropping incomplete rows.
merged_df.shape

In [0]:
# Feature engineering: derive hour-of-day and day-of-week from the trip start.
# `start_time` was already produced by pd.to_datetime when it was created, so
# the conversion here acts as a safeguard rather than a real re-parse.
# `.dt.dayofweek` numbers Monday as 0, the same convention as
# `datetime.weekday()` used when building prediction inputs.
merged_df["hour"] = pd.to_datetime(merged_df["start_time"]).dt.hour
merged_df["dayofweek"] = pd.to_datetime(merged_df["start_time"]).dt.dayofweek

In [0]:
# Model inputs, grouped by kind; the concatenation order is the column order
# of the feature matrix built from this list.
features = (
    ["origin", "destination"]  # categorical, one-hot encoded later
    + ["temperature_2m", "relative_humidity_2m", "precipitation", "wind_speed_combined"]  # weather
    + ["hour", "dayofweek"]  # derived from the trip start time
)

# Column the model learns to predict.
target = "duration_in_traffic_minutes"

In [0]:
# Split the cleaned frame into the model inputs (X) and the value to predict (y).
X, y = merged_df[features], merged_df[target]

In [0]:
# Expand the two location columns into indicator columns. With drop_first=True
# the first category of each column gets no indicator and is represented by
# all of that column's indicators being zero.
X_encoded = pd.get_dummies(
    data=X,
    columns=["origin", "destination"],
    drop_first=True,
)

In [0]:
# Columns of the raw (pre-encoding) feature frame; the encoded frame is `X_encoded`.
X.columns

## 2. Train/Test Split and Model Training

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows, scored in the evaluation section.
y_pred = model.predict(X_test)

In [0]:
# Columns the model was fitted on; prediction inputs are reindexed to this layout.
X_train.columns

## 3. Evaluation Metrics

In [0]:
# Score the held-out predictions with each metric in turn.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
# RMSE is reported in the same unit as the target (minutes).
rmse = np.sqrt(mse)

print(
    f"MAE: {mae:.2f} mins",
    f"RMSE: {rmse:.2f} mins",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

## 4. Real-Time Prediction Using Open-Meteo API

In [0]:
def predict_eta(origin, destination, dt_obj, coords, expected_columns, model):
    """Predict the travel time of a trip from the hourly weather forecast at its start.

    The forecast for the date of ``dt_obj`` is requested from the Open-Meteo
    API, the entry for the hour of ``dt_obj`` (minutes are truncated) is
    selected, and a single-row feature frame laid out like the training matrix
    is passed to ``model.predict``.

    Parameters
    ----------
    origin, destination : str
        Location labels. They are turned into the one-hot column names
        ``origin_<origin>`` and ``destination_<destination>``; a name that is
        not in ``expected_columns`` is dropped by the reindex step.
    dt_obj : datetime.datetime
        Planned departure time. It is formatted as-is (no timezone conversion)
        and the API is asked for ``Asia/Kolkata`` local times.
    coords : tuple
        ``(latitude, longitude)`` of the point to fetch the weather for.
    expected_columns : sequence of str
        Column names, in order, that ``model`` was fitted on.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The first element of ``model.predict`` for the assembled row.

    Raises
    ------
    ValueError
        If ``coords`` is ``None`` or the forecast has no entry for the
        requested hour.
    requests.HTTPError
        If the weather API answers with an error status.
    """
    import requests
    import pandas as pd

    if coords is None:
        # Geocoding helpers may return None; fail with a clear message instead
        # of an unpacking error.
        raise ValueError("coords is None: no coordinates available for the weather lookup")

    # 1. Fetch weather
    latitude, longitude = coords
    date_str = dt_obj.strftime('%Y-%m-%d')
    hour_str = dt_obj.strftime('%H:00')

    url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&hourly=temperature_2m,relative_humidity_2m,precipitation,windspeed_10m&start_date={date_str}&end_date={date_str}&timezone=Asia%2FKolkata"
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    hourly = response.json()['hourly']

    # list.index raises ValueError when the requested hour is not in the forecast.
    hour_index = hourly['time'].index(f"{date_str}T{hour_str}")

    # 2. Prepare input row from the fetched forecast values.
    features_input = {
        'temperature_2m': hourly['temperature_2m'][hour_index],
        'relative_humidity_2m': hourly['relative_humidity_2m'][hour_index],
        'precipitation': hourly['precipitation'][hour_index],
        # NOTE(review): the model feature is named `wind_speed_combined`; the
        # API's `windspeed_10m` is used as its stand-in — confirm both are the
        # same quantity and unit as in the training data.
        'wind_speed_combined': hourly['windspeed_10m'][hour_index],
        'hour': dt_obj.hour,
        'dayofweek': dt_obj.weekday(),
        f'origin_{origin}': 1,
        f'destination_{destination}': 1
    }

    # 3. Convert to DataFrame
    input_df = pd.DataFrame([features_input])

    # 4. Reindex to match training columns: indicator columns for other
    #    locations are filled with 0, unknown columns are discarded.
    input_df = input_df.reindex(columns=expected_columns, fill_value=0)

    # 5. Predict ETA
    eta = model.predict(input_df)[0]
    return eta

### Fetching `latitude` and `longitude` using Nominatim (OpenStreetMap)

In [0]:
import requests
import time

In [0]:
def get_coordinates(location_str, timeout=10):
    """
    Look up a place name with the Nominatim (OpenStreetMap) search API.

    location_str: e.g. 'Kondapur, Hyderabad, Telangana, India'
    timeout: seconds to wait for the server; when exceeded, `requests` raises
             its timeout exception instead of blocking indefinitely
    returns: (latitude, longitude) tuple of floats for the first match, or None
             when the response status is not 200, nothing matched, or the first
             match carries no usable 'lat'/'lon'
    """
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': location_str,
        'format': 'json',
        'limit': 1
    }
    headers = {
        # Required by the service.
        # NOTE(review): this is still the template placeholder — replace it
        # with a value that identifies the real application/contact.
        'User-Agent': 'YourAppNameHere (your@email.com)'
    }

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    data = response.json()
    if not data:
        return None

    try:
        first_match = data[0]
        return (float(first_match['lat']), float(first_match['lon']))
    except (KeyError, IndexError, TypeError, ValueError):
        # Unexpected payload shape: honour the documented "or None" contract.
        return None


In [0]:
# Smoke test with a fixed trip and fixed coordinates. `model` and `X_train`
# come from the training cell; the column index of `X_train` tells
# predict_eta how to lay out its single input row.
eta = predict_eta(
    model=model,
    expected_columns=X_train.columns,
    origin="Kondapur, Hyderabad",
    destination="Secunderabad, Hyderabad",
    dt_obj=datetime(year=2025, month=6, day=12, hour=11, minute=0),
    coords=(17.4579, 78.3650),
)
print(f"Predicted ETA: {eta:.2f} minutes")

## Demo

In [0]:
# Example usage: ask for an origin, a destination and a departure time.
# Invalid answers raise ValueError, which stops the cell (and a "Run All")
# with a readable message; calling exit() here would end the kernel session
# instead of just aborting this cell.
locations = {
    1: 'Kondapur, Hyderabad, Telangana, India',
    2: 'Miyapur, Hyderabad, Telangana, India',
    3: 'Secunderabad, Telangana, India',
    4: 'Begumpet, Hyderabad, Telangana, India'
}
print('Please choose a location id from the list below:')
for key, value in locations.items():
    print(f'{key}: {value}')


def _read_location_id(prompt):
    """Prompt once and return the chosen key of ``locations`` as an int.

    Raises ValueError when the reply is not an integer or not a known id.
    """
    reply = input(prompt)
    try:
        location_id = int(reply)
    except ValueError:
        raise ValueError(f'Invalid location id: {reply!r}') from None
    # Validate against the dict itself so adding locations needs no other change.
    if location_id not in locations:
        raise ValueError(f'Invalid location id: {reply!r}')
    return location_id


travel_origin = _read_location_id('\n\nWhere do you start from? >>> ')
travel_destination = _read_location_id('Where would you like to go? >>> ')
# Surrounding whitespace is removed so the value parses here and in later cells.
travel_datetime = input('When do you plan on travelling in YYYY-MM-DD HH:MM format? >>>').strip()
try:
    dt = datetime.strptime(travel_datetime, '%Y-%m-%d %H:%M')
except ValueError:
    raise ValueError(
        f'Invalid datetime: {travel_datetime!r} (expected YYYY-MM-DD HH:MM)'
    ) from None

In [0]:
# Parsed again so this cell works whenever `travel_datetime` is in scope.
dt = datetime.strptime(travel_datetime, '%Y-%m-%d %H:%M')

# Split the display strings into their parts, removing the space that follows
# each comma; without this the rebuilt label contains a double space.
origin = [part.strip() for part in locations[travel_origin].split(',')]
dest = [part.strip() for part in locations[travel_destination].split(',')]

# The model's one-hot columns use '<place>, Hyderabad' labels (the weather
# locations are suffixed that way before the merge, and the fixed-trip example
# uses the same form). Build the label from the place name alone, because the
# second part of a display string is not always 'Hyderabad'.
# NOTE(review): assumes destination labels follow the same format as origins — confirm.
# A label that is absent from the training columns is silently dropped by
# predict_eta's reindex; with drop_first=True that is also how each column's
# baseline category is represented, so no check is made here.
origin_label = f'{origin[0]}, Hyderabad'
dest_label = f'{dest[0]}, Hyderabad'

# Geocoding can fail and return None; stop with a clear message in that case.
coords = get_coordinates(locations[travel_origin])
if coords is None:
    raise RuntimeError(f'Could not find coordinates for {locations[travel_origin]!r}')

eta = predict_eta(
    origin = origin_label,
    destination = dest_label,
    dt_obj = dt,
    coords = coords,
    expected_columns = X_train.columns,  
    model = model
)
print(f'Travelling from {origin[0]} to {dest[0]} on {dt.strftime("%Y-%m-%d")} at {dt.strftime("%H:%M")}')
print(f"Predicted ETA: {eta:.2f} minutes")