In [1]:
import pandas as pd

df = pd.read_csv("indore_traffic_data3.csv")

unique_places = set(df["origin"]).union(set(df["destination"]))

with open("unique_places.txt", "w", encoding="utf-8") as f:
    for p in unique_places:
        f.write(p + "\n")

print("Extracted:", len(unique_places), "places")


Extracted: 30 places


In [2]:
import requests
import csv

API_KEY = "AIzaSyAaWkTldjyvQe_ZTRgVLkgPgW1xQHtkx4k"

output_file = "place_coordinates.csv"

places = open("unique_places.txt", "r", encoding="utf-8").read().splitlines()

with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["place", "lat", "lng"])

    for place in places:
        url = f"https://maps.googleapis.com/maps/api/geocode/json?address={place}&key={API_KEY}"
        res = requests.get(url).json()

        if res["status"] != "OK":
            print("❌ Failed:", place)
            continue

        loc = res["results"][0]["geometry"]["location"]
        writer.writerow([place, loc["lat"], loc["lng"]])
        print("✔ Fetched:", place)


✔ Fetched: C21 Mall, Indore
✔ Fetched: Old RTO, Indore
✔ Fetched: Sarafa, Indore
✔ Fetched: Bhanwarkuan Square, Indore
✔ Fetched: Rajwada, Indore
✔ Fetched: Khajrana Ganesh Temple, Indore
✔ Fetched: Vijay Nagar, Indore
✔ Fetched: Bengali Square, Indore
✔ Fetched: Bombay Hospital, Indore
✔ Fetched: Dewas Naka, Indore
✔ Fetched: MR 10 Square, Indore
✔ Fetched: Indore Railway Station, Indore
✔ Fetched: Geeta Bhawan Square, Indore
✔ Fetched: Navlakha Bus Stand, Indore
✔ Fetched: Basketball Complex, Indore
✔ Fetched: Scheme No 78, Indore
✔ Fetched: Choithram Mandi, Indore
✔ Fetched: Medicaps University, Indore
✔ Fetched: Satya Sai Square, Indore
✔ Fetched: LIG Square, Indore
✔ Fetched: Super Corridor, Indore
✔ Fetched: Indore Airport, Indore
✔ Fetched: Scheme No 54, Indore
✔ Fetched: Medanta Hospital, Indore
✔ Fetched: Niranjanpur, Indore
✔ Fetched: Palasia Square, Indore
✔ Fetched: TCS Indore, Indore
✔ Fetched: Gopur Square, Indore
✔ Fetched: Niranjanpur Square, Indore
✔ Fetched: Rajendra 

In [4]:
coords = pd.read_csv("place_coordinates.csv")

df = pd.read_csv("indore_traffic_data3.csv")

df = df.merge(coords.rename(columns={"place":"origin"}),
              left_on="origin", right_on="origin", how="left")

df = df.rename(columns={"lat":"origin_lat", "lng":"origin_lng"})

df = df.merge(coords.rename(columns={"place":"destination"}),
              left_on="destination", right_on="destination", how="left")

df = df.rename(columns={"lat":"dest_lat", "lng":"dest_lng"})

df.to_csv("final_dataset.csv", index=False)


In [None]:
df = pd.read_csv("final_dataset.csv")

df["delta_lat"] = df["dest_lat"] - df["origin_lat"]
df["delta_lng"] = df["dest_lng"] - df["origin_lng"]


KeyError: "['delta_lat', 'delta_lng', 'haversine_km', 'bearing_deg', 'day_enc'] not in index"

In [6]:
df = pd.read_csv("final_dataset.csv")

df["delta_lat"] = df["dest_lat"] - df["origin_lat"]
df["delta_lng"] = df["dest_lng"] - df["origin_lng"]


In [7]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # km

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2*np.arcsin(np.sqrt(a))

    return R * c

df["haversine_km"] = haversine(
    df["origin_lat"], df["origin_lng"],
    df["dest_lat"], df["dest_lng"]
)


In [8]:
def calculate_bearing(lat1, lon1, lat2, lon2):
    lat1 = np.radians(lat1)
    lat2 = np.radians(lat2)
    diff = np.radians(lon2 - lon1)

    x = np.sin(diff) * np.cos(lat2)
    y = np.cos(lat1)*np.sin(lat2) - np.sin(lat1)*np.cos(lat2)*np.cos(diff)
    initial_bearing = np.degrees(np.arctan2(x, y))

    return (initial_bearing + 360) % 360

df["bearing_deg"] = calculate_bearing(
    df["origin_lat"], df["origin_lng"],
    df["dest_lat"], df["dest_lng"]
)


In [9]:
from sklearn.preprocessing import LabelEncoder

le_day = LabelEncoder()
df["day_enc"] = le_day.fit_transform(df["day"])

import joblib
joblib.dump(le_day, "label_encoder_day.pkl")


['label_encoder_day.pkl']

In [10]:
df.to_csv("final_dataset_full.csv", index=False)
print("Saved final_dataset_full.csv with all features!")


Saved final_dataset_full.csv with all features!


In [11]:
df = pd.read_csv("final_dataset_full.csv")
df.isnull().sum()


origin           0
destination      0
timestamp        0
day              0
hour             0
is_weekend       0
is_peak_hour     0
distance_km      0
duration_min     0
temperature_c    0
humidity_pct     0
visibility_m     0
rain_mm          0
origin_lat       0
origin_lng       0
dest_lat         0
dest_lng         0
delta_lat        0
delta_lng        0
haversine_km     0
bearing_deg      0
day_enc          0
dtype: int64

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import joblib

# Load dataset
df = pd.read_csv("final_dataset_full.csv")

# Feature columns
feature_cols = [
    "origin_lat", "origin_lng",
    "dest_lat", "dest_lng",
    "delta_lat", "delta_lng",
    "haversine_km", "bearing_deg",
    "distance_km",
    "hour", "day_enc",
    "is_weekend", "is_peak_hour",
    "temperature_c", "humidity_pct",
    "visibility_m", "rain_mm"
]

X = df[feature_cols]
y = df["duration_min"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CatBoost model
model = CatBoostRegressor(
    iterations=700,
    learning_rate=0.05,
    depth=8,
    loss_function="MAE",
    eval_metric="MAE",
    random_seed=42,
    verbose=50
)

# Train
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    use_best_model=True
)

# Predict
y_pred = model.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("====================================")
print("MAE:", mae)
print("R²:", r2)
print("====================================")

# Save model + columns
model.save_model("catboost_eta_model.cbm")
joblib.dump(feature_cols, "catboost_feature_cols.pkl")

print("Model & features saved.")


0:	learn: 6.7212329	test: 6.8735649	best: 6.8735649 (0)	total: 194ms	remaining: 2m 15s
50:	learn: 1.4022628	test: 1.4334013	best: 1.4334013 (50)	total: 1.18s	remaining: 15s
100:	learn: 0.8141169	test: 0.8459190	best: 0.8459190 (100)	total: 1.82s	remaining: 10.8s
150:	learn: 0.6702054	test: 0.7048112	best: 0.7048112 (150)	total: 2.29s	remaining: 8.34s
200:	learn: 0.5944433	test: 0.6339851	best: 0.6339851 (200)	total: 2.87s	remaining: 7.13s
250:	learn: 0.5405449	test: 0.5833602	best: 0.5833602 (250)	total: 3.75s	remaining: 6.7s
300:	learn: 0.5067934	test: 0.5539895	best: 0.5539895 (300)	total: 4.25s	remaining: 5.63s
350:	learn: 0.4807172	test: 0.5308576	best: 0.5308576 (350)	total: 4.73s	remaining: 4.7s
400:	learn: 0.4610794	test: 0.5142643	best: 0.5142643 (400)	total: 5.22s	remaining: 3.89s
450:	learn: 0.4418557	test: 0.4980526	best: 0.4980526 (450)	total: 5.7s	remaining: 3.15s
500:	learn: 0.4254290	test: 0.4854323	best: 0.4854323 (500)	total: 6.17s	remaining: 2.45s
550:	learn: 0.412365

In [13]:
import joblib
cols = joblib.load("catboost_feature_cols.pkl")
cols


['origin_lat',
 'origin_lng',
 'dest_lat',
 'dest_lng',
 'delta_lat',
 'delta_lng',
 'haversine_km',
 'bearing_deg',
 'distance_km',
 'hour',
 'day_enc',
 'is_weekend',
 'is_peak_hour',
 'temperature_c',
 'humidity_pct',
 'visibility_m',
 'rain_mm']