In [None]:
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
import json, re
from typing import List, Dict, Any, Optional
from urllib.parse import quote
import pandas as pd
from datetime import datetime

In [3]:
# Route configuration.
#
# Every location entry has the same shape:
#   lat / lon            - decimal degrees; used for the OSRM route request and
#                          as the centre of the TomTom traffic bounding box.
#   weather_station_id   - passed to get_weather_data(); the text before the
#                          "." becomes the directory part of the BoM URL.
#   fuel_search_suburb   - not read by any cell visible in this notebook;
#                          presumably reserved for a fuel-price lookup (TODO confirm).

# Origin and destination shared by every candidate route.
END_POINTS = {
    "melbourne": {
        "lat": -37.81, "lon": 144.96,
        "weather_station_id": "IDV60901.95936",
        "fuel_search_suburb": "Melbourne"
    },
    "sydney": {
        "lat": -33.86, "lon": 151.20,
        "weather_station_id": "IDN60901.94768",
        "fuel_search_suburb": "Sydney"
    }
}

# Intermediate stops per candidate route. Dict insertion order is the driving
# order: the ingestion cell builds the node list from list(route_wps.keys()).
#
# NOTE(review): in the sample output further down, the leg starting at albury
# has empty weather columns and the leg starting at goulburn reports
# wx_location "Canberra". The station ids below may be wrong or mismatched -
# verify them against the BoM station list.
WAYPOINTS = {
    "Hume_Highway": {
        "albury": {
            "lat": -36.08, "lon": 146.91,
            "weather_station_id": "IDN60901.94925",
            "fuel_search_suburb": "Albury"
        },
        "goulburn": {
            "lat": -34.75, "lon": 149.71,
            "weather_station_id": "IDN60903.94926",
            "fuel_search_suburb": "Goulburn"
        }
    },
    "Inland_Route": {
        "shepparton": {
            "lat": -36.38, "lon": 145.40,
            "weather_station_id": "IDV60801.94875",
            "fuel_search_suburb": "Shepparton"
        },
        "dubbo": {
            "lat": -32.25, "lon": 148.61,
            "weather_station_id": "IDN60801.95719",
            "fuel_search_suburb": "Dubbo"
        }
    }
}
# Output CSV path; not referenced by any cell visible in this notebook section.
CSV_FILE_PATH = 'freight_data_hume_highway.csv'

In [4]:
def get_weather_data(station_id):
    """
    Fetches the latest weather observation data from a specific BoM station.

    Args:
        station_id: Station identifier of the form "<product>.<station>", e.g.
            "IDV60901.95936". The text before the first "." is used as the
            directory component of the request URL.

    Returns:
        dict: On success, the keys ``location``, ``time_utc``, ``temperature``,
        ``rainfall_since_9am``, ``wind_speed_kmh`` and ``wind_direction``,
        copied from the first record of ``observations.data`` (the feed is
        assumed to list the newest record first; missing fields come back as
        None). On failure, ``{"error": "<message>"}``: network, HTTP and
        payload-shape problems are reported this way rather than raised.
    """
    url = f"http://www.bom.gov.au/fwo/{station_id.split('.')[0]}/{station_id}.json"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        return {"error": f"Failed to fetch weather data: {e}"}
    except ValueError as e:
        # Older requests releases raise a plain ValueError for a non-JSON body.
        return {"error": f"Could not parse weather data structure: {e}"}

    # Walk the payload defensively: a missing key means "no data", while a
    # wrong type means the feed changed shape.
    observations = data.get("observations", {}) if isinstance(data, dict) else None
    records = observations.get("data", []) if isinstance(observations, dict) else None
    if not isinstance(records, list):
        return {"error": "Could not parse weather data structure: unexpected payload shape"}

    # Check for emptiness BEFORE indexing so the caller gets the intended
    # message instead of an IndexError-derived one.
    if not records or not records[0]:
        return {"error": "No observation data found."}

    latest_obs = records[0]
    if not isinstance(latest_obs, dict):
        return {"error": "Could not parse weather data structure: observation is not an object"}

    return {
        "location": latest_obs.get("name"),
        "time_utc": latest_obs.get("aifstime_utc"),
        "temperature": latest_obs.get("air_temp"),
        "rainfall_since_9am": latest_obs.get("rain_trace"),
        "wind_speed_kmh": latest_obs.get("wind_spd_kmh"),
        "wind_direction": latest_obs.get("wind_dir"),
    }

In [5]:
def get_traffic_data(bounding_box: str):
    """
    Fetch currently-valid traffic incidents inside a bounding box from the
    TomTom Traffic Incident Details service (v5).

    Args:
        bounding_box: "west,south,east,north" as four comma-separated numbers,
            e.g. '144.90,-38.10,145.30,-37.60'. Must satisfy west < east and
            south < north.

    Returns:
        list[dict]: One flattened record per incident with the keys
        ``category``, ``description`` (first event only), ``from``, ``to``,
        ``road_numbers``, ``incident_length_m``, ``incident_delay`` and
        ``incident_magnitude_delay``.
        dict: ``{"error": ...}`` when the key is missing, the bbox is invalid
        or the request fails; ``{"status": ...}`` when no incidents came back.
        Callers must therefore check ``isinstance(result, list)``.

    The API key is read from the TOMTOM_API_KEY environment variable and is
    masked in every error message, because the results are printed and
    serialised downstream.
    """
    api_key = os.getenv("TOMTOM_API_KEY")
    if not api_key:
        return {"error": "TomTom API key not found. Please check your .env file."}

    def _mask(text) -> str:
        # requests embeds the full URL (including ?key=...) in exception text.
        return str(text).replace(api_key, "****")

    try:
        parts = [float(x) for x in bounding_box.split(",")]
    except (AttributeError, TypeError, ValueError):
        return {"error": "bbox must be numeric like '144.90,-38.10,145.30,-37.60'."}
    if len(parts) != 4:
        return {"error": "bbox must be 'west,south,east,north' with 4 numeric values."}
    west, south, east, north = parts
    if west >= east or south >= north:
        return {"error": "bbox order invalid. Expected west < east and south < north."}

    base_url = "https://api.tomtom.com/traffic/services/5/incidentDetails"

    # Minimal valid projection (one line, no newlines or spaces)
    fields_raw = "{incidents{type,geometry{type,coordinates},properties{id,iconCategory,length,magnitudeOfDelay,delay,roadNumbers,from,to,events{description,code}}}}"
    fields_encoded = quote(fields_raw, safe="")

    url = (
        f"{base_url}?key={api_key}"
        f"&bbox={bounding_box}"
        f"&fields={fields_encoded}"
        f"&language=en-GB"
        f"&timeValidityFilter=present"
    )

    try:
        resp = requests.get(url, timeout=20, headers={"Accept": "application/json"})
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.HTTPError as e:
        return {"error": f"Failed to fetch TomTom traffic data: {_mask(e)}. url: {_mask(url)}"}
    except requests.exceptions.RequestException as e:
        return {"error": f"Network error contacting TomTom: {_mask(e)}"}
    except Exception as e:
        # Deliberate best-effort: one failing location must not abort the run.
        return {"error": f"Unexpected error: {_mask(e)}"}

    raw_incidents = data.get("incidents") if isinstance(data, dict) else None

    incidents_out = []
    for inc in raw_incidents or []:
        if not isinstance(inc, dict):
            continue
        props = inc.get("properties", {}) or {}
        events = props.get("events") or []
        desc = None
        if events and isinstance(events, list) and isinstance(events[0], dict):
            desc = events[0].get("description")
        incidents_out.append({
            "category": props.get("iconCategory"),
            "description": desc,
            "from": props.get("from"),
            "to": props.get("to"),
            "road_numbers": props.get("roadNumbers"),
            "incident_length_m": props.get("length"),
            "incident_delay": props.get("delay"),
            "incident_magnitude_delay": props.get("magnitudeOfDelay"),
        })

    if not incidents_out:
        return {"status": "No incidents returned for this area."}

    return incidents_out

In [None]:
import requests
from typing import List, Tuple, Optional, Dict, Any

def get_travel_time_osrm(
    origin: Tuple[float, float],
    destination: Tuple[float, float],
    waypoints: Optional[List[Tuple[float, float]]] = None,
    *,
    optimize_order: bool = False,
    server: str = "https://router.project-osrm.org"
) -> Dict[str, Any]:
    """
    Compute driving time with OSRM. origin/waypoints/destination are (lat, lon).
    If optimize_order=True, uses OSRM's trip service (TSP) with fixed start & end.
    Returns seconds and per-leg durations; no live traffic.

    Args:
        origin: (lat, lon) of the start point.
        destination: (lat, lon) of the end point.
        waypoints: Optional intermediate (lat, lon) stops, in input order.
        optimize_order: When True, call the ``trip`` service so OSRM may
            reorder the intermediates; otherwise call ``route`` and keep the
            given order.
        server: Base URL of the OSRM server.

    Returns:
        dict with ``total_duration_sec``, ``distance_meters``,
        ``leg_durations_sec``, ``num_waypoints`` and
        ``optimized_intermediate_order``. The last key is None for the
        fixed-order service or when there are no intermediates; otherwise it
        lists the 0-based indices into ``waypoints`` in the order OSRM visits
        them (e.g. ``[1, 0, 2]`` means the second waypoint is visited first).
        If OSRM returns no route/trip, ``{"error": "<message>"}``.

    Raises:
        requests.exceptions.RequestException: on network failure or a non-2xx
            HTTP status (``raise_for_status``).
    """
    coords = [origin] + list(waypoints or []) + [destination]
    # OSRM expects lon,lat order
    lonlat = ";".join(f"{lon:.6f},{lat:.6f}" for lat, lon in coords)

    if optimize_order:
        # Fix start=first and end=last, let OSRM reorder the intermediates
        service, result_key, not_found = "trip", "trips", "No trip found"
        query = "source=first&destination=last&roundtrip=false&overview=false&steps=false"
    else:
        service, result_key, not_found = "route", "routes", "No route found"
        query = "overview=false&steps=false&annotations=duration"

    r = requests.get(f"{server}/{service}/v1/driving/{lonlat}?{query}", timeout=20)
    r.raise_for_status()
    data = r.json()

    candidates = data.get(result_key, [])
    if not candidates:
        return {"error": data.get("message", not_found)}
    best = candidates[0]
    leg_secs = [int(leg.get("duration", 0)) for leg in best.get("legs", [])]

    optimized_order = None
    if optimize_order:
        # The response's "waypoints" array is in INPUT order; each entry's
        # "waypoint_index" is its position within the optimized trip. Sorting
        # the intermediates by that position yields the visiting order.
        last_input = len(coords) - 1
        visits = []
        for input_idx, wp in enumerate(data.get("waypoints", [])):
            if not 0 < input_idx < last_input:
                continue  # origin and destination are pinned
            if wp.get("trips_index", 0) != 0:
                continue
            position = wp.get("waypoint_index")
            visits.append((input_idx if position is None else position, input_idx - 1))
        optimized_order = [intermediate for _, intermediate in sorted(visits)] or None

    return {
        "total_duration_sec": int(best.get("duration", 0)),
        "distance_meters": int(best.get("distance", 0)),
        "leg_durations_sec": leg_secs,
        "num_waypoints": len(waypoints or []),
        "optimized_intermediate_order": optimized_order,
    }


In [41]:
if __name__ == "__main__":
    # Ingestion driver. For every route in WAYPOINTS it:
    #   1. asks the public OSRM server for one fixed-order route
    #      melbourne -> waypoints -> sydney (no live traffic),
    #   2. fetches weather and traffic for every node on the route,
    #   3. attaches inbound/outbound leg metrics and cumulative times,
    # and stores everything in `all_data[route][location]`, plus a
    # "__route_summary__" entry per route. Later cells read `all_data`.
    load_dotenv()
    print("--- Agentic AI Logistics Planner: Multi-Route Data Ingestion Script ---")
    print(f"--- Running at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n")

    all_data = {}

    # Melbourne and Sydney are on every route. Fetch each distinct location
    # once per run so both routes see the same snapshot and the public APIs
    # are not hit twice. The cached dict/list objects are shared between
    # routes; downstream code only reads them.
    location_feeds = {}

    for route_name, route_wps in WAYPOINTS.items():
        print(f"\nFetching Data for: {route_name.replace('_',' ')}")
        print("=" * 40)
        all_data[route_name] = {}

        # lookup dict only (unordered is fine for lookups)
        full_route_locations = {**END_POINTS, **route_wps}

        # ordered node list for segment math
        node_names = ["melbourne"] + list(route_wps.keys()) + ["sydney"]

        # origin / intermediates / destination in declared order
        origin = (END_POINTS["melbourne"]["lat"], END_POINTS["melbourne"]["lon"])
        destination = (END_POINTS["sydney"]["lat"], END_POINTS["sydney"]["lon"])
        waypoints_list = [(route_wps[n]["lat"], route_wps[n]["lon"]) for n in route_wps.keys()]

        # ---- Free routing (OSRM public server). Fixed order; no traffic. ----
        # Called directly (not via get_travel_time_osrm) because per-leg
        # distances are needed as well as durations.
        coords = [origin] + waypoints_list + [destination]
        lonlat = ";".join(f"{lon:.6f},{lat:.6f}" for (lat, lon) in coords)
        osrm_url = f"https://router.project-osrm.org/route/v1/driving/{lonlat}?overview=false&steps=false"

        leg_secs, leg_dists = [], []
        route_travel = {}
        try:
            rr = requests.get(osrm_url, timeout=30)
            rr.raise_for_status()
            osrm = rr.json()
            routes = osrm.get("routes") or []
            if routes:
                route = routes[0]
                leg_secs  = [int(leg.get("duration", 0)) for leg in route.get("legs", [])]
                leg_dists = [int(leg.get("distance", 0)) for leg in route.get("legs", [])]
                route_travel = {
                    "total_duration_sec": int(route.get("duration", 0)),
                    "distance_meters": int(route.get("distance", 0)),
                    "leg_durations_sec": leg_secs,
                    "leg_distances_meters": leg_dists
                }
            else:
                route_travel = {"error": osrm.get("message", "No route returned")}
        except Exception as e:
            # Best-effort: a routing failure is recorded, not raised, so the
            # weather/traffic data for this route is still collected.
            route_travel = {"error": f"OSRM call failed: {e}"}

        # cumulative time to each node. Left EMPTY when routing failed so that
        # every node reports None (unknown) instead of the origin claiming 0.
        cum_time = [0] if leg_secs else []
        for s in leg_secs:
            cum_time.append(cum_time[-1] + s)

        # ---- Per-location fetch + segment attachment ----
        for idx, loc_name in enumerate(node_names):
            print(f"  Processing Location: {loc_name.capitalize()}")
            cfg = full_route_locations[loc_name]
            lat, lon = cfg["lat"], cfg["lon"]

            # Square box of +/- bbox_size degrees around the node.
            bbox_size = 0.25
            min_lon, min_lat = lon - bbox_size, lat - bbox_size
            max_lon, max_lat = lon + bbox_size, lat + bbox_size
            location_bbox = f"{min_lon},{min_lat},{max_lon},{max_lat}"

            feed_key = (cfg["weather_station_id"], lat, lon)
            if feed_key not in location_feeds:
                location_feeds[feed_key] = {
                    "weather": get_weather_data(cfg["weather_station_id"]),
                    "traffic": get_traffic_data(location_bbox),
                }
            feeds = location_feeds[feed_key]

            all_data[route_name][loc_name] = {}
            all_data[route_name][loc_name]["weather"] = feeds["weather"]
            all_data[route_name][loc_name]["traffic"] = feeds["traffic"]

            # segment info for this node
            inbound = None if idx == 0 else {
                "from": node_names[idx - 1],
                "to": loc_name,
                "duration_sec": leg_secs[idx - 1] if idx - 1 < len(leg_secs) else None,
                "distance_m":  leg_dists[idx - 1] if idx - 1 < len(leg_dists) else None,
            }
            outbound = None if idx == len(node_names) - 1 else {
                "from": loc_name,
                "to": node_names[idx + 1],
                "duration_sec": leg_secs[idx] if idx < len(leg_secs) else None,
                "distance_m":  leg_dists[idx] if idx < len(leg_dists) else None,
            }

            all_data[route_name][loc_name]["segment"] = {
                "inbound": inbound,
                "outbound": outbound,
                "cumulative_time_to_here_sec": cum_time[idx] if idx < len(cum_time) else None,
                "remaining_time_from_here_sec": (cum_time[-1] - cum_time[idx]) if idx < len(cum_time) else None,
            }

            # keep this for backward-compat with your processing
            all_data[route_name][loc_name]["route_travel"] = route_travel

        # route-level summary once
        all_data[route_name]["__route_summary__"] = {
            "total_duration_sec": route_travel.get("total_duration_sec"),
            "total_distance_meters": route_travel.get("distance_meters"),
            "num_waypoints": len(waypoints_list),
        }

    print("\n\n--- COMPLETE DATA PAYLOAD ---")
    print(json.dumps(all_data, indent=2))


--- Agentic AI Logistics Planner: Multi-Route Data Ingestion Script ---
--- Running at: 2025-08-12 17:59:31 ---


Fetching Data for: Hume Highway
  Processing Location: Melbourne
  Processing Location: Albury
  Processing Location: Goulburn
  Processing Location: Sydney

Fetching Data for: Inland Route
  Processing Location: Melbourne
  Processing Location: Shepparton
  Processing Location: Dubbo
  Processing Location: Sydney


--- COMPLETE DATA PAYLOAD ---
{
  "Hume_Highway": {
    "melbourne": {
      "weather": {
        "location": "Melbourne (Olympic Park)",
        "time_utc": "20250812073000",
        "temperature": 10.8,
        "rainfall_since_9am": "3.0",
        "wind_speed_kmh": 13,
        "wind_direction": "N"
      },
      "traffic": [
        {
          "category": 8,
          "description": "Closed",
          "from": "Hacketts Road / Bowral Avenue",
          "to": "Tamworth Grove",
          "road_numbers": [],
          "incident_length_m": 126.716,
          "in

In [32]:
def process_data(raw_data: dict) -> pd.DataFrame:
    """
    Processes the raw nested dictionary, creating a detailed record for each
    traffic incident, combined with the weather data for that location.

    Args:
        raw_data: ``{route_name: {location_name: payload}}`` as produced by the
            ingestion cell. Each payload may carry ``weather`` (dict),
            ``traffic`` (list of incident dicts, or a status/error dict) and
            ``route_travel`` (dict with ``distance_meters`` and
            ``total_duration_sec``). Entries without a non-empty incident list,
            such as "__route_summary__", contribute no rows.

    Returns:
        pd.DataFrame: One row per incident, indexed by the weather observation
        timestamp (parsed from ``time_utc``) when any rows exist. Route totals
        are read from the payload's own ``route_travel``, so each row carries
        the totals of the route it belongs to.
    """
    print("--- Starting data processing and feature engineering ---")

    incident_records = []

    # Loop through the nested structure
    for route_name, locations in raw_data.items():
        for location_name, data in locations.items():
            if not isinstance(data, dict):
                continue

            traffic_incidents = data.get('traffic')
            # A dict here is a "no incidents"/error marker, not incident data.
            if not isinstance(traffic_incidents, list) or not traffic_incidents:
                continue

            weather_info = data.get('weather') or {}
            # Totals come from THIS location's payload, never from a notebook
            # global left behind by the ingestion loop.
            route_travel = data.get('route_travel') or {}

            # --- Weather Feature Engineering (once per location) ---
            try:
                rainfall = float(weather_info.get('rainfall_since_9am', 0.0))
            except (ValueError, TypeError):
                # Non-numeric or missing readings count as no rain.
                rainfall = 0.0

            # --- Time Feature Engineering (once per location) ---
            record_time_utc_str = weather_info.get('time_utc')
            record_dt, day_of_week, hour_of_day = None, None, None
            if record_time_utc_str:
                try:
                    record_dt = datetime.strptime(record_time_utc_str, '%Y%m%d%H%M%S')
                    day_of_week = record_dt.weekday()
                    hour_of_day = record_dt.hour
                except (ValueError, TypeError):
                    # Unparseable timestamp: keep the row, leave time features empty.
                    record_dt, day_of_week, hour_of_day = None, None, None

            # --- Create a new row for EACH traffic incident ---
            for incident in traffic_incidents:
                if not isinstance(incident, dict):
                    continue
                incident_records.append({
                    'timestamp': record_dt,
                    'route': route_name,
                    'from_location': location_name,
                    'temperature': weather_info.get('temperature'),
                    'is_raining': rainfall > 0.1,
                    'wind_speed_kmh': weather_info.get('wind_speed_kmh'),
                    'day_of_week': day_of_week,
                    'hour_of_day': hour_of_day,
                    # --- Add specific incident details ---
                    'incident_category': incident.get('category'),
                    'incident_description': incident.get('description'),
                    'incident_from': incident.get('from'),
                    'incident_to': incident.get('to'),
                    'incident_length_m': incident.get('incident_length_m'),
                    'incident_magnitude_delay': incident.get('incident_magnitude_delay'),
                    'total_distance_meters': route_travel.get('distance_meters'),
                    'total_time_sec': route_travel.get('total_duration_sec'),
                })

    # Create a Pandas DataFrame from the list of incident records
    df = pd.DataFrame(incident_records)

    # Set the timestamp as the index for time-series analysis
    if 'timestamp' in df.columns:
        df = df.set_index('timestamp')

    print("--- Data processing complete! ---")
    return df

In [42]:
import pandas as pd
import math
from datetime import datetime

def build_leg_level_df(all_data: dict) -> pd.DataFrame:
    """
    Flatten the ingestion payload into ONE row per leg (route, from_node -> to_node).

    Nodes of a route are ordered by their cumulative travel time (nodes without
    one sort last) and consecutive nodes form a leg. Base leg metrics come from
    the FROM node's outbound segment; incidents and weather fetched for the
    FROM node are attributed to that outbound leg. Rows are stamped with the
    local time at which this function runs, and the frame is indexed by it.
    """
    stamp = datetime.now()
    weekday_now = stamp.weekday()
    hour_now = stamp.hour

    def _as_node(name, payload):
        # Bundle what the leg builder needs for one location.
        segment = payload.get("segment") or {}
        return {
            "name": name,
            "cum_sec": segment.get("cumulative_time_to_here_sec"),
            "payload": payload,
            "seg": segment,
        }

    def _cum_or_inf(node):
        # Unknown cumulative time sorts after every known one.
        return math.inf if node["cum_sec"] is None else node["cum_sec"]

    records = []
    for route, locations in all_data.items():
        route_summary = locations.get("__route_summary__", {}) or {}
        route_total_sec = route_summary.get("total_duration_sec")
        route_total_m = route_summary.get("total_distance_meters")

        ordered = sorted(
            (_as_node(name, payload)
             for name, payload in locations.items()
             if name != "__route_summary__"),
            key=_cum_or_inf,
        )

        # Consecutive pairs: node i -> node i+1
        for leg_index, (src, dst) in enumerate(zip(ordered, ordered[1:])):
            outbound = (src["seg"] or {}).get("outbound") or {}
            leg_sec = outbound.get("duration_sec")
            leg_m = outbound.get("distance_m")

            # A dict here is the "No incidents"/error shape, i.e. no incident list.
            fetched = src["payload"].get("traffic") or []
            incidents = [] if isinstance(fetched, dict) else fetched
            usable = [item for item in incidents if isinstance(item, dict)]

            delay_total = sum(max(0, (item.get("incident_delay") or 0)) for item in usable)
            length_total = sum(
                max(0.0, float(item.get("incident_length_m") or 0.0)) for item in usable
            )

            if leg_sec is None:
                adjusted_sec = None
            else:
                adjusted_sec = (leg_sec or 0) + delay_total

            wx = src["payload"].get("weather") or {}

            records.append({
                # keys
                "timestamp": stamp,
                "route": route,
                "leg_index": leg_index,
                "from_node": src["name"],
                "to_node": dst["name"],

                # route context
                "route_total_duration_sec": route_total_sec,
                "route_total_distance_m": route_total_m,

                # router-provided leg metrics
                "leg_base_duration_sec": leg_sec,
                "leg_base_distance_m": leg_m,

                # incidents attributed to this leg
                "leg_incident_count": len(incidents),
                "leg_incident_delay_sum_sec": delay_total,
                "leg_incident_length_sum_m": length_total,

                # base duration plus summed incident delay
                "leg_total_duration_sec": adjusted_sec,

                # weather observed at the FROM node
                "wx_location": wx.get("location"),
                "wx_time_utc": wx.get("time_utc"),
                "wx_temp_c": wx.get("temperature"),
                "wx_rain_mm": wx.get("rainfall_since_9am"),
                "wx_wind_kmh": wx.get("wind_speed_kmh"),
                "wx_wind_dir": wx.get("wind_direction"),

                # time features (of this run, not of the observation)
                "day_of_week": weekday_now,
                "hour_of_day": hour_now,
            })

    frame = pd.DataFrame(records)
    if "timestamp" in frame.columns:
        frame = frame.set_index("timestamp")

    # Convenience columns in friendlier units; skipped for an empty frame.
    if "leg_base_distance_m" in frame.columns:
        frame["leg_distance_km"] = frame["leg_base_distance_m"] / 1000.0
    if "leg_total_duration_sec" in frame.columns:
        frame["leg_total_duration_hr"] = frame["leg_total_duration_sec"] / 3600.0

    return frame


In [43]:
# Build the one-row-per-leg frame from the `all_data` payload assembled by the ingestion cell.
df = build_leg_level_df(all_data)

In [35]:
# Route totals in friendlier units. The leg-level frame from build_leg_level_df
# names them route_total_duration_sec / route_total_distance_m, while the
# incident-level frame from process_data uses total_time_sec /
# total_distance_meters; accept either so this cell survives Restart & Run All.
# to_numeric(..., errors='coerce') turns missing totals (routing failed) into NaN.
_time_col = 'total_time_sec' if 'total_time_sec' in df.columns else 'route_total_duration_sec'
_dist_col = 'total_distance_meters' if 'total_distance_meters' in df.columns else 'route_total_distance_m'
df['total_time_hr'] = pd.to_numeric(df[_time_col], errors='coerce')/3600
df['total_distance_km'] = pd.to_numeric(df[_dist_col], errors='coerce')/1000

In [50]:
# Show every column of the wide frame. Note this changes the pandas display
# option for the rest of the kernel session, not just for this cell.
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0_level_0,route,leg_index,from_node,to_node,route_total_duration_sec,route_total_distance_m,leg_base_duration_sec,leg_base_distance_m,leg_incident_count,leg_incident_delay_sum_sec,leg_incident_length_sum_m,leg_total_duration_sec,wx_location,wx_time_utc,wx_temp_c,wx_rain_mm,wx_wind_kmh,wx_wind_dir,day_of_week,hour_of_day,leg_distance_km,leg_total_duration_hr
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2025-08-12 18:03:48.768228,Hume_Highway,0,melbourne,albury,37561,867226,13826,313516,110,4112,65841.264807,17938,Melbourne (Olympic Park),20250812073000.0,10.8,3.0,13.0,N,1,18,313.516,4.982778
2025-08-12 18:03:48.768228,Hume_Highway,1,albury,goulburn,37561,867226,14950,357845,2,0,429.132801,14950,,,,,,,1,18,357.845,4.152778
2025-08-12 18:03:48.768228,Hume_Highway,2,goulburn,sydney,37561,867226,8784,195864,4,536,16670.217053,9320,Canberra,20250812073000.0,11.0,0.0,11.0,NNW,1,18,195.864,2.588889
2025-08-12 18:03:48.768228,Inland_Route,0,melbourne,shepparton,57734,1218183,8466,178743,110,4112,65841.264807,12578,Melbourne (Olympic Park),20250812073000.0,10.8,3.0,13.0,N,1,18,178.743,3.493889
2025-08-12 18:03:48.768228,Inland_Route,1,shepparton,dubbo,57734,1218183,28661,637696,2,0,357.135651,28661,Shepparton,20250812073000.0,11.7,0.0,9.0,ENE,1,18,637.696,7.961389
