# Creating the graph from CSV data

In [1]:
# Santosh scraped data from SL API and constructed 'routes_master_data.csv'
# Wenhan wrote the code in this code cell
# Erik contributed to debugging
import math
import json
import pandas as pd
from tqdm import tqdm
import pickle

def time_to_seconds(time_str):
    """
    Converts a 'HH:MM:SS' string into seconds after midnight (integer).

    The hour field is not capped at 23, so a value such as '25:10:00'
    simply converts to more than one day's worth of seconds.

    Args:
        time_str: Time in 'HH:MM:SS' form, or a missing value.

    Returns:
        The number of seconds as an int, or None when the input is missing
        (None, NaN, pandas.NA, NaT or an empty string).

    Raises:
        ValueError: If a non-missing string does not consist of exactly
            three colon-separated integers.
    """
    # Check for missing values *before* truthiness: `not pd.NA` raises
    # TypeError ("boolean value of NA is ambiguous"), so the order matters.
    if time_str is None or pd.isna(time_str) or not time_str:
        return None

    hh, mm, ss = time_str.split(":")
    return int(hh) * 3600 + int(mm) * 60 + int(ss)

def build_traffic_graph(csv_path):
    """
    Build a directed stop-to-stop graph from the routes_master_data CSV.

    Rows are grouped by (Route_ID, Direction, Journey_ID) and ordered by
    'Order'. Every pair of consecutive stops within a journey contributes one
    travel-time sample (departure at the first stop to arrival at the next)
    to the edge between them. Once all journeys are read, the samples are
    averaged per transport mode.
    Scenic values are intentionally excluded from this graph.

    Args:
        csv_path: Location of the CSV, passed straight to pandas.read_csv.
            The columns read are Route_ID, Direction, Journey_ID, Order,
            'StopPlace Name', 'StopPlace Latitude', 'StopPlace Longitude',
            TransportMode, DepartureTime and ArrivalTime.

    Returns:
        A dict keyed by stop name. Each value has the form
        {"latitude": ..., "longitude": ...,
         "neighbors": {to_stop: {mode: mean_travel_seconds_or_None}}}.
        Coordinates come from the first row in which the stop was seen.
        A mode maps to None when no sample with both a departure and an
        arrival time was available for that edge.
    """
    graph = {}

    # Read the CSV into a DataFrame
    df = pd.read_csv(csv_path)

    # Group by Route, Direction, Journey for consecutive stops
    grouped = df.groupby(["Route_ID", "Direction", "Journey_ID"], dropna=False)

    for _, group in tqdm(grouped):
        # Order the stops of this journey and ignore rows without a stop name
        stops = group.sort_values("Order", ascending=True)
        stops = stops.dropna(subset=["StopPlace Name"])

        # Extract each column once per journey. Per-cell .loc lookups inside
        # the pair loop are far slower and return the same scalar values.
        names = stops["StopPlace Name"].to_numpy()
        modes = stops["TransportMode"].to_numpy()
        departures = stops["DepartureTime"].to_numpy()
        arrivals = stops["ArrivalTime"].to_numpy()
        lats = stops["StopPlace Latitude"].to_numpy()
        lons = stops["StopPlace Longitude"].to_numpy()

        # Pair stop i (the [:-1] slices) with stop i + 1 (the [1:] slices).
        # Mode and departure come from the first stop, arrival from the next.
        legs = zip(
            names[:-1], names[1:],
            modes[:-1],
            departures[:-1], arrivals[1:],
            lats[:-1], lons[:-1],
            lats[1:], lons[1:],
        )
        for (from_stop, to_stop, mode, dep_time_str, arr_time_str,
             from_lat, from_lon, to_lat, to_lon) in legs:
            if pd.isna(mode):
                mode = "unknown"

            dep_sec = time_to_seconds(dep_time_str)
            arr_sec = time_to_seconds(arr_time_str)

            # Handle potential day rollover: an arrival second-of-day that is
            # *earlier* than the departure is assumed to be on the next day.
            travel_time = None
            if dep_sec is not None and arr_sec is not None:
                if arr_sec < dep_sec:
                    arr_sec += 24 * 3600  # Add 24 hours in seconds
                travel_time = arr_sec - dep_sec

            # Create the nodes on first sight (origin first, then destination);
            # coordinates of an already known stop are never overwritten.
            origin = graph.setdefault(from_stop, {
                "latitude": from_lat,
                "longitude": from_lon,
                "neighbors": {},
            })
            graph.setdefault(to_stop, {
                "latitude": to_lat,
                "longitude": to_lon,
                "neighbors": {},
            })

            # The mode key is registered even when no time is available, so
            # the edge is still recorded (its average ends up as None).
            samples = origin["neighbors"].setdefault(to_stop, {}).setdefault(mode, [])
            if travel_time is not None:
                samples.append(travel_time)

    # Replace every sample list by its mean (None for an empty list).
    # Only existing keys are reassigned, which is safe while iterating.
    for data in graph.values():
        for edge_data in data["neighbors"].values():
            for mode, times_list in edge_data.items():
                edge_data[mode] = sum(times_list) / len(times_list) if times_list else None

    return graph


# Location of the scraped routes CSV
csv_path = "routes_master_data.csv"

# Construct the graph (scenic values are not part of it) and persist it
graph = build_traffic_graph(csv_path)
with open("graph.pickle", "wb") as file:
    pickle.dump(graph, file)

# Sanity check: show only the first stop together with its outgoing edges
# (nothing is printed when the graph is empty)
first_entry = next(iter(graph.items()), None)
if first_entry is not None:
    src_stop, data = first_entry
    print(f"Sample Stop: {src_stop}")
    print(f"  Latitude: {data['latitude']}, Longitude: {data['longitude']}")
    print("  Neighbors:")
    for dst_stop, edge_data in data["neighbors"].items():
        print(f"    -> {dst_stop}: {edge_data}")

100%|██████████| 66131/66131 [01:34<00:00, 702.99it/s] 


Sample Stop: Alvik
  Latitude: 59.333385, Longitude: 17.98016
  Neighbors:
    -> Tranebergsplan: {'bus': 65.4416135881104}
    -> Alviksvägen: {'bus': 120.8135593220339}
    -> Alléparken: {'tram': 60.0, 'bus': 104.5}
    -> Kristineberg: {'metro': 120.0}
    -> Stora mossen: {'metro': 120.0}
    -> Johannesfred: {'tram': 180.0, 'bus': 387.90697674418607}
    -> Alviks strand: {'tram': 105.0}
    -> Lintavägen: {'bus': 392.3720930232558}
