In [4]:
import asyncio
import google.generativeai as genai
import os
import json
import pickle
import math
import numpy as np
from tqdm.asyncio import tqdm_asyncio
from pydantic import BaseModel
from time import perf_counter, sleep
from dotenv import load_dotenv
load_dotenv()

# Configure the Gemini client with the API key read from the environment
# (load_dotenv() above may have populated it from a .env file). A missing
# GEMINI_API_KEY raises KeyError on this line.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
# Shared model handle; both assign_scenic_description and assign_scenic_value
# send their requests through it.
model = genai.GenerativeModel("gemini-1.5-flash-8b")
        
# Schema for the structured reply requested in assign_scenic_value, where this
# class is passed as response_schema.
# NOTE(review): documented with '#' comments on purpose — a class docstring may be
# surfaced in the schema pydantic generates; confirm before adding one.
class ScenicValue(BaseModel):
    # The single numeric score read back from the JSON reply under "scenic_value".
    scenic_value: float

async def assign_scenic_description(stop_name, season="summer"):
    """
    Ask the Gemini model for a scenic description of a stop.

    Args:
        stop_name: Name of the stop; interpolated into the prompt context.
        season: Season named in the prompt context (defaults to "summer").

    Returns:
        The `.text` of the model's response.

    Note:
        Reads the module-level names `model` and `system_prompt1`; both must
        exist before this coroutine is awaited.
    """
    prompt_parts = [
        "Respond using a list.",
        f"Instructions: '{system_prompt1}'",
        f"Context: The stop name is '{stop_name}'. The current season is '{season}'.",
    ]
    generation_settings = genai.GenerationConfig(max_output_tokens=500, temperature=0)
    response = await model.generate_content_async(
        contents=prompt_parts,
        generation_config=generation_settings,
    )
    return response.text

async def assign_scenic_value(description):
    """
    Ask the Gemini model to turn a scenic description into a numeric score.

    The request asks for a JSON reply shaped by the `ScenicValue` schema, and
    the "scenic_value" entry of that reply is returned.

    Args:
        description: Text placed in the prompt as context.

    Returns:
        The value stored under "scenic_value" in the parsed JSON reply.

    Note:
        Reads the module-level names `model` and `system_prompt2`; both must
        exist before this coroutine is awaited.
    """
    prompt_parts = [
        "Respond with one number with one decimal.",
        f"Instructions: {system_prompt2}",
        f"Context: {description}",
    ]
    json_settings = genai.GenerationConfig(
        response_mime_type='application/json',
        response_schema=ScenicValue,
        max_output_tokens=50,
        temperature=0,
    )
    response = await model.generate_content_async(
        contents=prompt_parts,
        generation_config=json_settings,
    )
    payload = json.loads(response.text)
    return payload["scenic_value"]

async def evaluate(stop_name):
    """
    Score one stop: describe it first, then rate that description.

    Args:
        stop_name: Name of the stop to score.

    Returns:
        A single-entry dict `{stop_name: scenic_value}`.
    """
    scenic_text = await assign_scenic_description(stop_name)
    return {stop_name: await assign_scenic_value(scenic_text)}

async def dummy_evaluate(stop_name):
    """
    Return a random score for a stop without calling the model.

    Produces the same `{stop_name: value}` shape as `evaluate`, with the value
    drawn from `np.random.randint(0, 10)` (an integer from 0 to 9).
    """
    return {stop_name: np.random.randint(0, 10)}

def split_list_into_chunks(original_list, max_chunk_size=2000):
    """
    Split a list into the fewest near-equal chunks of at most `max_chunk_size` items.

    The number of chunks is ceil(len / max_chunk_size). Items are then spread as
    evenly as possible: when the length does not divide evenly, the earliest
    chunks each take one extra item. Element order is preserved.

    Args:
        original_list: Sequence to split (must support len() and slicing).
        max_chunk_size: Upper bound on the size of every chunk; must be >= 1.

    Returns:
        A list of slices of `original_list`; an empty list for empty input.

    Raises:
        ValueError: If `max_chunk_size` is less than 1.
    """
    if max_chunk_size < 1:
        # A zero size divides by zero and a negative one yields no chunks at all,
        # silently dropping every item.
        raise ValueError(f"max_chunk_size must be at least 1, got {max_chunk_size}")

    length = len(original_list)
    if length == 0:
        # Zero chunks would make the division below raise ZeroDivisionError.
        return []

    num_chunks = math.ceil(length / max_chunk_size)
    base_chunk_size, remainder = divmod(length, num_chunks)

    chunks = []
    start = 0
    for i in range(num_chunks):
        # The first `remainder` chunks absorb the leftover items, one each.
        end = start + base_chunk_size + (1 if i < remainder else 0)
        chunks.append(original_list[start:end])
        start = end

    return chunks

# Load the stop graph from disk. It is expected to be a mapping keyed by stop
# name (the code below calls .keys() and .items() on it).
# NOTE: pickle.load can execute arbitrary code while deserializing, so only open
# pickle files that come from a trusted source.
with open('graph_with_coordinates_and_neighbors.pickle', 'rb') as file:
    graph = pickle.load(file)

def sub_graph(data, n=2):
    """
    Return a new dict holding only the first `n` entries of `data`.

    Keys are taken in the mapping's own iteration order using the slice
    `[:n]`; the values are the same objects as in `data` (not copied).
    """
    subset = {}
    for key in list(data.keys())[:n]:
        subset[key] = data[key]
    return subset

graph = sub_graph(graph, n=len(graph)) # set n = len(data) for the full dataset
stop_names = list(graph.keys()) # A list of stops represented as strings
stop_name_chunks = split_list_into_chunks(stop_names)
stop_scenic_values = []

# Creates a list of dictionaries [{stop_name: scenic_value}]
for stop_name_chunk in stop_name_chunks:
    tasks = [evaluate(stop_name) for stop_name in stop_name_chunk]
    stop_scenic_values_chunk = await tqdm_asyncio.gather(*tasks)
    stop_scenic_values += stop_scenic_values_chunk
    # Saving
    with open('stop_scenic_values.json', 'w') as file:
        json.dump(stop_scenic_values, file)
    sleep(60)

# Map the scenic values onto each stop and onto its neighbor entries.
# First flatten the list of single-entry dicts [{stop_name: scenic_value}, ...]
# into one lookup table; a later entry for the same stop overwrites an earlier one.
scenic_values_dict = {}
for scenic_value in stop_scenic_values:
    scenic_values_dict.update(scenic_value)

for stop, info in graph.items():
    if stop in scenic_values_dict:
        info['scenic_value'] = scenic_values_dict[stop]
    # A neighbor entry receives the neighbor's own score, not the score of `stop`.
    for neighbor, neighbor_info in info.get('neighbors', {}).items():
        if neighbor in scenic_values_dict:
            neighbor_info['scenic_value'] = scenic_values_dict[neighbor]

# Save before the sanity print so that a failing check can never cost the results.
with open('graph_with_scenic_values.pickle', 'wb') as file:
    pickle.dump(graph, file)

# Sanity check on one known stop. It is guarded because the stop is absent when
# only a subset of the graph was processed.
sample_stop = "Campus Roslagen"
if sample_stop in graph:
    print(graph[sample_stop])
else:
    print(f"Sample stop {sample_stop!r} is not in the graph; skipping the sanity print.")

100%|██████████| 1432/1432 [00:00<00:00, 67451.78it/s]
100%|██████████| 1432/1432 [00:00<00:00, 56055.58it/s]
100%|██████████| 1431/1431 [00:00<00:00, 46979.46it/s]


{'neighbors': {'Gustavslund': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 7}, 'Astrid Lindgrens gata': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 4}, 'Backtorp': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 5}, 'Norrtälje busstation': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 2}, 'Malsta vägskäl': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 6}, 'Stockholmsvägen': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 5}, 'Södra Lohärad': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 4}, 'Rösa trafikplats': {'longitude': 18.685677, 'latitude': 59.748096, 'transport_mode': 'bus', 'scenic_value': 2}}, 'latitude': 59.748096, 'longitude': 18.685677, 'scenic_value': 5}


# Creating the graph from CSV data

In [1]:
# Santosh scraped data from SL API and constructed 'routes_master_data.csv'
# Wenhan wrote the code in this code cell
# Erik contributed to debugging
import math
import json
import pandas as pd
from tqdm import tqdm
import pickle

def time_to_seconds(time_str):
    """
    Convert an 'HH:MM:SS' string to whole seconds since midnight.

    Missing input (None, an empty string, NaN) yields None. The hour field is
    not range-checked, so a value such as '24:00:10' converts to 86410.
    """
    is_missing = (not time_str) or pd.isna(time_str)
    if is_missing:
        return None

    hours, minutes, seconds = time_str.split(":")
    total = 3600 * int(hours)
    total += 60 * int(minutes)
    total += int(seconds)
    return total

def _segment_travel_time(dep_time_str, arr_time_str):
    # Seconds from departure to arrival, or None when either timestamp is missing.
    dep_sec = time_to_seconds(dep_time_str)
    arr_sec = time_to_seconds(arr_time_str)
    if dep_sec is None or arr_sec is None:
        return None
    if arr_sec < dep_sec:
        # An arrival earlier in the day than the departure is assumed to be on the next day.
        arr_sec += 24 * 3600
    return arr_sec - dep_sec


def _average_travel_times(graph):
    # Replace every edge's list of samples with its mean (None when nothing was recorded).
    for data in graph.values():
        for edge_data in data["neighbors"].values():
            for mode, times_list in edge_data.items():
                if isinstance(times_list, list):  # Only process if it is a list
                    edge_data[mode] = sum(times_list) / len(times_list) if times_list else None


def build_traffic_graph(csv_path):
    """
    Build a stop-to-stop graph from the routes_master_data CSV.
    Scenic values are intentionally excluded from this graph.

    Rows are grouped by (Route_ID, Direction, Journey_ID), ordered by 'Order',
    and rows without a 'StopPlace Name' are dropped. Every pair of consecutive
    stops in a journey contributes a directed edge from the earlier stop to
    the later one.

    Args:
        csv_path: Path of the CSV file. The columns read are Route_ID,
            Direction, Journey_ID, Order, 'StopPlace Name', TransportMode,
            DepartureTime, ArrivalTime, 'StopPlace Latitude' and
            'StopPlace Longitude'.

    Returns:
        A dict of the form
            {stop: {"latitude": ..., "longitude": ...,
                    "neighbors": {next_stop: {mode: mean_seconds_or_None}}}}
        A stop's coordinates come from the first row in which it is seen.
        A missing TransportMode is stored under "unknown". A travel time is
        the departure at the earlier stop to the arrival at the later one,
        and each edge/mode holds the mean of all samples, or None when no
        sample had both timestamps.
    """
    graph = {}

    df = pd.read_csv(csv_path)

    # One group per journey, so that consecutive rows are consecutive stops.
    grouped = df.groupby(["Route_ID", "Direction", "Journey_ID"], dropna=False)

    for _, group in tqdm(grouped):
        group_sorted = group.sort_values("Order", ascending=True)
        group_sorted = group_sorted.dropna(subset=["StopPlace Name"])

        # Pull each column out once per journey; per-cell .loc lookups inside the
        # inner loop dominate the runtime on a file with many journeys.
        names = group_sorted["StopPlace Name"].to_numpy()
        modes = group_sorted["TransportMode"].to_numpy()
        dep_times = group_sorted["DepartureTime"].to_numpy()
        arr_times = group_sorted["ArrivalTime"].to_numpy()
        lats = group_sorted["StopPlace Latitude"].to_numpy()
        lons = group_sorted["StopPlace Longitude"].to_numpy()

        for i in range(len(names) - 1):
            from_stop = names[i]
            to_stop = names[i + 1]

            # The mode of a segment is the one recorded on its departure row.
            mode = modes[i]
            if pd.isna(mode):
                mode = "unknown"

            travel_time = _segment_travel_time(dep_times[i], arr_times[i + 1])

            # Initialize nodes if needed
            if from_stop not in graph:
                graph[from_stop] = {
                    "latitude": lats[i],
                    "longitude": lons[i],
                    "neighbors": {}
                }
            if to_stop not in graph:
                graph[to_stop] = {
                    "latitude": lats[i + 1],
                    "longitude": lons[i + 1],
                    "neighbors": {}
                }

            # The edge and its per-mode sample list are created even when this
            # segment has no usable travel time, so the mode still appears (as None).
            edge = graph[from_stop]["neighbors"].setdefault(to_stop, {})
            samples = edge.setdefault(mode, [])
            if travel_time is not None:
                samples.append(travel_time)

    _average_travel_times(graph)
    return graph


# Path to the CSV
csv_path = "routes_master_data.csv"

# Build the graph without scenic values and persist it
graph = build_traffic_graph(csv_path)
with open("graph.pickle", "wb") as file:
    pickle.dump(graph, file)

# Print a single sample stop (the first one in the graph) and its neighbors to
# verify the structure; nothing is printed for an empty graph.
first_entry = next(iter(graph.items()), None)
if first_entry is not None:
    src_stop, data = first_entry
    print(f"Sample Stop: {src_stop}")
    print(f"  Latitude: {data['latitude']}, Longitude: {data['longitude']}")
    print("  Neighbors:")
    for dst_stop, edge_data in data["neighbors"].items():
        print(f"    -> {dst_stop}: {edge_data}")

100%|██████████| 66131/66131 [01:34<00:00, 702.99it/s] 


Sample Stop: Alvik
  Latitude: 59.333385, Longitude: 17.98016
  Neighbors:
    -> Tranebergsplan: {'bus': 65.4416135881104}
    -> Alviksvägen: {'bus': 120.8135593220339}
    -> Alléparken: {'tram': 60.0, 'bus': 104.5}
    -> Kristineberg: {'metro': 120.0}
    -> Stora mossen: {'metro': 120.0}
    -> Johannesfred: {'tram': 180.0, 'bus': 387.90697674418607}
    -> Alviks strand: {'tram': 105.0}
    -> Lintavägen: {'bus': 392.3720930232558}
