# Graph Creation and Weighting

This notebook performs the following steps:
1.  Loads the base subway graph from a GML file.
2.  Loads subway station location data.
3.  Calculates the real-world distance (in kilometers) for each edge (connection) in the graph using the Haversine formula.
4.  Adds this distance as a `distance` attribute to each edge.
5.  Saves the final, weighted graph to a new GML file for use in analysis notebooks.

In [35]:
import networkx as nx
import matplotlib.pyplot as plt

# Load the base GML file 
# This file represents the network topology without edge weights.
file_path = 'data/subway_graph_weekday_weekend.gml'
G_weights = nx.read_gml(file_path)
G_weights = G_weights.to_undirected()

# Display basic information about the graph
print(f"Graph loaded successfully from {file_path}")
print(f"Number of nodes: {G_weights.number_of_nodes()}")
print(f"Number of edges: {G_weights.number_of_edges()}")

Graph loaded successfully from data/subway_graph_weekday_weekend.gml
Number of nodes: 395
Number of edges: 708


In [36]:
# Install the haversine library if it's not already installed
# This library is used to calculate distances between lat/lon points.
!pip3 install haversine


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [None]:
import pandas as pd
from tqdm import tqdm
from haversine import haversine, Unit

# Load subway stop coordinates from the GTFS data
# This file contains the latitude and longitude for each station.
stops_df = pd.read_csv('data/gtfs_subway/stops.txt')

# --- THIS IS THE CORRECTED LOGIC ---
# Create a lookup dictionary for station coordinates.
# The graph's nodes are station names, so we must key the dictionary by 'stop_name'.
station_coords = stops_df.set_index('stop_name')[['stop_lat', 'stop_lon']].T.to_dict()

# Add 'distance' attribute (in km) to each edge in the graph
for u, v in tqdm(G_weights.edges(), desc="Calculating edge distances"):
    # The graph nodes u and v are the station names.
    if u in station_coords and v in station_coords:
        point1 = (station_coords[u]['stop_lat'], station_coords[u]['stop_lon'])
        point2 = (station_coords[v]['stop_lat'], station_coords[v]['stop_lon'])
        
        distance = haversine(point1, point2, unit=Unit.KILOMETERS)
        G_weights.edges[u, v]['distance'] = distance

print("\\nFinished adding distance attributes to graph edges.")

# Verify an edge has the distance attribute
try:
    # Find an edge that should now have a distance
    edge_with_distance = None
    for u, v, data in G_weights.edges(data=True):
        if 'distance' in data:
            edge_with_distance = (u, v)
            break
    
    if edge_with_distance:
        print(f"Verification successful! Example edge {edge_with_distance} has distance: {G_weights.edges[edge_with_distance]['distance']:.4f} km")
    else:
        print("Warning: Could not find any edges with the 'distance' attribute after processing.")

except Exception as e:
    print(f"Could not verify edge distance: {e}")

Calculating edge distances: 100%|██████████| 708/708 [00:00<00:00, 1930797.94it/s]

\nFinished adding distance attributes to graph edges.





In [39]:
# --- Advanced Debugging Cell ---
# Let's find out why the labels aren't matching.

# 1. Inspect the data of the first 5 nodes in the graph
print("--- Graph Node Data Inspection ---")
try:
    node_sample = list(G_weights.nodes(data=True))[:5]
    for i, node_data in enumerate(node_sample):
        print(f"Node {i}: {node_data}")
except Exception as e:
    print(f"Could not inspect graph nodes: {e}")

# 2. Inspect the first 5 keys from the coordinate dictionary
print("\\n--- Coordinate Dictionary Key Inspection ---")
try:
    coord_keys_sample = list(station_coords.keys())[:5]
    print(f"Coord Keys: {coord_keys_sample}")
except Exception as e:
    print(f"Could not inspect coordinate keys: {e}")

# 3. Try to manually find a match
print("\\n--- Manual Match Attempt ---")
try:
    # Get the label of the first node from the graph
    first_node_id, first_node_data = node_sample[0]
    first_node_label = first_node_data.get('label')
    print(f"Attempting to match graph node label: '{first_node_label}' (Type: {type(first_node_label)})")
    
    # Check if this label exists in the coordinate dictionary keys
    if first_node_label in station_coords:
        print("SUCCESS: A match was found!")
    else:
        print("FAILURE: The label was NOT found in the coordinate keys.")
        # Let's check if there's a type mismatch, e.g., string vs. integer
        if str(first_node_label) in station_coords:
            print("HINT: A match was found after converting the label to a STRING.")
        else:
            print("HINT: No match found even after converting to string. The labels may be fundamentally different.")

except Exception as e:
    print(f"An error occurred during the manual match attempt: {e}")

--- Graph Node Data Inspection ---
Node 0: ('Van Cortlandt Park-242 St', {})
Node 1: ('238 St', {})
Node 2: ('231 St', {})
Node 3: ('Marble Hill-225 St', {})
Node 4: ('215 St', {})
\n--- Coordinate Dictionary Key Inspection ---
Coord Keys: ['101', '101N', '101S', '103', '103N']
\n--- Manual Match Attempt ---
Attempting to match graph node label: 'None' (Type: <class 'NoneType'>)
FAILURE: The label was NOT found in the coordinate keys.
HINT: No match found even after converting to string. The labels may be fundamentally different.


In [38]:
# Save the weighted graph to a new GML file
# This file can now be used directly in analysis notebooks.
output_path = 'data/subway_graph_weekday_weekend_weights.gml'
nx.write_gml(G_weights, output_path)

print(f"Weighted graph saved successfully to {output_path}")

Weighted graph saved successfully to data/subway_graph_weekday_weekend_weights.gml
