In [18]:
import pandas as pd
import math
import numpy as np
import networkx as nx
import my_nx as my_nx
import matplotlib.pyplot as plt
from datetime import datetime,time, timedelta
import pickle
from process_transfers import get_merged_stops, get_merged_stops_names

In [19]:
# Load the three input tables used by the rest of the notebook:
# trips (already filtered), stop_times and stops.
# NOTE(review): base_path is an absolute, machine-specific Windows path, so this
# cell only runs where that folder exists — consider making it configurable.
base_path = r'C:\Users\baodu\Dropbox\Summer_research_2024\google_transit_subway'
trips = pd.read_csv(f'{base_path}\\filtered_trips.txt')
stop_times = pd.read_csv(f'{base_path}\\stop_times.txt')
stops = pd.read_csv(f'{base_path}\\stops.txt')

In [20]:
# Combine stop_times, trips and stops into a single table, one row per
# (trip, stop visit), ordered by trip and then by position within the trip.
stop_details = (
    stop_times
    .merge(trips, on='trip_id')
    .merge(stops, on='stop_id')
    .sort_values(by=['trip_id', 'stop_sequence'])
)

In [21]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        float: distance in the unit of the sphere radius used below
        (6371000, i.e. metres for the mean Earth radius).
    """
    sphere_radius = 6371000

    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2

    # Haversine term: squared half-chord length between the two points.
    chord_sq = (
        math.sin(half_dlat)**2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_dlon)**2
    )
    central_angle = 2 * math.atan2(math.sqrt(chord_sq), math.sqrt(1 - chord_sq))
    return sphere_radius * central_angle

In [None]:
# Collapse every stop onto its parent station when it has one, e.g. 101N, 101S -> 101
def _station_level_id(row):
    """Return the row's parent_station if it is set, otherwise its own stop_id."""
    if pd.notna(row['parent_station']):
        return row['parent_station']
    return row['stop_id']

stop_details['stop_id'] = stop_details.apply(_station_level_id, axis=1)
stops['stop_id'] = stops.apply(_station_level_id, axis=1)

In [22]:
# store what each stop corresponds to after merging
# stops_mapping: stop_id -> space-joined ids of the cluster that stop belongs to
#   (filled in as a side effect of merge_close_nodes).
stops_mapping = {}
# stop_lst: placeholder only; it is reassigned later to the unique merged stop ids.
stop_lst = []

In [23]:
# cluster stops that are within 200 m
# Start with one node per stop id, carrying its (lat, lon) position and its name.
G = nx.Graph()

for stop_id, stop_lat, stop_lon, stop_name in zip(
    stops['stop_id'], stops['stop_lat'], stops['stop_lon'], stops['stop_name']
):
    G.add_node(stop_id, pos=(stop_lat, stop_lon), name=stop_name)
    
def merge_close_nodes(G, threshold):
    """Group the nodes of ``G`` into clusters of nearby stops.

    Clusters are grown breadth-first: a node joins a cluster when it is closer
    than ``threshold`` to *any* node already in it. Two members of a cluster can
    therefore be farther apart than ``threshold`` if a chain of intermediate
    stops links them.

    Side effect: the module-level ``stops_mapping`` dict is updated so that every
    node maps to the space-joined ids of all members of its cluster.

    Args:
        G: graph whose nodes carry a ``pos`` attribute ``(lat, lon)`` in degrees.
        threshold: distance limit, in the unit returned by ``haversine``.

    Returns:
        list[set]: one set of node ids per cluster, in discovery order.
    """
    # Local import: deque gives O(1) pops from the left, unlike list.pop(0).
    from collections import deque

    nodes = list(G.nodes())

    clusters = []
    visited = set()

    for node in nodes:
        if node in visited:
            continue

        cluster = {node}
        visited.add(node)
        queue = deque([node])

        while queue:
            current = queue.popleft()
            current_pos = G.nodes[current]['pos']
            for neighbor in nodes:
                if neighbor in visited:
                    continue
                neighbor_pos = G.nodes[neighbor]['pos']
                if haversine(current_pos[0], current_pos[1], neighbor_pos[0], neighbor_pos[1]) < threshold:
                    visited.add(neighbor)
                    queue.append(neighbor)
                    cluster.add(neighbor)

        clusters.append(cluster)

    for cluster in clusters:
        # The merged id is the same for every member, so build it once per cluster.
        merged_id = ' '.join(map(str, cluster))
        for node in cluster:
            stops_mapping[node] = merged_id

    return clusters

# Run the clustering; 200 is the distance threshold handed to merge_close_nodes
# (the cell's header comment describes it as 200 m).
clusters = merge_close_nodes(G, 200)
# print(clusters)

In [24]:
# Replace each stop id with the id of its cluster; ids without a cluster entry are kept as-is
def _cluster_id(stop_id):
    """Look up the merged cluster id for a stop, falling back to the stop id itself."""
    return stops_mapping.get(stop_id, stop_id)

stop_details['stop_id'] = stop_details['stop_id'].map(_cluster_id)
stops['stop_id'] = stops['stop_id'].map(_cluster_id)

# Distinct stop ids that remain after merging
stop_lst = stops['stop_id'].unique()

In [25]:
# Collect, for every stop, the routes that serve it (in order of first appearance)
route_per_stop = {}
for row in stop_details.itertuples():
    routes_here = route_per_stop.setdefault(row.stop_id, [])
    if row.route_id not in routes_here:
        routes_here.append(row.route_id)

# Attach the per-stop route list to every row of the merged table
stop_details['routes_per_stop'] = stop_details['stop_id'].map(route_per_stop)

In [26]:
def parse_time(curr_time):
    """Convert an ``HH:MM:SS`` schedule time into a timestamp on a fixed reference day.

    Schedule files may contain hours of 24 or more (e.g. ``25:10:00``) for
    trips that run past midnight of the service day. Every value is expressed as
    an offset from the same reference date (1900-01-01, the date pandas assigns
    when parsing a bare ``%H:%M:%S`` string), so times below 24:00:00 keep the
    value they had before, while times at or past 24:00:00 roll over onto the
    following day(s). This keeps ``arrival - departure`` correct across midnight.

    Args:
        curr_time: time string ``H:MM:SS`` / ``HH:MM:SS`` (hours may exceed 23),
            a missing value, or an already-parsed datetime.

    Returns:
        pandas.Timestamp, or ``pandas.NaT`` for missing input.

    Raises:
        ValueError: if the string is not three colon-separated integers.
    """
    # Already parsed (e.g. the cell was re-run): pass through unchanged.
    if isinstance(curr_time, datetime):
        return pd.Timestamp(curr_time)
    if pd.isna(curr_time):
        return pd.NaT

    h, m, s = map(int, str(curr_time).strip().split(':'))
    # Timedelta carries hours >= 24 over into days instead of discarding them.
    return pd.Timestamp(1900, 1, 1) + pd.Timedelta(hours=h, minutes=m, seconds=s)

# Turn the textual schedule times into timestamps
stop_details['arrival_time'] = stop_details['arrival_time'].apply(parse_time)
stop_details['departure_time'] = stop_details['departure_time'].apply(parse_time)

# Split the rows by service: 'Weekday' versus 'Saturday' / 'Sunday'
is_weekday_service = stop_details['service_id'] == 'Weekday'
is_weekend_service = stop_details['service_id'].isin(['Saturday', 'Sunday'])
stop_details_weekday = stop_details[is_weekday_service]
stop_details_weekends = stop_details[is_weekend_service]

In [27]:
# Function to build the stop-to-stop trip-count and travel-time matrices
def build_matrix(stop_details):
    """Build stop-to-stop matrices from an ordered table of stop visits.

    The table is scanned pairwise; a pair of consecutive rows counts as one
    vehicle movement when the second row's ``stop_sequence`` is exactly one more
    than the first's and, if a ``trip_id`` column is present, both rows belong
    to the same trip.

    Args:
        stop_details: DataFrame ordered by trip and stop sequence, with columns
            ``stop_sequence``, ``stop_id``, ``departure_time`` and
            ``arrival_time`` (datetime-like), and optionally ``trip_id``.

    Returns:
        tuple: ``(trip_count, travel_time)``, both DataFrames indexed and
        labelled by the module-level ``stop_lst``. ``trip_count`` holds the
        number of movements from row stop to column stop. ``travel_time`` holds
        the minutes between departure and arrival of the *last* such movement
        seen (earlier ones are overwritten, not averaged).
    """
    # record how many vehicle trips between every two stops
    trip_count = pd.DataFrame(0, index=stop_lst, columns=stop_lst)

    # record the travel time between every two stops
    travel_time = pd.DataFrame(0.0, index=stop_lst, columns=stop_lst)

    # Pull each column out once; indexing rows with .iloc inside the loop
    # would build a full row Series on every access.
    sequences = stop_details['stop_sequence'].to_numpy()
    stop_ids = stop_details['stop_id'].to_numpy()
    departures = stop_details['departure_time'].tolist()
    arrivals = stop_details['arrival_time'].tolist()
    trip_ids = stop_details['trip_id'].to_numpy() if 'trip_id' in stop_details.columns else None

    for i in range(len(stop_details) - 1):
        # Consecutive sequence numbers mean row i+1 is the stop right after row i ...
        if sequences[i] + 1 != sequences[i + 1]:
            continue
        # ... provided both rows come from the same trip.
        if trip_ids is not None and trip_ids[i] != trip_ids[i + 1]:
            continue

        curr_id = stop_ids[i]
        next_id = stop_ids[i + 1]
        trip_count.loc[curr_id, next_id] += 1
        travel_time.loc[curr_id, next_id] = (arrivals[i + 1] - departures[i]).total_seconds() / 60

    return trip_count, travel_time

# build_matrix(stop_details_weekday)

In [28]:
def compute_edge_labels(trip_count, travel_time):
    """Return ``{(origin, destination): (trips, minutes)}`` for every non-zero trip count.

    Args:
        trip_count: square DataFrame of trip counts, rows = origin, columns = destination.
        travel_time: DataFrame with the same labels holding the travel times.

    Returns:
        dict keyed by ``(row_label, column_label)`` with the matching
        ``(trip_count, travel_time)`` cell values, in row-major order.
    """
    return {
        (origin, destination): (
            trip_count.loc[origin, destination],
            travel_time.loc[origin, destination],
        )
        for origin in trip_count.index
        for destination in trip_count.columns
        if trip_count.loc[origin, destination] != 0
    }

In [29]:
def build_network(clusters, trip_count, travel_time, title):
    """Build, draw and save the directed network of merged stops.

    Nodes are the clusters; each is placed at the centroid of its members and
    labelled with their combined names. A directed edge ``i -> j`` is added when
    ``trip_count`` records at least one trip from ``i`` to ``j`` and the two
    nodes share a route. Self-loops are removed.

    Reads the module-level ``G`` (pre-merge stop graph) and ``route_per_stop``.

    Side effects: opens a matplotlib figure, writes the image to ``title`` and
    the pickled graph to ``f'{title}.pickle'``.

    Args:
        clusters: iterable of sets of node ids of ``G``.
        trip_count: DataFrame of trip counts keyed by merged stop id.
        travel_time: DataFrame of travel times with the same labels.
        title: base name for the saved figure and pickle files.

    Returns:
        networkx.DiGraph: the merged network. Edges carry ``vechicle_trips``
        (spelling kept because it is the stored attribute key), ``travel_time``
        and ``routes``.
    """
    # Create a new graph with merged nodes
    new_G = nx.DiGraph()

    # G is the graph containing all stops before merging
    names = nx.get_node_attributes(G, 'name')

    for cluster in clusters:
        # Calculate centroid; stored as (lon, lat) so that x = longitude when drawn
        lat_sum = sum(G.nodes[node]['pos'][0] for node in cluster)
        lon_sum = sum(G.nodes[node]['pos'][1] for node in cluster)
        centroid = (lon_sum / len(cluster), lat_sum / len(cluster))

        # Combine names
        combined_name = " / ".join(sorted({names[node] for node in cluster}))
        new_node = ' '.join(map(str, cluster))
        # A cluster with no scheduled service has no entry in route_per_stop;
        # give it an empty route list instead of failing with KeyError.
        new_G.add_node(new_node, pos=centroid, name=combined_name, routes=route_per_stop.get(new_node, []))

    # Look the route lists up once rather than rebuilding the dict for every edge.
    node_routes = nx.get_node_attributes(new_G, 'routes')

    for i in trip_count.index:
        for j in trip_count.columns:
            n_trips = trip_count.loc[i, j]
            if n_trips > 0:  # There is a trip from i to j
                # Ensure both stops share at least one common route
                common_routes = set(node_routes[i]).intersection(node_routes[j])
                if common_routes:
                    new_G.add_edge(i, j, vechicle_trips=n_trips, travel_time=travel_time.loc[i, j], routes=list(common_routes))

    new_G.remove_edges_from(nx.selfloop_edges(new_G))

    # Plot the network
    pos = nx.get_node_attributes(new_G, 'pos')
    name = nx.get_node_attributes(new_G, 'name')
    edge_labels = {(u, v): f"{d['vechicle_trips']}, {d['travel_time']}" for u, v, d in new_G.edges(data=True)}

    plt.figure(figsize=(40, 30))
    nx.draw(new_G, pos, node_size=8, node_color='grey', labels=name, font_color='purple', font_size='12', with_labels=True, arrowsize=20, verticalalignment='bottom', connectionstyle='arc3, rad = 0.1')

    my_nx.my_draw_networkx_edge_labels(new_G, pos=pos, edge_labels=edge_labels, font_color='red', font_size=8, rotate=True, rad=0.1)

    # Save before showing: with some backends show() releases the figure, and a
    # savefig() issued afterwards writes an empty canvas.
    plt.savefig(f'{title}', dpi=500)
    plt.show()

    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'{title}.pickle', 'wb') as pickle_file:
        pickle.dump(new_G, pickle_file)
    return new_G

In [30]:
# Choose the matplotlib backend used by the plotting cells that follow.
# NOTE(review): two backends are requested back to back; presumably only the
# last one (qt) remains active — confirm whether the `widget` line is still needed.
%matplotlib widget
%matplotlib qt

In [31]:
# Build the trip-count and travel-time matrices for each service group
weekday_trip_count, weekday_travel_time = build_matrix(stop_details_weekday)
weekend_trip_count, weekend_travel_time = build_matrix(stop_details_weekends)

# Export all four matrices as CSV files
for matrix, csv_name in (
    (weekday_trip_count, 'weekday_trip_count.csv'),
    (weekend_trip_count, 'weekend_trip_count.csv'),
    (weekday_travel_time, 'weekday_travel_time.csv'),
    (weekend_travel_time, 'weekend_travel_time.csv'),
):
    matrix.to_csv(csv_name)

# Edge labels: (trip count, travel time) for every stop pair with at least one trip
weekday_edge_labels = compute_edge_labels(weekday_trip_count, weekday_travel_time)
weekend_edge_labels = compute_edge_labels(weekend_trip_count, weekend_travel_time)

In [32]:
# Build, draw and save one network per service group; the last argument is the
# base file name used by build_network for the figure and the pickle.
weekday_G = build_network(clusters, weekday_trip_count, weekday_travel_time, 'subway_network_weekday')
weekend_G = build_network(clusters, weekend_trip_count, weekend_travel_time, 'subway_network_weekend')
# Print the graph summary for the weekday network.
print(weekday_G)

DiGraph with 435 nodes and 1080 edges
