<a href="https://colab.research.google.com/github/RizkyWidodo-project/EJ-EONC_Project/blob/main/R5py_Centroid_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting up the Environment (Installing R5py and Dependencies)


In [None]:
# Install Java Development Kit (JDK) - Version 21
!sudo apt-get update -qq
!sudo apt-get install -y openjdk-21-jdk-headless -qq > /dev/null
print("OpenJDK 21 installed.")

# Set JAVA_HOME environment variable to Java 21
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
print(f"JAVA_HOME set to: {os.environ['JAVA_HOME']}")

# Verify Java version
!java -version

# Install r5py and other necessary libraries
!pip install pandas geopandas r5py
print("r5py, pandas, and geopandas installed.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
OpenJDK 21 installed.
JAVA_HOME set to: /usr/lib/jvm/java-21-openjdk-amd64
openjdk version "21.0.7" 2025-04-15
OpenJDK Runtime Environment (build 21.0.7+6-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 21.0.7+6-Ubuntu-0ubuntu122.04, mixed mode, sharing)
Collecting r5py
  Downloading r5py-1.0.5-py3-none-any.whl.metadata (10.0 kB)
Collecting ConfigArgPars


---
# 2. Writing the R5py Code for Assignment

## 2.1. Import Libraries and Load Data

In [None]:
from google.colab import drive
import os
import pandas as pd
import geopandas as gpd
from r5py import TransportNetwork, TravelTimeMatrixComputer, TransportMode
from datetime import datetime

# Mount Google Drive so the input files are reachable from the Colab runtime
drive.mount('/content/drive')

# --- Define base path to your Google Drive folder ---
# "file_path" is a placeholder: set it to the Drive folder that contains the input files.
gdrive_base_path = "file_path"

# --- Define file names and full paths ---
centroids_gpkg_file = os.path.join(gdrive_base_path, "centroids.gpkg")
facilities_csv_file = os.path.join(gdrive_base_path, "EONC Facility.csv")
osm_file = os.path.join(gdrive_base_path, "east java.osm.pbf")
output_csv_file = os.path.join(gdrive_base_path, "output.csv")

# --- Define actual column names ---
centroid_id_col = 'cell_id'
centroid_lat_col = 'centroids_lat'
centroid_lon_col = 'centroids_lng'

facility_id_col = 'facility_id'
facility_lat_col = 'Latitude'
facility_lon_col = 'Longitude'

# Load centroids
# The centroid input defined above is a GeoPackage, so it is read with geopandas.
# (The previous code called pd.read_csv(centroids_csv_file); that name is never defined,
# so the cell raised NameError on a fresh kernel.)
print(f"\nLoading centroids from: {centroids_gpkg_file}")
centroids_layer = gpd.read_file(centroids_gpkg_file)

# If the layer does not carry the lat/lon attribute columns, derive them from its geometry.
# NOTE(review): assumes the layer holds point geometries - confirm against the actual file.
missing_coord_cols = [
    col for col in (centroid_lat_col, centroid_lon_col) if col not in centroids_layer.columns
]
if missing_coord_cols:
    centroid_points = centroids_layer.geometry
    if centroid_points.crs is not None:
        # Coordinates are used below as EPSG:4326 longitude/latitude
        centroid_points = centroid_points.to_crs("EPSG:4326")
    centroids_layer[centroid_lat_col] = centroid_points.y
    centroids_layer[centroid_lon_col] = centroid_points.x

# Keep a plain attribute table (without the geometry column); later cells merge and export it.
centroids_df = pd.DataFrame(centroids_layer.drop(columns=[centroids_layer.geometry.name]))
print(f"--- First 5 rows of Centroids ({centroids_gpkg_file}): ---")
print(centroids_df.head())
print("\n--- Info for Centroids: ---")
centroids_df.info()

# Convert centroids_df to GeoDataFrame
# Ensure the ID column is suitable for R5py (string or int, unique)
centroids_df[centroid_id_col] = centroids_df[centroid_id_col].astype(str)
centroids_gdf = gpd.GeoDataFrame(
    centroids_df,
    geometry=gpd.points_from_xy(centroids_df[centroid_lon_col], centroids_df[centroid_lat_col]),
    crs="EPSG:4326"
)
# Rename the ID column to 'id' for r5py compatibility
if centroid_id_col != 'id':
    centroids_gdf = centroids_gdf.rename(columns={centroid_id_col: 'id'})
else:
    centroids_gdf['id'] = centroids_gdf['id'].astype(str) # Ensure 'id' column exists and is string

print(f"\nLoaded and processed {len(centroids_gdf)} centroids.")
print(centroids_gdf[['id', 'geometry']].head())


# Load healthcare facilities
print(f"\nLoading facilities from: {facilities_csv_file}")
facilities_df = pd.read_csv(facilities_csv_file)
print(f"\n--- First 5 rows of Facilities CSV: ---")
print(facilities_df.head())
print("\n--- Info for Facilities CSV: ---")
facilities_df.info()

# Convert facilities_df to GeoDataFrame, keyed by the column named in facility_id_col
facilities_df[facility_id_col] = facilities_df[facility_id_col].astype(str) # Ensure it's string
facilities_gdf = gpd.GeoDataFrame(
    facilities_df,
    geometry=gpd.points_from_xy(facilities_df[facility_lon_col], facilities_df[facility_lat_col]),
    crs="EPSG:4326"
)
# Rename the ID column to 'id' for r5py compatibility
if facility_id_col != 'id':
    facilities_gdf = facilities_gdf.rename(columns={facility_id_col: 'id'})
else:
    facilities_gdf['id'] = facilities_gdf['id'].astype(str)

print(f"\nLoaded and processed {len(facilities_gdf)} healthcare facilities.")
print(facilities_gdf[['id', 'geometry']].head()) # Check the new 'id' column

print(f"\nUsing OSM file: {osm_file}. Make sure this is the correct name of your uploaded .pbf file.")

In [None]:
def _report_duplicate_ids(frame, frame_label):
    """Print a duplicate-ID report for ``frame['id']``.

    Args:
        frame: table that has an 'id' column.
        frame_label: name used for the table in the printed messages.

    Returns:
        The occurrence counts of the IDs that appear more than once
        (empty when every ID is unique).
    """
    print(f"\n--- Checking for duplicate IDs in {frame_label} ---")
    occurrences = frame['id'].value_counts()
    repeated = occurrences[occurrences > 1]

    if repeated.empty:
        print(f"No duplicate IDs found in {frame_label}. 'id' column is unique.")
        return repeated

    print(f"Found duplicate IDs in {frame_label}:")
    print(repeated)
    print("\nExample rows with one of the duplicate IDs (first duplicate found):")
    # Show every row that carries the first repeated ID
    print(frame[frame['id'] == repeated.index[0]])
    return repeated


duplicate_centroid_ids = _report_duplicate_ids(centroids_gdf, "centroids_gdf")
duplicate_facility_ids = _report_duplicate_ids(facilities_gdf, "facilities_gdf")


--- Checking for duplicate IDs in centroids_gdf ---
No duplicate IDs found in centroids_gdf. 'id' column is unique.

--- Checking for duplicate IDs in facilities_gdf ---
No duplicate IDs found in facilities_gdf. 'id' column is unique.


## 2.2. Initialize the Transport Network

In [None]:
# Build the transport network from the OSM file.
# Only an OSM extract is supplied here; no GTFS feeds are passed to TransportNetwork.
# Fail fast with a readable message when the extract is missing, instead of surfacing
# whatever error the network builder raises for an unreadable path.
if not os.path.isfile(osm_file):
    raise FileNotFoundError(
        f"OSM file not found: {osm_file}. Check gdrive_base_path and the .pbf file name."
    )

print(f"\nBuilding transport network from {osm_file}...")
transport_network = TransportNetwork(osm_pbf=osm_file)
print("Transport network built successfully!")


Building transport network from /content/drive/My Drive/MAP5010 Research Project/Assign Centroid/east java.osm.pbf...
Transport network built successfully!


## 2.3. Compute Travel Times (for R5py to Determine "Nearest")

In [None]:
# In Step 5.3 - TEMPORARY DIAGNOSTIC TEST

from r5py import TravelTimeMatrix
from datetime import datetime
import pandas as pd # Just in case it's needed for other parts you might add

# Departure moment used for the routing request
departure_datetime = datetime(year=2025, month=5, day=27, hour=10, minute=0, second=0)

print("--- DIAGNOSTIC TEST: Initializing TravelTimeMatrix with only 10 origins ---")
try:
    # Small smoke test: only the first 10 centroids act as origins
    test_origins = centroids_gdf[['id', 'geometry']].iloc[:10]
    all_transport_modes = [
        TransportMode.WALK,
        TransportMode.BICYCLE,
        TransportMode.TRANSIT,
        TransportMode.CAR,
    ]
    travel_time_matrix_test = TravelTimeMatrix(
        transport_network,
        origins=test_origins,
        # every facility remains a destination in this test
        destinations=facilities_gdf[['id', 'geometry']],
        departure=departure_datetime,
        transport_modes=all_transport_modes,
    )
except Exception as diagnostic_error:
    # Report the failure and let the notebook continue
    print(f"ERROR during diagnostic test: {diagnostic_error}")
else:
    print("SUCCESS: TravelTimeMatrix initialized QUICKLY with 10 origins.")

print("--- End of DIAGNOSTIC TEST ---")

--- DIAGNOSTIC TEST: Initializing TravelTimeMatrix with only 10 origins ---




SUCCESS: TravelTimeMatrix initialized QUICKLY with 10 origins.
--- End of DIAGNOSTIC TEST ---


In [None]:
# Imported here so this cell does not depend on the diagnostic cell having been run.
from r5py import TravelTimeMatrix

# Define departure time (R5py requires a specific datetime)
departure_datetime = datetime(2025, 5, 27, 10, 0, 0) # Year, Month, Day, Hour, Minute, Second

# Transport modes passed to the router.
# NOTE(review): an earlier comment labelled this "Driving mode", but all four modes are
# passed. If a car-only assignment is intended, use [TransportMode.CAR] instead - confirm.
all_transport_modes = [
    TransportMode.WALK,
    TransportMode.BICYCLE,
    TransportMode.TRANSIT,
    TransportMode.CAR
]

# Compute the travel times (R5py uses these to determine 'nearest').
# TravelTimeMatrix replaces the deprecated TravelTimeMatrixComputer(...).compute_travel_times()
# pair: constructing it runs the computation and the object itself is the result table.
# It's crucial that centroids_gdf and facilities_gdf have an 'id' column and a 'geometry' column.
print("Computing travel time matrix for assignment... (this may take a while)")
r5py_travel_times_df = TravelTimeMatrix(
    transport_network,
    origins=centroids_gdf[['id', 'geometry']],       # Uses the processed GeoDataFrames
    destinations=facilities_gdf[['id', 'geometry']], # Uses the processed GeoDataFrames
    departure=departure_datetime,
    transport_modes=all_transport_modes
)

if r5py_travel_times_df.empty:
    print("WARNING: The computed travel time matrix is empty!")
    print("Please check:")
    print("1. OSM file coverage: Does your OSM map cover the area of your centroids and facilities?")
    print("2. Coordinate systems: Are centroids and facilities correctly located (check latitudes/longitudes)?")
    print("3. R5py errors during network build or computation.")
else:
    print("R5py travel time matrix computed!")
    print(r5py_travel_times_df.head())
    # The first rows can all be NaN, so also report how many pairs have no travel time at all.
    unreachable_pairs = int(r5py_travel_times_df['travel_time'].isna().sum())
    print(f"{unreachable_pairs} of {len(r5py_travel_times_df)} origin-destination pairs have no travel time (NaN).")

  travel_time_computer = TravelTimeMatrixComputer(



TravelTimeMatrixComputer initialized.
Computing travel time matrix for assignment... (this may take a while)
R5py travel time matrix computed!
  from_id to_id  travel_time
0  150558     0          NaN
1  150558     1          NaN
2  150558     2          NaN
3  150558     3          NaN
4  150558     4          NaN


## 2.4. Assign Centroids to the Nearest Facility (based on R5py's calculation)

In [None]:
# Assign every centroid to the facility with the smallest R5py network travel time
assigned_centroids_df = pd.DataFrame()  # stays empty unless an assignment is produced below

if r5py_travel_times_df.empty:
    print("Travel time matrix was empty. Cannot perform assignment. Please check previous steps.")
else:
    # Rows without a travel time (NaN) cannot be ranked, so they are removed first
    r5py_travel_times_df_cleaned = r5py_travel_times_df.dropna(subset=['travel_time'])

    if r5py_travel_times_df_cleaned.empty:
        print("After removing NaNs, the travel time matrix is empty. No valid routes found for any origin-destination pair.")
    else:
        r5py_travel_times_df_sorted = r5py_travel_times_df_cleaned.sort_values(by=['from_id', 'travel_time'])

        # Index label of the minimum-travel-time row within each origin group
        fastest_row_labels = r5py_travel_times_df_sorted.groupby('from_id')['travel_time'].idxmin()

        assignment_column_names = {
            'from_id': 'centroid_id_r5py',  # temporary name, avoids clashing with an original centroid column
            'to_id': 'nearest_facility_id',
            'travel_time': 'r5py_travel_time_for_assignment (min)',
        }
        nearest_facility_assignment_r5py = (
            r5py_travel_times_df_sorted
            .loc[fastest_row_labels]
            .rename(columns=assignment_column_names)
        )

        print("\nAssignment of centroids to nearest facility (based on R5py):")
        print(nearest_facility_assignment_r5py.head())

        # --- Merge with original centroid data ---
        # The merge key is cast to string on a copy of the centroid table
        centroids_df_for_merge = centroids_df.assign(
            **{centroid_id_col: centroids_df[centroid_id_col].astype(str)}
        )

        # Left join: every centroid row is kept, with or without an assigned facility
        assigned_centroids_df = pd.merge(
            centroids_df_for_merge,
            nearest_facility_assignment_r5py,
            how='left',
            left_on=centroid_id_col,       # original ID column from centroids_df, e.g. 'cell_id'
            right_on='centroid_id_r5py',   # the renamed 'from_id' of the travel time table
        )

        # The temporary join key is dropped unless it doubles as the centroid ID column
        helper_key_present = 'centroid_id_r5py' in assigned_centroids_df.columns
        if helper_key_present and centroid_id_col != 'centroid_id_r5py':
            assigned_centroids_df = assigned_centroids_df.drop(columns=['centroid_id_r5py'])

        print("\nFinal assignment data merged with original centroid details:")

        # Columns expected in the preview: original centroid columns + new assignment columns
        expected_display_cols = [
            centroid_id_col,
            centroid_lat_col,
            centroid_lon_col,
            'nearest_facility_id',
            'r5py_travel_time_for_assignment (min)',
        ]

        # Only columns that really exist are kept, which avoids a KeyError
        actual_display_cols = list(
            filter(lambda col: col in assigned_centroids_df.columns, expected_display_cols)
        )

        if assigned_centroids_df.empty:
            print("assigned_centroids_df is empty after merge. Check merge keys and logic.")
        else:
            display_cols = [col for col in actual_display_cols if col in assigned_centroids_df.columns]
            if not display_cols:
                print("Could not find expected display columns in the merged DataFrame.")
                print("Columns available:", assigned_centroids_df.columns.tolist())
            else:
                print(assigned_centroids_df[display_cols].head())


Assignment of centroids to nearest facility (based on R5py):
      centroid_id_r5py nearest_facility_id  \
44610           100110                 280   
6061            102603                 264   
6402            103018                 264   
12025           104279                  90   
12366           104694                  90   

       r5py_travel_time_for_assignment (min)  
44610                                   62.0  
6061                                    44.0  
6402                                    44.0  
12025                                   31.0  
12366                                   45.0  

Final assignment data merged with original centroid details:
  cell_id  centroids_lat  centroids_lng nearest_facility_id  \
0  150558      -8.480965     113.714199                  25   
1  150974      -8.476465     113.721993                  25   
2  151805      -8.467465     113.737582                  25   
3  152218      -8.444965     113.745376                  25   
4 

# 3. Saving Your Assignment Results

In [None]:
# Save the assignment results to a CSV file.
# The file name is relative, so the file is written to the notebook's working directory.
output_filename = "ASSIGNMENT_R5py_centroids_to_facilities.csv"

if not assigned_centroids_df.empty:
    # Columns wanted in the CSV, in the order they should appear:
    # original centroid ID and lat/lon, the assigned facility ID, and (for reference)
    # the travel time the assignment was based on.
    preferred_columns = [
        centroid_id_col,
        centroid_lat_col,
        centroid_lon_col,
        'nearest_facility_id',
        'r5py_travel_time_for_assignment (min)',
    ]

    # dict.fromkeys removes repeated names (e.g. if two of the *_col settings coincide)
    # while keeping first-seen order. The previous sorted(set(...)) de-duplication
    # reordered the CSV columns alphabetically.
    columns_to_save = [
        col for col in dict.fromkeys(preferred_columns)
        if col in assigned_centroids_df.columns
    ]

    if columns_to_save: # if list is not empty
        output_df = assigned_centroids_df[columns_to_save]
        output_df.to_csv(output_filename, index=False)
        print(f"\nAssignment results saved to {output_filename}")
        print(f"Columns saved: {output_df.columns.tolist()}")
        print("This file contains original centroid data merged with its 'nearest_facility_id' and R5py's assignment travel time.")
        print("You can now use these pairs (centroid and its assigned nearest_facility_id) with the Google Maps API.")
    else:
        print("\nCould not determine columns to save. Saving all columns from assigned_centroids_df.")
        assigned_centroids_df.to_csv(output_filename, index=False)
        print(f"\nAssignment results (all columns) saved to {output_filename}")

else:
    print("\nNo assignment data to save as the result DataFrame was empty.")

# You can download this file from the Colab "Files" tab on the left (refresh if needed).


Assignment results saved to ASSIGNMENT_R5py_centroids_to_facilities.csv
Columns saved: ['cell_id', 'centroids_lat', 'centroids_lng', 'nearest_facility_id', 'r5py_travel_time_for_assignment (min)']
This file contains original centroid data merged with its 'nearest_facility_id' and R5py's assignment travel time.
You can now use these pairs (centroid and its assigned nearest_facility_id) with the Google Maps API.


In [None]:
import pandas as pd
import geopandas as gpd

# Columns read from facilities_gdf
FACILITIES_GDF_ID_COL = 'id'
FACILITIES_GDF_NAME_COL = 'nama_fasilitas'  # optional: names are omitted when this column is absent
FACILITIES_GDF_GEOMETRY_COL = 'geometry'

# Columns added to assigned_centroids_df
output_facility_name_col = 'nearest_facility_name'
output_facility_lat_col = 'nearest_facility_latitude'
output_facility_lon_col = 'nearest_facility_longitude'

# --- Attach the assigned facility's coordinates (and name, when available) to each centroid ---
# Every skipped path prints its reason. Previously these paths were silent (`else: pass`),
# so the "with_coords" file could be written without any facility columns.
if assigned_centroids_df.empty:
    print("Skipping facility details: assigned_centroids_df is empty.")
elif 'nearest_facility_id' not in assigned_centroids_df.columns:
    print("Skipping facility details: 'nearest_facility_id' is missing from assigned_centroids_df.")
else:
    missing_facility_cols = [
        col for col in (FACILITIES_GDF_ID_COL, FACILITIES_GDF_GEOMETRY_COL)
        if col not in facilities_gdf.columns
    ]
    if missing_facility_cols:
        print(f"Skipping facility details: facilities_gdf lacks required column(s) {missing_facility_cols}.")
    else:
        # Lookup table keyed directly by 'nearest_facility_id', so the merge adds no
        # extra key column that would have to be dropped afterwards.
        facility_details = {'nearest_facility_id': facilities_gdf[FACILITIES_GDF_ID_COL]}
        if FACILITIES_GDF_NAME_COL in facilities_gdf.columns:
            facility_details[output_facility_name_col] = facilities_gdf[FACILITIES_GDF_NAME_COL]
        else:
            print(f"Note: facilities_gdf has no '{FACILITIES_GDF_NAME_COL}' column; facility names are omitted.")
            print("Columns available:", facilities_gdf.columns.tolist())
        facility_details[output_facility_lat_col] = facilities_gdf[FACILITIES_GDF_GEOMETRY_COL].y
        facility_details[output_facility_lon_col] = facilities_gdf[FACILITIES_GDF_GEOMETRY_COL].x
        facility_details_to_merge = pd.DataFrame(facility_details)

        # Remove the output columns left by an earlier run of this cell, so that running
        # it again does not create suffixed duplicates (_x / _y).
        previous_output_cols = [output_facility_name_col, output_facility_lat_col, output_facility_lon_col]
        assigned_centroids_df = (
            assigned_centroids_df
            .drop(columns=previous_output_cols, errors='ignore')
            .merge(facility_details_to_merge, on='nearest_facility_id', how='left')
        )

output_filename = "ASSIGNMENT_R5py_centroids_to_facilities_with_coords.csv"

if not assigned_centroids_df.empty:
    # Columns wanted in the CSV, in the order they should appear
    preferred_columns = [
        centroid_id_col,
        centroid_lat_col,
        centroid_lon_col,
        'nearest_facility_id',
        output_facility_name_col,
        output_facility_lat_col,
        output_facility_lon_col,
        'r5py_travel_time_for_assignment (min)',
    ]

    # Order-preserving de-duplication, restricted to columns that actually exist.
    # The previous sorted(set(...)) reordered the columns alphabetically.
    columns_to_save = [
        col for col in dict.fromkeys(preferred_columns)
        if col in assigned_centroids_df.columns
    ]

    if columns_to_save:
        output_df = assigned_centroids_df[columns_to_save]
        output_df.to_csv(output_filename, index=False)
        print(f"\nAssignment results saved to {output_filename}")
        print(f"Columns saved: {output_df.columns.tolist()}")
    else:
        # None of the expected columns exist: fall back to writing everything
        assigned_centroids_df.to_csv(output_filename, index=False)
        print(f"\nAssignment results (all columns) saved to {output_filename}")
else:
    print("\nNo assignment data to save as the result DataFrame was empty.")


Assignment results saved to ASSIGNMENT_R5py_centroids_to_facilities_with_coords.csv
Columns saved: ['cell_id', 'centroids_lat', 'centroids_lng', 'nearest_facility_id', 'r5py_travel_time_for_assignment (min)']


In [None]:
# Build the final report: every assigned centroid joined with all columns of its facility
if assigned_centroids_df.empty:
    print("Assigned_centroids_df is empty. Cannot create the final report.")
else:
    # Facility columns get an "HF_" prefix, applied to a renamed copy of facilities_df
    facilities_df_prefixed = facilities_df.rename(columns=lambda col: "HF_" + col)

    # The facility ID column carries the same prefix and serves as the right-hand join key
    prefixed_facility_join_key = "HF_" + facility_id_col

    # Left join keeps every row of assigned_centroids_df; 'nearest_facility_id' is matched
    # against the prefixed facility ID column.
    final_report_df = assigned_centroids_df.merge(
        facilities_df_prefixed,
        how='left',
        left_on='nearest_facility_id',
        right_on=prefixed_facility_join_key,
    )

    # Single output file, written under the Drive base folder
    output_filepath = os.path.join(gdrive_base_path, "FINAL_ASSIGNMENT_REPORT.csv")
    final_report_df.to_csv(output_filepath, index=False)
    print(f"Final assignment report saved to {output_filepath}")

Final assignment report saved to /content/drive/My Drive/MAP5010 Research Project/Assign Centroid/FINAL_ASSIGNMENT_REPORT.csv
