# Using OSRM to determine the driving distance

In [1]:
import time
import numpy as np
import requests
import pandas as pd
import os

In [2]:
def distance(lat1, lon1, lat2, lon2):

    """
    Calculate the approximate distance between two sets of latitude and longitude coordinates using a simplified method.

    Parameters:
    - lat1, lon1: Latitude and longitude of the first location.
    - lat2, lon2: Latitude and longitude of the second location.

    Returns:
    - float: The approximate distance between the two locations in kilometers.
    """
    
    lat_diff = (lat2 - lat1) * 111

    # 1 degree of longitude varies, we use an approximation based on the average latitude
    avg_lat = np.radians((lat1 + lat2) / 2.0)  # Convert average latitude to radians
    lon_diff = (lon2 - lon1) * 111 * np.cos(avg_lat)

    # Calculate the distance using the Pythagorean theorem on the lat_diff and lon_diff
    return np.sqrt(lat_diff**2 + lon_diff**2)

In [3]:
def get_driving_distance(lat1, lon1, lat2, lon2, timeout=10):

    """
    Calculate the driving distance between two sets of latitude and longitude coordinates.

    Parameters:
    - lat1: Latitude of the starting location.
    - lon1: Longitude of the starting location.
    - lat2: Latitude of the destination location.
    - lon2: Longitude of the destination location.
    - timeout: Value passed to requests.get as its timeout, in seconds (default 10).

    Returns:
    - distance_kilometers: The driving distance in kilometers, or NaN
      (Not-a-Number) if no route is found or an error occurs.

    This function uses the OSRM (Open Source Routing Machine) service to calculate the driving
    distance between two sets of coordinates. It sends a request to the OSRM API and retrieves
    the distance in meters, which is then converted to kilometers and returned. Network
    failures, timeouts and responses that are not valid JSON are treated as "no route" and
    yield NaN rather than raising.
    """

    base_url = "http://router.project-osrm.org/route/v1/driving/"
    coordinates = f"{lon1},{lat1};{lon2},{lat2}"  # OSRM uses (longitude,latitude)
    url = base_url + coordinates + "?overview=false"

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that is not valid JSON (e.g. an HTML error page).
        return np.nan

    # Guard against a JSON payload that is not an object before looking up 'routes'.
    routes = data.get('routes') if isinstance(data, dict) else None
    if not routes:
        return np.nan  # Return NaN if no route is found or an error occurs

    # Extract and return the driving distance (in meters) converted to kilometers
    distance_meters = routes[0]['distance']
    distance_kilometers = distance_meters / 1000
    return distance_kilometers


In [4]:
def find_nearest_hospital(house_lat, house_lon, hospital, n_candidates=3, delay=0.2):
    """
    Find the minimum driving distance from a house to its nearest hospital.

    Parameters:
    - house_lat: Latitude of the house's location.
    - house_lon: Longitude of the house's location.
    - hospital: A DataFrame containing hospital information with 'latitude' and 'longitude' columns.
    - n_candidates: How many of the straight-line-nearest hospitals are checked
      by driving distance (default 3).
    - delay: Seconds to sleep after the driving-distance lookups (default 0.2).

    Returns:
    - min_driving_distance: The smallest driving distance among the candidate
      hospitals, or NaN when `hospital` is empty or no candidate has a route.

    The function first ranks every hospital by the approximate straight-line distance
    returned by `distance`, then asks `get_driving_distance` for the driving distance to
    only the `n_candidates` closest ones, and returns the smallest of those. The result
    is also printed.
    """

    # Approximate straight-line distance to every hospital; this needs no
    # network call, so it is used to shortlist candidates.
    distances = [
        (hospital_lat, hospital_lon,
         distance(house_lat, house_lon, hospital_lat, hospital_lon))
        for hospital_lat, hospital_lon in zip(hospital['latitude'], hospital['longitude'])
    ]

    # Keep only the closest candidates by straight-line distance
    nearest_hospitals = sorted(distances, key=lambda x: x[2])[:n_candidates]

    # Driving distance to each shortlisted hospital
    driving_distances = [
        get_driving_distance(house_lat, house_lon, hospital_lat, hospital_lon)
        for hospital_lat, hospital_lon, _ in nearest_hospitals
    ]

    # get_driving_distance reports "no route" as NaN. Comparisons with NaN are
    # always False, so NaN values are dropped explicitly before taking the
    # minimum, and NaN (not infinity) is returned when nothing is routable.
    routable = [d for d in driving_distances if not np.isnan(d)]
    min_driving_distance = min(routable) if routable else np.nan

    # Print the minimum driving distance and introduce a small delay
    print(min_driving_distance)
    time.sleep(delay)

    return min_driving_distance


In [5]:
# Create the folder that holds the closest-driving-distance-to-hospital output.
# exist_ok=True keeps this safe to re-run and avoids the race between checking
# for the folder and creating it.
relative_path = '../data/raw/hospital_distance/'
os.makedirs(relative_path, exist_ok=True)

# Apply OSRM function to properties

In [None]:
# Hospital table passed to find_nearest_hospital for every input row
hospital = pd.read_csv('../data/landing/all_hospitals_in_victoria.csv')

# Input and output paths
input_path = '../data/raw/domain_outliers_removed.csv'
output_dir = '../data/raw/hospital_distance/'
os.makedirs(output_dir, exist_ok=True)

# Batch processing parameters
batch_size = 100
# Count the data rows from the file itself (reading a single column keeps this
# cheap) so the loop cannot drift out of sync with the input.
total_rows = len(pd.read_csv(input_path, usecols=[0]))

# Iterate over the input data in batches. Batches whose output file already
# exists are skipped, so an interrupted run resumes where it stopped and a
# fresh run processes every row.
for batch_start in range(0, total_rows, batch_size):
    # Determine the batch number for naming the output file
    batch_number = (batch_start // batch_size) + 1

    # Define the output file path. The upper bound in the name is the nominal
    # batch end, so the final file name may overstate the last row number.
    output_path = f"{output_dir}{batch_number}_{batch_start + 1}-{batch_start + batch_size}.csv"

    if os.path.exists(output_path):
        continue

    # Read a batch of data from the input file: keep the header (line 0) and
    # skip the data rows that belong to earlier batches.
    df = pd.read_csv(input_path, skiprows=range(1, batch_start + 1), nrows=batch_size)

    # Calculate the nearest hospital distance for each row in the batch
    df['NearestHospitalDistance'] = df.apply(
        lambda row: find_nearest_hospital(row['Latitude'], row['Longitude'], hospital),
        axis=1
    )

    # Save the batch to a temporary file and rename it into place, so an
    # interrupted write never leaves a partial file that looks like a
    # finished batch.
    temp_path = output_path + '.tmp'
    df.to_csv(temp_path, index=False)
    os.replace(temp_path, output_path)
