In [None]:
import pandas as pd
from typing import List
from dotenv import load_dotenv
from geopy.distance import geodesic
from tqdm import tqdm
import polyline

import random
import requests
import sys
import ast
import re


# Load the candidate location tables used as route endpoints. Later cells read
# the 'Name', 'City', 'State', 'Longitude' and 'Latitude' columns from these frames.
hospital_df = pd.read_csv('../destinations/data/Hospitals.csv')
airport_df = pd.read_csv('../destinations/data/Alt_Airports.csv')
landmark_df = pd.read_csv('../destinations/data/Landmarks.csv')
# NOTE(review): this reads Landmarks.csv a second time, so gov_building_df holds the
# same data as landmark_df. Looks like a copy-paste slip -- confirm the intended
# government-buildings file name before relying on this frame.
gov_building_df = pd.read_csv('../destinations/data/Landmarks.csv')

### Function to choose to and fro

In [41]:

# This currently uses rejection sampling to find points that are less than
# 150km apart. If this is used to generate a larger dataset, this needs to
# be refactored to generate pairs on initialization, and/or use a spatial
# database to improve performance.

class ToAndFroGenerator:
    """Draws random, previously unseen pairs of nearby points from weighted dataframes.

    Each call to ``get_to_and_fro`` picks a source dataframe for each endpoint
    (proportionally to ``weights``), then uses rejection sampling to find two
    distinct rows in the same ``State`` that are closer than the distance limit.

    Every dataframe must provide 'State', 'Latitude' and 'Longitude' columns.
    """

    def __init__(self, dataframes: List[pd.DataFrame], weights: List[float] = None, randomSeed=None,
                 max_distance_km: float = 150, max_attempts: int = 10):
        """
        Args:
            dataframes: Source tables to sample endpoints from.
            weights: Relative probability of choosing each dataframe; uniform when omitted.
            randomSeed: Seed for the internal ``random.Random`` (None = nondeterministic).
            max_distance_km: Pairs at or beyond this geodesic distance are rejected.
            max_attempts: Number of sampling tries per ``get_to_and_fro`` call.

        Raises:
            ValueError: If ``dataframes`` is empty, the list lengths differ, or a weight is negative.
        """
        if not dataframes:
            raise ValueError("at least one dataframe is required")

        if weights is not None and len(dataframes) != len(weights):
            raise ValueError("dataframes and weights lists must have the same length")

        # Default to uniform if no distribution is provided
        if weights is None:
            weights = [1] * len(dataframes)

        if any(w < 0 for w in weights):
            raise ValueError("weights must be non-negative")

        # Cumulative upper bounds: dataframe i owns the interval (bounds[i-1], bounds[i]]
        running_total = 0.0
        self.__random_dataframe_upper_bounds = []
        for weight in weights:
            running_total += weight
            self.__random_dataframe_upper_bounds.append(running_total)

        self.__dataframes = dataframes
        # Use the last cumulative bound (not a separate sum(weights)) as the draw limit.
        # The two can differ in the last bit for float weights, which would let a draw
        # land above every bound and select no dataframe at all.
        self.__random_dataframe_max_number = self.__random_dataframe_upper_bounds[-1]
        self.__seen = set()
        self.__rng = random.Random(randomSeed)
        self.__max_distance_km = max_distance_km
        self.__max_attempts = max_attempts

    def _pick_dataframe_index(self, draw):
        """Return the index of the first cumulative bound that is >= ``draw``."""
        for idx, upper_bound in enumerate(self.__random_dataframe_upper_bounds):
            if draw <= upper_bound:
                return idx
        # Unreachable while draws are capped at the last bound; kept as a safe fallback.
        return len(self.__random_dataframe_upper_bounds) - 1

    def get_to_and_fro(self):
        """Sample one new pair of nearby points.

        Returns:
            ``(to_row, from_row)`` -- note the order: the destination row comes
            first. Returns ``(None, None)`` when no acceptable pair was found,
            either because the chosen destination dataframe has no row in the
            sampled state (gives up immediately) or because every attempt was
            rejected (same point, too far apart, or pair already returned).
        """
        # Draw both selectors before resolving them so the RNG sequence is stable.
        from_draw = self.__rng.uniform(0, self.__random_dataframe_max_number)
        to_draw = self.__rng.uniform(0, self.__random_dataframe_max_number)

        from_df = self.__dataframes[self._pick_dataframe_index(from_draw)]
        to_df = self.__dataframes[self._pick_dataframe_index(to_draw)]

        for _ in range(self.__max_attempts):
            # pandas accepts integer seeds in [0, 2**32 - 1]
            from_seed = self.__rng.randint(0, 2**32 - 1)
            to_seed = self.__rng.randint(0, 2**32 - 1)

            # Randomly select the starting datapoint
            from_row = from_df.sample(n=1, random_state=from_seed).iloc[0]

            # Restrict destination candidates to the same state
            filtered = to_df[to_df['State'] == from_row['State']]
            if filtered.empty:
                return (None, None)
            to_row = filtered.sample(n=1, random_state=to_seed).iloc[0]

            # Check to make sure to isn't the same as from
            if to_row.equals(from_row):
                continue

            # Filter out points that are too far apart
            distance = geodesic((from_row['Latitude'], from_row['Longitude']),
                                (to_row['Latitude'], to_row['Longitude'])).km
            if distance >= self.__max_distance_km:
                continue

            # Check against seen pairs for uniqueness (ordered: A->B differs from B->A)
            encoded_tuple = f"{from_row['Longitude']} {from_row['Latitude']} {to_row['Longitude']} {to_row['Latitude']}"
            if encoded_tuple in self.__seen:
                continue

            self.__seen.add(encoded_tuple)
            return (to_row, from_row)

        # No unique pair found within the allowed number of tries
        return (None, None)
    
# Smoke test: draw one pair from the hospital and airport tables with equal weights
obj = ToAndFroGenerator(dataframes=[hospital_df, airport_df], weights=[1, 1])

sample_pair = obj.get_to_and_fro()
print(sample_pair)


(Name         MEDSTAR MONTGOMERY MEDICAL CENTER
City                                     OLNEY
State                                       MD
Longitude                           -77.054703
Latitude                              39.15404
Name: 3613, dtype: object, Name         Baltimore/Washington International Thurgood Ma...
City                                                 BALTIMORE
State                                                       MD
Longitude                                           -76.668297
Latitude                                               39.1754
Name: 22, dtype: object)


In [42]:
# Function for printing to stderr
# This is for running on RC servers

def eprint(*args, **kwargs):
    """Print to standard error, forwarding every argument to the built-in print()."""
    error_stream = sys.stderr
    print(*args, file=error_stream, **kwargs)


def _parse_valhalla_leg(leg):
    """Turn one Valhalla trip leg into (instruction strings, detailed step dicts)."""
    # The leg shape is an encoded polyline with 6 digits of precision; decoded points are (lat, lon)
    decoded_geometry = polyline.decode(leg['shape'], precision=6)

    instructions = []
    detailed_instructions = []

    for step in leg['maneuvers']:
        instructions.append(step['instruction'])

        street_names = step.get('street_names') or []

        detailed_instruction = {
            'instruction': step['instruction'],
            # Only the primary name is referred to in the instruction text
            'name': street_names[0] if street_names else '-',
            # Any remaining names for the same road
            'alt_names': street_names[1:],
            # Valhalla reports length in kilometers (see directions_options); store meters
            'distance': step['length'] * 1000,
            # Duration of the step in seconds
            'duration': step['time'],
            # Numeric maneuver type; tells more about what the instruction is doing
            'type': step['type'],
        }

        # Attach the step's starting coordinate, but only when it can be resolved
        if 'begin_shape_index' in step and decoded_geometry and step['begin_shape_index'] < len(decoded_geometry):
            lat, lon = decoded_geometry[step['begin_shape_index']][:2]
            detailed_instruction['location'] = {'lat': lat, 'lon': lon}

        detailed_instructions.append(detailed_instruction)

    return instructions, detailed_instructions


def get_directions(coords, valhalla_url="http://localhost:8002", timeout=30):
    """Request several alternative driving routes from Valhalla and keep three distinct ones.

    Args:
        coords: Two ``[lon, lat]`` pairs: ``coords[0]`` is the origin, ``coords[1]`` the destination.
        valhalla_url: Base URL of the Valhalla service; ``/route`` is appended.
        timeout: Seconds to wait for each HTTP request before ``requests`` raises,
            so a stalled server cannot hang the caller indefinitely.

    Returns:
        ``(plans, detailed_plans)`` where each is a list of exactly three routes:
        ``plans`` holds lists of instruction strings and ``detailed_plans`` the
        matching lists of per-step dicts. Returns ``(None, None)`` when fewer than
        three distinct routes were obtained.

    Raises:
        requests.exceptions.RequestException: On connection failures or timeouts.
    """
    base_payload = {
        'locations': [
            {
                'lon': coords[0][0],
                'lat': coords[0][1],
            },
            {
                'lon': coords[1][0],
                'lat': coords[1][1],
            }
        ],
        'costing': 'auto',
        "directions_options": { "units": "kilometers" },
    }

    # Each entry tweaks the 'auto' costing model to coax a different route out of Valhalla.
    costing_variants = [
        None,                           # Default route
        {'use_distance': 1},            # Route with shortest distance
        {'maneuver_penalty': 1000},     # Avoid maneuvers
        {'use_highways': 0},            # Avoid highways
        {'use_living_streets': 1},      # Bias residential streets
    ]
    payloads = [
        base_payload if options is None
        else base_payload | {'costing_options': {'auto': options}}
        for options in costing_variants
    ]

    headers = {"Content-Type": "application/json"}

    plans = []
    detailed_plans = []

    for payload in payloads:
        response = requests.post(valhalla_url + '/route', json=payload, headers=headers, timeout=timeout)

        if response.status_code != 200:
            # A failed variant is skipped; the remaining variants may still yield enough routes
            print(f"Error (Valhalla): Received status code {response.status_code}")
            print(response.text)
            continue

        instructions, detailed_instructions = _parse_valhalla_leg(response.json()['trip']['legs'][0])
        plans.append(instructions)
        detailed_plans.append(detailed_instructions)

    # Keep the first occurrence of each distinct instruction sequence, in request order
    seen = set()
    final_plans = []
    final_detailed_plans = []

    for plan, detailed_plan in zip(plans, detailed_plans):
        key = tuple(plan)
        if key in seen:
            continue
        seen.add(key)
        final_plans.append(plan)
        final_detailed_plans.append(detailed_plan)

    if len(final_plans) < 3:
        return (None, None)

    # Filter to only three unique plans
    return (final_plans[:3], final_detailed_plans[:3])

In [43]:
columns = ['Domain', 'Goal', 'Plan']

count = 0
total_iterations = 500

# Safety valve: the loop retries on every rejected pair or failed routing call, so cap
# the total number of attempts to avoid spinning forever if no more routes can be found.
max_attempts = total_iterations * 100
attempts = 0

generator = ToAndFroGenerator([hospital_df, airport_df, landmark_df, gov_building_df], randomSeed=42)

# Collect rows in a plain list and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
records = []

with tqdm(total=total_iterations, desc="Generating Data", unit="entry") as pbar:
    while count < total_iterations and attempts < max_attempts:
        attempts += 1

        # NOTE: get_to_and_fro returns (to_row, from_row), so the row bound to
        # `from_row` here is the one the generator sampled as its destination.
        # The unpacking is kept as-is so the seeded output does not change.
        (from_row, to_row) = generator.get_to_and_fro()

        if from_row is None or to_row is None:
            continue

        # Valhalla expects [lon, lat] ordering here (see get_directions)
        coords = [
            [
                from_row['Longitude'],
                from_row['Latitude'],
            ], [
                to_row['Longitude'],
                to_row['Latitude'],
            ]
        ]

        (instructions, detailed_instructions) = get_directions(coords)

        # Either an error, or less than three routes
        if not instructions or len(instructions) < 3:
            continue

        records.append({
            'id': int(count),
            'Domain': f"coordinates = [[{from_row['Longitude']},{from_row['Latitude']}],[{to_row['Longitude']},{to_row['Latitude']}],]",
            'Goal': f"{from_row['Name']} ({from_row['City']}, {from_row['State']}) to {to_row['Name']} ({to_row['City']}, {to_row['State']})",
            'Plan': str(instructions),
            'Detailed Plan': str(detailed_instructions),
        })

        # Update count and progress bar
        count += 1
        pbar.update(1)

if count < total_iterations:
    print(f"Warning: stopped after {attempts} attempts with only {count} of {total_iterations} entries generated")

# Same column order the incremental concat used to produce
df = pd.DataFrame(records, columns=columns + ['id', 'Detailed Plan'])

df

Generating Data:  11%|█         | 55/500 [00:09<01:07,  6.57entry/s]

Error (Valhalla): Received status code 400
{"error_code":442,"error":"No path could be found for input","status_code":400,"status":"Bad Request"}
Error (Valhalla): Received status code 400
{"error_code":442,"error":"No path could be found for input","status_code":400,"status":"Bad Request"}
Error (Valhalla): Received status code 400
{"error_code":442,"error":"No path could be found for input","status_code":400,"status":"Bad Request"}
Error (Valhalla): Received status code 400
{"error_code":442,"error":"No path could be found for input","status_code":400,"status":"Bad Request"}
Error (Valhalla): Received status code 400
{"error_code":442,"error":"No path could be found for input","status_code":400,"status":"Bad Request"}


Generating Data: 100%|██████████| 500/500 [01:25<00:00,  5.84entry/s]


Unnamed: 0,Domain,Goal,Plan,id,Detailed Plan
0,"coordinates = [[-71.22407723,42.20650415],[-71...",WESTWOOD/PEMBROKE HEALTH SYSTEM WESTWOOD (WEST...,"[['Drive north on Abbey Road.', 'Turn left ont...",0.0,[[{'instruction': 'Drive north on Abbey Road.'...
1,"coordinates = [[-80.31080036,26.01301541],[-80...","MEMORIAL HOSPITAL WEST (PEMBROKE PINES, FL) to...","[['Drive northwest.', 'Turn left.', 'Turn righ...",1.0,"[[{'instruction': 'Drive northwest.', 'name': ..."
2,"coordinates = [[-98.57619951,29.52010675],[-97...","CLARITY CHILD GUIDANCE CENTER (SAN ANTONIO, TX...","[['Drive east.', 'Turn left onto Tom Slick.', ...",2.0,"[[{'instruction': 'Drive east.', 'name': '-', ..."
3,"coordinates = [[-81.4371032715,28.2898006439],...","Kissimmee Gateway Airport (ORLANDO, FL) to IND...","[['Drive east on Patrick Street.', 'Turn left ...",3.0,[[{'instruction': 'Drive east on Patrick Stree...
4,"coordinates = [[-122.2488937,38.11790375],[-12...",ST. HELENA HOSPITAL CENTER FOR BEHAVIORAL HEAL...,"[['Drive north on Broadway Street.', 'Turn lef...",4.0,[[{'instruction': 'Drive north on Broadway Str...
...,...,...,...,...,...
495,"coordinates = [[-90.709503,42.402],[-92.400299...","Dubuque Regional Airport (DUBUQUE, IA) to Wate...","[['Drive north.', 'Bear right.', 'Turn left on...",495.0,"[[{'instruction': 'Drive north.', 'name': '-',..."
496,"coordinates = [[-76.56655563780009,42.9295192]...","Seward, William H., House (Auburn, NY) to Syra...","[['Drive north.', 'Turn left onto South Street...",496.0,"[[{'instruction': 'Drive north.', 'name': '-',..."
497,"coordinates = [[-71.06823934295548,42.35671345...","Headquarters House (Boston, MA) to BAYSTATE MA...","[['Drive west on Beacon Street.', 'Bear left t...",497.0,[[{'instruction': 'Drive west on Beacon Street...
498,"coordinates = [[-86.58465712669839,34.73047047...","Milligan Block (Huntsville, AL) to Northwest A...","[['Drive northwest on East Side Square.', 'Tak...",498.0,[[{'instruction': 'Drive northwest on East Sid...


In [None]:
# Instructions that name no street, so they are ambiguous without a distance/time hint
_AMBIGUOUS_INSTRUCTIONS = frozenset({
    # Types 10 and 15
    'Turn right.',
    'Turn left.',
    # Types 11 and 14
    'Make a sharp right.',
    'Make a sharp left.',
    # Types 9 and 16
    'Bear right.',
    'Bear left.',
    # Types 20 and 21
    'Take the exit on the right.',
    'Take the exit on the left.',
    # Types 18 and 19
    'Take the ramp on the right.',
    'Take the ramp on the left.',
    # Types 12 and 13
    'Make a left U-turn.',
    'Make a right U-turn.',
})


def _duration_interval(duration):
    """Return [low, high]: the duration minus/plus 10%, each rounded to whole seconds."""
    return [round(duration - duration / 10), round(duration + duration / 10)]


def _describe_span(distance, interval):
    """Format 'N meters or A-B seconds.' (a single number when both bounds are equal)."""
    if interval[0] == interval[1]:
        return f"{round(distance)} meters or {interval[0]} seconds."
    return f"{round(distance)} meters or {interval[0]}-{interval[1]} seconds."


def reformat_instructions(row):
    """Rewrite every plan in ``row`` so each step can be located unambiguously.

    For each plan (``row['Plan'][i]`` with its ``row['Detailed Plan'][i]``):
      * strips a trailing compass direction from each step's 'name',
      * merges a roundabout enter step (type 26) with the following exit step
        (type 27) and drops the exit step,
      * fills in a ramp's name from its instruction text when the data has '-',
      * appends "after X meters or A-B seconds" (taken from the previous step)
        to instructions that name no street,
      * inserts a "Continue for ..." step (type 8) before the final step and
        moves the last leg's distance/duration onto it,
      * adds 'has_interval' (and 'interval' where applicable) to every step.

    The step dicts are modified in place and the row's plan lists are replaced.
    Empty plans are left untouched.

    Args:
        row: Mapping with 'Plan' (list of lists of str) and 'Detailed Plan'
            (list of lists of step dicts) of equal shape.

    Returns:
        The same ``row`` object, updated.
    """
    for plan_idx in range(len(row['Plan'])):
        detailed_plan = row['Detailed Plan'][plan_idx]

        # Nothing to rewrite (and no final step to copy) in an empty plan
        if not detailed_plan:
            continue

        new_plan = []
        new_detailed_plan = []

        # Every step except the last; the final (arrival) step is handled separately below
        for step_idx in range(len(detailed_plan) - 1):
            step = detailed_plan[step_idx]
            step['has_interval'] = False

            # Remove road directions from 'name' field
            step['name'] = re.sub(r'\s+(South|North|East|West)$', '', step['name'])

            step_type = step['type']

            # Type == 26 and 27: Roundabouts
            # These give duplicate actions in their instructions
            # so they will be merged into one step
            if step_type == 26:
                roundabout_exit = detailed_plan[step_idx + 1]
                if roundabout_exit['type'] == 27:
                    # Fold the exit into the enter step so it carries the full details
                    step['distance'] += roundabout_exit['distance']
                    step['duration'] += roundabout_exit['duration']
                    step['name'] = roundabout_exit['name']
                    step['alt_names'] = roundabout_exit['alt_names']
                else:
                    print("Error: Entering roundabout without leaving")

            # Case where the instruction clearly lists a name for the ramp, but the data does not reflect it
            if step_type in (17, 18, 19) and step['name'] == '-':
                # Requires text between "take the" and "ramp", so a bare "take the ramp" never matches.
                # NOTE(review): the pattern is case-sensitive, so instructions that start with a
                # capitalized "Take the ..." are not matched -- confirm whether that is intended.
                match = re.search(r'take the (.+?) ramp', step['instruction'])
                if match:
                    # This means the ramp was given a name in the text representation
                    step['name'] = match.group(1)

            # Add the previous step's distance and time to instructions that name no street
            instruction = step['instruction']
            if instruction in _AMBIGUOUS_INSTRUCTIONS and step_idx != 0:
                previous = detailed_plan[step_idx - 1]
                interval = _duration_interval(previous['duration'])

                # Drop the trailing period before appending the hint
                step['instruction'] = f"{instruction[:-1]} after {_describe_span(previous['distance'], interval)}"
                step['has_interval'] = True
                step['interval'] = interval

            # Roundabout exits were merged into their enter step above
            if step['type'] != 27:
                new_plan.append(step['instruction'])
                new_detailed_plan.append(step)

        # Add second to last step "Continue for X meters" for clarity
        if new_detailed_plan:
            last_leg = new_detailed_plan[-1]
            interval = _duration_interval(last_leg['duration'])

            continue_step = {
                'instruction': f"Continue for {_describe_span(last_leg['distance'], interval)}",
                'name': last_leg['name'],
                'alt_names': last_leg['alt_names'],
                'distance': last_leg['distance'],
                'duration': last_leg['duration'],
                'type': 8,
            }
            # 'location' is optional on steps, so only carry it over when present
            if 'location' in last_leg:
                continue_step['location'] = last_leg['location']
            continue_step['has_interval'] = True
            continue_step['interval'] = interval

            # The travel of the last leg now lives on the "Continue" step
            last_leg['duration'] = 0
            last_leg['distance'] = 0

            new_plan.append(continue_step['instruction'])
            new_detailed_plan.append(continue_step)

        # Add final step "You have arrived at your destination"
        final_step = detailed_plan[-1]
        final_step['has_interval'] = False

        new_plan.append(final_step['instruction'])
        new_detailed_plan.append(final_step)

        # Update row with new plans
        row['Plan'][plan_idx] = new_plan
        row['Detailed Plan'][plan_idx] = new_detailed_plan

    return row

# Turn the stringified columns back into Python objects before post-processing.
# 'Domain' carries a 'coordinates = ' label in front of the list literal, so strip it first.
df['Domain'] = df['Domain'].apply(
    lambda text: ast.literal_eval(text.replace('coordinates = ', '').strip())
)
for serialized_col in ('Plan', 'Detailed Plan'):
    df[serialized_col] = df[serialized_col].apply(ast.literal_eval)

# Rewrite each row's plans (row-wise apply)
df = df.apply(reformat_instructions, axis=1)

In [45]:

column_order = ['id', 'Domain', 'Goal', 'Plan']

# Serialize the structured columns back to text for CSV export
df['Domain'] = df['Domain'].apply(lambda coords: 'coordinates = ' + str(coords))
for text_col in ('Plan', 'Detailed Plan'):
    df[text_col] = df[text_col].apply(str)
df['id'] = df['id'].apply(int)

# Full export: the base columns plus the per-step details
detailed_columns = column_order + ['Detailed Plan']
df.to_csv("./output_data/detailed_travel_routes.csv", columns=detailed_columns, index=False)

# Compact export: same rows without the per-step details
df = df.drop(columns=['Detailed Plan'])
df.to_csv("./output_data/travel_routes.csv", columns=column_order, index=False)