In [30]:
import pandas as pd
from typing import List
from dotenv import load_dotenv
from geopy.distance import geodesic
from tqdm import tqdm
import openrouteservice
import polyline

import random
import requests
import json
import sys
import os
import time
import ast
import re


# Destination datasets. Later cells read the 'Name', 'City', 'State',
# 'Longitude' and 'Latitude' columns from these frames.
hospital_df = pd.read_csv('../destinations/data/Hospitals.csv')
airport_df = pd.read_csv('../destinations/data/Alt_Airports.csv')
landmark_df = pd.read_csv('../destinations/data/Landmarks.csv')
# NOTE(review): this loads the same file as landmark_df, so landmarks are
# effectively included twice — looks like a copy-paste slip; confirm the
# intended government-buildings CSV path.
gov_building_df = pd.read_csv('../destinations/data/Landmarks.csv')

### Generator class for choosing "to" and "fro" (destination and origin) points

In [31]:

# This currently uses rejection sampling to find points that are less than
# 150km apart. If this is used to generate a larger dataset, this needs to
# be refactored to generate pairs on initialization, and/or use a spatial
# database to improve performance.

class ToAndFroGenerator:
    """Samples unique, nearby origin/destination pairs from several point datasets.

    Each call to ``get_to_and_fro`` picks one source dataframe per endpoint
    (weighted by ``frequencies``) and then uses rejection sampling to find two
    distinct rows in the same state that are closer than ``max_distance_km``
    and have not been returned by this generator before.

    Every dataframe must provide 'State', 'Latitude' and 'Longitude' columns.
    """

    def __init__(self, dataframes: List[pd.DataFrame], frequencies: List[float] = None,
                 max_distance_km: float = 150, max_attempts: int = 10):
        """Validate the inputs and set up the sampling state.

        Args:
            dataframes: Datasets to draw points from.
            frequencies: Relative weight of each dataframe, in the same order.
                Defaults to a uniform distribution.
            max_distance_km: Pairs at or beyond this geodesic distance are rejected.
            max_attempts: Number of rejection-sampling tries per call.

        Raises:
            ValueError: If no dataframe is given, the two lists differ in
                length, or the weights are negative / do not sum to a
                positive value.
        """
        if not dataframes:
            raise ValueError("at least one dataframe is required")

        # Default to uniform if no distribution is provided
        if frequencies is None:
            frequencies = [1] * len(dataframes)
        elif len(dataframes) != len(frequencies):
            raise ValueError("dataframes and frequencies lists must have the same length")

        if any(weight < 0 for weight in frequencies) or sum(frequencies) <= 0:
            raise ValueError("frequencies must be non-negative and sum to a positive value")

        self.__dataframes = dataframes
        self.__frequencies = list(frequencies)
        self.__max_distance_km = max_distance_km
        self.__max_attempts = max_attempts
        # Coordinate tuples of every pair handed out so far
        self.__seen = set()

    def get_to_and_fro(self):
        """Return one new pair of rows, or ``(None, None)`` if none was found.

        Returns:
            ``(to_row, from_row)`` — note the order: destination first, origin
            second. Both are rows (``pd.Series``) of the source dataframes.
            ``(None, None)`` is returned when no acceptable pair turns up
            within ``max_attempts`` tries.
        """
        # Weighted choice of the source dataframe for each endpoint
        from_idx, to_idx = random.choices(
            range(len(self.__dataframes)), weights=self.__frequencies, k=2
        )
        from_df = self.__dataframes[from_idx]
        to_df = self.__dataframes[to_idx]

        for _ in range(self.__max_attempts):
            from_row = from_df.sample(n=1).iloc[0]

            # Restrict destination candidates to the origin's state
            candidates = to_df[to_df['State'] == from_row['State']]
            if candidates.empty:
                # Another origin may be in a state that does have candidates,
                # so retry instead of giving up on the first miss.
                continue
            to_row = candidates.sample(n=1).iloc[0]

            # Check to make sure to isn't the same as from
            if to_row.equals(from_row):
                continue

            # Filter out points that are too far apart
            distance_km = geodesic(
                (from_row['Latitude'], from_row['Longitude']),
                (to_row['Latitude'], to_row['Longitude']),
            ).km
            if distance_km >= self.__max_distance_km:
                continue

            # Check against seen pairs for uniqueness (direction-sensitive)
            pair_key = (
                from_row['Longitude'], from_row['Latitude'],
                to_row['Longitude'], to_row['Latitude'],
            )
            if pair_key in self.__seen:
                continue

            self.__seen.add(pair_key)
            return (to_row, from_row)

        # No unique pair was found within the allowed number of tries
        return (None, None)
    
# Simple test
# Builds a generator over the hospital and airport datasets with equal weights
# and prints one sampled result: a (to_row, from_row) tuple of dataframe rows,
# or (None, None) when no acceptable pair was found.
obj = ToAndFroGenerator([hospital_df, airport_df], [1, 1])

print(obj.get_to_and_fro())


(Name         RAYMOND G. MURPHY VA MEDICAL CENTER
City                                 ALBUQUERQUE
State                                         NM
Longitude                            -106.581767
Latitude                               35.055168
Name: 112, dtype: object, Name         Las Vegas Municipal Airport
City                           LAS VEGAS
State                                 NM
Longitude                    -105.141998
Latitude                       35.654202
Name: 577, dtype: object)


In [32]:
def eprint(*args, **kwargs):
    """Print to standard error instead of standard output.

    Takes the same positional and keyword arguments as ``print`` (``file`` is
    fixed to ``sys.stderr``). This is for running on RC servers.
    """
    error_stream = sys.stderr
    print(*args, file=error_stream, **kwargs)


def _describe_maneuver(step, shape_points):
    """Build the detailed-instruction dict for a single Valhalla maneuver."""
    street_names = step.get('street_names') or []

    detail = {
        'instruction': step['instruction'],
        # Only take the primary name which is referred to in the instruction
        'name': street_names[0] if street_names else '-',
        # Store the rest of the names
        'alt_names': street_names[1:],
        'distance': step['length'] * 1000,  # Get distance in meters
        'duration': step['time'],  # Duration of step in seconds
        'type': step['type'],  # Type can tell more about what the instruction is doing
    }

    # Only add location if one is provided
    shape_index = step.get('begin_shape_index')
    if shape_index is not None and shape_index < len(shape_points):
        point = shape_points[shape_index]
        # NOTE(review): the division by 10 presumably compensates for the
        # Valhalla shape using one more decimal digit of precision than
        # polyline.decode assumes by default — confirm against the Valhalla docs.
        detail['location'] = {
            'lat': round(point[0] / 10, 6),
            'lon': round(point[1] / 10, 6),
        }

    return detail


def get_directions(from_row, to_row, ors_client, valhalla_url="http://localhost:8002", rate_limit_delay=1):
    """Fetch up to three alternative routes and turn them into instruction lists.

    Routes are requested from OpenRouteService; each returned geometry is then
    sent to the Valhalla ``/trace_route`` endpoint to obtain its maneuvers.

    Args:
        from_row: Mapping with 'Longitude' and 'Latitude' of the origin.
        to_row: Mapping with 'Longitude' and 'Latitude' of the destination.
        ors_client: Client object exposing a ``directions(...)`` method.
        valhalla_url: Base URL of the Valhalla server.
        rate_limit_delay: Seconds to sleep before querying, to avoid the rate limit.

    Returns:
        ``(plans, detailed_plans)``: two parallel lists with one entry per
        successfully traced route. ``plans`` holds lists of instruction
        strings, ``detailed_plans`` lists of per-maneuver dicts. Routes for
        which Valhalla does not answer with status 200 are skipped.
        ``(None, None)`` is returned when the OpenRouteService query fails.
    """
    # Avoid rate limit
    time.sleep(rate_limit_delay)

    try:
        # Query directions
        data = ors_client.directions(
            coordinates=[
                [from_row['Longitude'], from_row['Latitude']],
                [to_row['Longitude'], to_row['Latitude']],
            ],
            profile='driving-car',
            alternative_routes={
                'target_count': 3
            },
        )
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / kernel
        # interrupts still stop the run; report why the pair was dropped.
        print(f"Error (OpenRouteService): {exc}", file=sys.stderr)
        return (None, None)

    plans = []
    detailed_plans = []
    headers = {"Content-Type": "application/json"}

    for route in data['routes']:
        payload = {
            "shape": [
                {'lat': point[0], 'lon': point[1]}
                for point in polyline.decode(route['geometry'])
            ],
            "costing": "auto",
            "shape_match": "map_snap",
            "directions_options": {"units": "kilometers"},
        }

        response = requests.post(valhalla_url + '/trace_route', json=payload, headers=headers)

        if response.status_code != 200:
            print(f"Error (Valhalla): Received status code {response.status_code}")
            print(response.text)
            # Skip this route: there are no instructions to record for it.
            continue

        leg = response.json()['trip']['legs'][0]
        maneuvers = leg['maneuvers']

        # Decode the new polyline:
        shape_points = polyline.decode(leg['shape'])

        plans.append([step['instruction'] for step in maneuvers])
        detailed_plans.append([_describe_maneuver(step, shape_points) for step in maneuvers])

    return (plans, detailed_plans)

In [33]:
# Column layout of the generated dataset
columns = ['id', 'Domain', 'Goal', 'Plan', 'Detailed Plan']

count = 0
total_iterations = 2
# Upper bound on sampling/routing attempts, so that a persistent failure
# (e.g. every directions query failing) cannot keep the loop running forever.
max_attempts = total_iterations * 50

generator = ToAndFroGenerator([hospital_df, airport_df, landmark_df, gov_building_df])

load_dotenv()
API_KEY = os.getenv("OPEN_ROUTE_SERVICE_API_KEY")

ors_client = openrouteservice.Client(key=API_KEY)

# Rows are collected here and turned into a dataframe once at the end,
# instead of concatenating a one-row frame per iteration.
records = []
attempts = 0

with tqdm(total=total_iterations, desc="Generating Data", unit="entry") as pbar:
    while count < total_iterations and attempts < max_attempts:
        attempts += 1

        # get_to_and_fro returns the pair as (to, from)
        (to_row, from_row) = generator.get_to_and_fro()

        if from_row is None or to_row is None:
            continue

        # Throttle only when a request is actually about to be made
        time.sleep(1)
        (instructions, detailed_instructions) = get_directions(from_row=from_row, to_row=to_row, ors_client=ors_client)

        # Either an error, or less than three routes
        if not instructions or len(instructions) < 3:
            continue

        records.append({
            'id': int(count),
            'Domain': f"coordinates = [[{from_row['Longitude']},{from_row['Latitude']}],[{to_row['Longitude']},{to_row['Latitude']}],]",
            'Goal': f"{from_row['Name']} ({from_row['City']}, {from_row['State']}) to {to_row['Name']} ({to_row['City']}, {to_row['State']})",
            'Plan': str(instructions),
            'Detailed Plan': str(detailed_instructions),
        })

        # Update count and progress bar
        count += 1
        pbar.update(1)

if count < total_iterations:
    print(f"Warning: only generated {count} of {total_iterations} entries after {attempts} attempts", file=sys.stderr)

df = pd.DataFrame(records, columns=columns)

# Make sure the output directory exists before writing
os.makedirs('./output_data', exist_ok=True)
df.to_csv('./output_data/intermediate.csv')

Generating Data: 100%|██████████| 2/2 [00:10<00:00,  5.27s/entry]


In [34]:
# Instructions that carry no street name and are therefore ambiguous on their own
_AMBIGUOUS_INSTRUCTIONS = frozenset({
    # Types 10 and 15
    'Turn right.',
    'Turn left.',
    # Types 11 and 14
    'Make a sharp right.',
    'Make a sharp left.',
    # Types 9 and 16
    'Bear right.',
    'Bear left.',
    # Types 20 and 21
    'Take the exit on the right.',
    'Take the exit on the left.',
    # Types 18 and 19
    'Take the ramp on the right.',
    'Take the ramp on the left.',
    # Types 12 and 13
    'Make a left U-turn.',
    'Make a right U-turn.',
})

# Trailing compass direction on a road name, e.g. "Main Street East"
_TRAILING_DIRECTION_RE = re.compile(r'\s+(South|North|East|West)$')

# Named ramp inside an instruction. Case-insensitive so that it also matches
# instructions that begin with "Take the ..."; the plain "take the ramp" has
# nothing between "the" and "ramp" and therefore does not match.
_NAMED_RAMP_RE = re.compile(r'take the (.+?) ramp', re.IGNORECASE)

_ROUNDABOUT_ENTER_TYPE = 26
_ROUNDABOUT_EXIT_TYPE = 27
_RAMP_TYPES = (17, 18, 19)


def reformat_instructions(row):
    """Clean up every plan stored in ``row`` and return the row.

    ``row['Plan']`` is a list of plans (lists of instruction strings) and
    ``row['Detailed Plan']`` the parallel list of per-step dicts. For every
    step except the last one of each plan this:

    * strips a trailing compass direction from the step's ``name``;
    * merges a roundabout entry (type 26) with the following exit (type 27)
      into one step, summing distance and duration;
    * fills in the ``name`` of an unnamed ramp step (types 17-19) from the
      instruction text when the text names the ramp;
    * appends the previous step's distance and a +/-10% duration interval to
      ambiguous instructions and records it under ``interval``.

    Every remaining step gets a ``has_interval`` flag. Plans without any
    detailed steps are left untouched. The step dicts are modified in place.
    """
    for plan_idx in range(len(row['Plan'])):
        plan = row['Plan'][plan_idx]
        detailed_plan = row['Detailed Plan'][plan_idx]

        # Nothing to reformat (and no last step to flag) in an empty plan
        if not detailed_plan:
            continue

        indices_to_remove = set()
        # The last step is skipped so that step_idx + 1 is always valid
        for step_idx in range(len(plan) - 1):
            step = detailed_plan[step_idx]
            step['has_interval'] = False

            # Remove road directions from 'name' field
            step['name'] = _TRAILING_DIRECTION_RE.sub('', step['name'])

            step_type = step['type']

            # Roundabouts have a redundancy in their natural language
            # representation, so entry and exit are merged into one step
            if step_type == _ROUNDABOUT_ENTER_TYPE:
                following = detailed_plan[step_idx + 1]
                if following['type'] == _ROUNDABOUT_EXIT_TYPE:
                    step['distance'] += following['distance']
                    step['duration'] += following['duration']
                    step['name'] = following['name']
                    step['alt_names'] = following['alt_names']
                    indices_to_remove.add(step_idx + 1)
                else:
                    print("Error: Entering roundabout without leaving")

            # Case where the instruction clearly lists a name for the ramp, but the data does not reflect it
            if step_type in _RAMP_TYPES and step['name'] == '-':
                ramp_match = _NAMED_RAMP_RE.search(step['instruction'])
                if ramp_match:
                    # This means the ramp was given a name in the text representation
                    step['name'] = ramp_match.group(1)

            # Adding distance and time to steps without identifying features to distinguish their location
            instruction = step['instruction']
            if instruction in _AMBIGUOUS_INSTRUCTIONS and step_idx != 0:
                previous = detailed_plan[step_idx - 1]
                distance = previous['distance']
                duration = previous['duration']

                interval = [round(duration - duration / 10), round(duration + duration / 10)]

                # Remove period from old instruction
                new_instruction = instruction[:-1]
                if interval[0] == interval[1]:
                    new_instruction += f" after {round(distance)} meters or {interval[0]} seconds."
                else:
                    new_instruction += f" after {round(distance)} meters or {interval[0]}-{interval[1]} seconds."

                step['instruction'] = new_instruction
                plan[step_idx] = new_instruction

                step['has_interval'] = True
                step['interval'] = interval

        detailed_plan[-1]['has_interval'] = False

        # Drop the merged roundabout exits and update row with new plans
        row['Plan'][plan_idx] = [
            text for idx, text in enumerate(plan) if idx not in indices_to_remove
        ]
        row['Detailed Plan'][plan_idx] = [
            detail for idx, detail in enumerate(detailed_plan) if idx not in indices_to_remove
        ]

    return row

def _parse_literal(value):
    """Parse a stringified Python literal; values that are already parsed pass through."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# The columns were stored as strings; turn them back into Python structures.
# Non-string values are left alone so this cell can be re-run safely.
df['Domain'] = df['Domain'].apply(
    lambda x: _parse_literal(x.replace('coordinates = ', '').strip()) if isinstance(x, str) else x
)
df['Plan'] = df['Plan'].apply(_parse_literal)
df['Detailed Plan'] = df['Detailed Plan'].apply(_parse_literal)

df = df.apply(reformat_instructions, axis=1)

In [35]:

# Column layout shared by both exported files
column_order = ['id', 'Domain', 'Goal', 'Plan']

# Convert the parsed structures back to their text form before exporting
df['Domain'] = df['Domain'].map(lambda coords: 'coordinates = ' + str(coords))
df['Plan'] = df['Plan'].map(str)
df['Detailed Plan'] = df['Detailed Plan'].map(str)
df['id'] = df['id'].map(int)

# Full export, including the per-step details
df.to_csv(
    "./output_data/testing_generation.csv",
    columns=[*column_order, 'Detailed Plan'],
    index=False,
)

# Reduced export without the 'Detailed Plan' column
df = df.drop(columns=['Detailed Plan'])

df.to_csv(
    "./output_data/ajdfljasdfjkl.csv",
    columns=column_order,
    index=False,
)