In [244]:
import re

text = "I would like to schedule a trip from Brooklyn museum to Conservatory Garden, Central Park on tomorrow at 21:00"

# Define regular expressions to capture the required information
from_pattern = r'from\s+(.*?)\s+to'
to_pattern = r'(?:.*\bto\s)(.*?)(?=\s*on\b)'
day_pattern = r"on\s(\w+)\sat"
time_pattern = r"at\s([\d:]+)"

# Extract information using the defined patterns
pickup_location = re.search(from_pattern, text).group(1).strip()
dropoff_location = re.search(to_pattern, text).group(1).strip()
dropoff_location = dropoff_location.split(',')[0] if ',' in dropoff_location else dropoff_location
day = re.search(day_pattern, text).group(1).strip().lower()
time = re.search(time_pattern, text).group(1).strip()

In [245]:
from opencage.geocoder import OpenCageGeocode

key = '8d9425286cb64486aae1d5000472b211'
geocoder = OpenCageGeocode(key)

# query_from = u'Canarsie High School'.lower() + u', New York, USA'.lower()
# query_to = u'Brooklyn Museum'.lower() + u', New York, USA'.lower()

query_from = f'{pickup_location}'.lower() + u', New York, USA'.lower()
query_to = f'{dropoff_location}'.lower() + u', New York, USA'.lower()

go_from = geocoder.geocode(query_from)[0]
go_to = geocoder.geocode(query_to)[0]
neighborhood_from = geocoder.geocode(go_from['components']['neighbourhood'].lower() + u', New York, USA'.lower())[0]
neighborhood_to = geocoder.geocode(go_to['components']['neighbourhood'].lower() + u', New York, USA'.lower())[0]

In [246]:
import openrouteservice
from geopy.distance import geodesic

client = openrouteservice.Client(key='5b3ce3597851110001cf624841589dcc471d49fbb2b9a6f24ea4e804')
distance_coords:tuple = (
    (go_from['geometry']['lng'], go_from['geometry']['lat']), #pikcup (lng, lat)
    (go_to['geometry']['lng'], go_to['geometry']['lat']) #dropoff (lng, lat)
)

# Calculate directions from A to B
routes = client.directions(distance_coords, profile="driving-car")
trip_distance:float = routes['routes'][0]['summary']['distance'] / 1000  # Distance in kilometers

centoid_distance:float = geodesic(
    (neighborhood_from['geometry']['lat'], neighborhood_from['geometry']['lng']),
    (neighborhood_to['geometry']['lat'], neighborhood_to['geometry']['lng'])
).kilometers

# print(f"Distance from {pickup_location} to {dropoff_location}: {distance_km} km")
print(f"Trip distance: {trip_distance} km")
print(f"Centroid distance (neighborhoods/zones): {centoid_distance} km")

Trip distance: 19.160700000000002 km
Centroid distance (neighborhoods/zones): 13.160362686887272 km


In [253]:
from datetime import datetime, timedelta
import holidays

day = "today"
day_mapping:dict = {
    "monday": 0,
    "tuesday": 1,
    "wednesday": 2,
    "thursday": 3,
    "friday": 4,
    "saturday": 5,
    "sunday": 6
}
us_holidays = holidays.country_holidays('US', years=range(datetime.now().date().year, (datetime.now() + timedelta(days=1*365)).year))
hol_dts = []
for date, name in sorted(us_holidays.items()):
    hol_dts.append(date)
if day == "today":
    dt = datetime.now().date()
elif day == "tomorrow":
    dt = datetime.now().date() + timedelta(days=1)
elif day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]:
    current_dt = datetime.now().date()
    days_ahead = day_mapping[day] - current_dt.weekday() + 7
    dt = current_dt + timedelta(days=days_ahead)
else:
    clean_dt = re.sub(u'[./,-]', '/', day).split('/')
    (year_dt, month_dt, day_dt) = datetime.now().year, int(clean_dt[0]), int(clean_dt[1])
    dt = datetime(year=year_dt, month=month_dt, day=day_dt).date()
#--------------------------------------------------------------------------------------
pickup_weekday = dt.weekday() + 1
pickup_holiday = int(dt in hol_dts)
pickup_weekend = int(pickup_weekday in [6,7])
#--------------------------------------------------------------------------------------
hour_tm, minute_tm = ( int(time.split(":")[0]), int(time.split(":")[1]) )
pickup_hour = hour_tm
pickup_daytime = 1 if (hour_tm in range(7,11)) or (hour_tm in range(16,20)) else 2 if hour_tm in [20,21,22,23,0,1,2,3,4,5,6] else 3
pickup_quarter =  (hour_tm * 60 + minute_tm) // 15 + 1
month_start = datetime(year=year_dt, month=month_dt, day=1)
pickup_tm = datetime(year=year_dt, month=month_dt, day=day_dt, hour=hour_tm, minute=minute_tm)
pickup_seconds = (pickup_tm - month_start).total_seconds()/60

In [256]:
import os
import configparser
import joblib
import numpy as np
from sklearn.pipeline import Pipeline

config = configparser.ConfigParser()
current_dir = "/home/nspanos/m151_web_systems_project/src"
parent_dir = os.path.dirname(current_dir)
config.read(os.path.join(parent_dir, "config", "config.ini"))

application_path:str = config.get("settings", "application_path")
model_artifacts_parent:str = config.get("ml-settings", "model_artifacts_path")
model_artifacts_child:str = config.get("ml-settings", "duration_model_artifact")
duration_model_path:str = os.path.join(
    application_path, model_artifacts_parent, model_artifacts_child
)
available_directories:list = [os.path.join(duration_model_path, file) for file in os.listdir(duration_model_path)]
latest_modified_directory:str = max(available_directories, key=os.path.getmtime)
trip_type:str = "short_trip" if trip_distance < 30.0 else "long_trip"
models_path:str = os.path.join(latest_modified_directory, trip_type, "models")
predictions:list = []
for model in os.listdir(models_path):
    print(os.path.join(models_path, model))
    regressor:Pipeline = joblib.load(os.path.join(models_path, model))
    prediction = regressor.predict([[
        trip_distance,
        pickup_daytime,
        pickup_hour,
        pickup_weekday,
        pickup_quarter,
        pickup_seconds,
        pickup_holiday,
        pickup_weekend,
        centoid_distance
    ]])
    predictions.append(prediction[0])
avg_value = np.round(np.mean(predictions), 2)

/home/nspanos/m151_web_systems_project/ml_models/trip_duration/20240601/short_trip/models/linear_regressor_best_model.joblib
/home/nspanos/m151_web_systems_project/ml_models/trip_duration/20240601/short_trip/models/voting_regressor_best_model.joblib


In [257]:
avg_value

29.5

In [222]:
#     "trip_distance",  # Will be scaled
#     "pickup_daytime", # One-Hot-Encode for Linear/Voting Regressor
#     "pickup_hour",    # One-Hot-Encode for Linear/Voting Regressor
#     "pickup_weekday", # One-Hot-Encode for Linear/Voting Regressor
#     "pickup_quarter", # Will be scaled
#     "pickup_seconds", # Will be scaled
#     "pickup_holiday",
#     "pickup_weekend",
#     "haversine_centroid_distance" # Will be scaled

'short_trip'

In [203]:
hour_tm, minute_tm = ( int(time.split(":")[0]), int(time.split(":")[1]) )
pickup_hour = hour_tm
pickup_daytime = 1 if (hour_tm in range(7,11)) or (hour_tm in range(16,20)) else 2 if hour_tm in [20,21,22,23,0,1,2,3,4,5,6] else 3
pickup_quarter =  (hour_tm * 60 + minute_tm) // 15 + 1
month_start = datetime(year=year_dt, month=month_dt, day=1)
pickup_tm = datetime(year=year_dt, month=month_dt, day=day_dt, hour=hour_tm, minute=minute_tm)
pickup_seconds = (pickup_tm - month_start).total_seconds()/60

In [208]:
pickup_tm

datetime.datetime(2024, 6, 2, 9, 0)

In [210]:
pickup_seconds.total_seconds()/60

1980.0

In [None]:
pickup_tm = datetime(year=year_dt, month=month_dt, day=1)

In [171]:
pickup_weekend

1

In [None]:
tomorrow -> pickup_weekday, pickup_holiday, pickup_weekend

In [149]:
dt.weekday()+1

4

In [None]:
tomorrow -> pickup_weekday, pickup_holiday, pickup_weekend

In [77]:
results_from['components']['neighbourhood']

'Prospect Heights'

In [63]:
geocoder.geocode(results_from[0]['components']['neighbourhood'].lower() + u', New York, USA'.lower())[0]

{'annotations': {'DMS': {'lat': "40° 40' 40.33488'' N",
   'lng': "73° 58' 6.50100'' W"},
  'FIPS': {'county': '36047', 'state': '36'},
  'MGRS': '18TWL8717403510',
  'Maidenhead': 'FN30aq32sq',
  'Mercator': {'x': -8234132.693, 'y': 4937086.153},
  'OSM': {'edit_url': 'https://www.openstreetmap.org/edit?node=248606480#map=17/40.67787/-73.96847',
   'note_url': 'https://www.openstreetmap.org/note/new#map=17/40.67787/-73.96847&layers=N',
   'url': 'https://www.openstreetmap.org/?mlat=40.67787&mlon=-73.96847#map=17/40.67787/-73.96847'},
  'UN_M49': {'regions': {'AMERICAS': '019',
    'NORTHERN_AMERICA': '021',
    'US': '840',
    'WORLD': '001'},
   'statistical_groupings': ['MEDC']},
  'callingcode': 1,
  'currency': {'alternate_symbols': ['US$'],
   'decimal_mark': '.',
   'disambiguate_symbol': 'US$',
   'html_entity': '$',
   'iso_code': 'USD',
   'iso_numeric': '840',
   'name': 'United States Dollar',
   'smallest_denomination': 1,
   'subunit': 'Cent',
   'subunit_to_unit': 100,


In [27]:
results_from

[{'annotations': {'DMS': {'lat': "40° 40' 35.65920'' N",
    'lng': "73° 56' 39.48720'' W"},
   'FIPS': {'state': '36'},
   'MGRS': '18TWL8921903390',
   'Maidenhead': 'FN30aq62qj',
   'Mercator': {'x': -8231442.046, 'y': 4936896.247},
   'OSM': {'note_url': 'https://www.openstreetmap.org/note/new#map=17/40.67657/-73.94430&layers=N',
    'url': 'https://www.openstreetmap.org/?mlat=40.67657&mlon=-73.94430#map=17/40.67657/-73.94430'},
   'UN_M49': {'regions': {'AMERICAS': '019',
     'NORTHERN_AMERICA': '021',
     'US': '840',
     'WORLD': '001'},
    'statistical_groupings': ['MEDC']},
   'callingcode': 1,
   'currency': {'alternate_symbols': ['US$'],
    'decimal_mark': '.',
    'disambiguate_symbol': 'US$',
    'html_entity': '$',
    'iso_code': 'USD',
    'iso_numeric': '840',
    'name': 'United States Dollar',
    'smallest_denomination': 1,
    'subunit': 'Cent',
    'subunit_to_unit': 100,
    'symbol': '$',
    'symbol_first': 1,
    'thousands_separator': ','},
   'flag': '🇺