In [1]:
import pandas as pd
import numpy as np
import os
import joblib

from imblearn.ensemble import BalancedRandomForestClassifier
from geopy.distance import geodesic


from models import Driver
from models import Order
from models import DriverRecord
from models import WeatherService

In [2]:
# Every CSV input is parsed with the same engine, so the option is stated once.
csv_read_options = {"engine": "pyarrow"}

# Tabular inputs used by the simulation below.
schedule_data = pd.read_csv("./data/driver_schedule.csv", **csv_read_options)
driver_data = pd.read_csv("./data/driver_update2.csv", **csv_read_options)
order_data = pd.read_csv("./data/order.csv", **csv_read_options)
order_driver_data = pd.read_csv("./data/order_driver.csv", **csv_read_options)

# Project services: the weather lookup is backed by its own CSV file, and the
# driver record starts out freshly constructed.
weather_service = WeatherService(weather_csv_path="./data/weather.csv")
driver_record = DriverRecord()

In [3]:
# The folder that is created and the folder the model is read from must be the
# same directory. They previously differed only by letter case ("./out" vs
# "./Out"), which are two different directories on case-sensitive filesystems.
model_output_folder = "./Out"
os.makedirs(model_output_folder, exist_ok=True)
model_path = f"{model_output_folder}/acceptance_model.pkl"

# Set to True to skip loading the stored model (e.g. before training a new one).
retrain_model: bool = False

if os.path.exists(model_path) and not retrain_model:
    print(f"Model found at {model_path}. Loading model...")
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files that come from a trusted source.
    model = joblib.load(model_path)
    print("Model loaded successfully.")
elif retrain_model:
    # Keep `model` bound so later cells fail with a clear message instead of a
    # NameError; a training step must assign the real model before it is used.
    model = None
    print(
        "retrain_model is True: no model was loaded. "
        "Train a model and assign it to `model` before using it."
    )
else:
    raise FileNotFoundError(
        f"No acceptance model found at {model_path}. "
        "Train and save a model first, or point model_path at an existing file."
    )

Model found at ./Out/acceptance_model.pkl. Loading model...
Model loaded successfully.


In [4]:
class DriverManager:
    """
    Builds the candidate driver pool for an order and simulates its dispatch.

    Three data sources are combined:

    * ``order_driver_data`` - maps orders to driver ids (columns ``order_id``
      and ``driver_id`` are read).
    * ``driver_data`` - per-order driver snapshots (columns ``order_id``,
      ``driver_id`` and the columns listed in ``DRIVER_COLUMNS`` are read).
    * ``update_driver_set`` - an in-memory frame holding the latest simulated
      state of every driver that accepted an order through this manager.
      It overrides the snapshot data whenever a driver appears in both.

    All methods report their progress with ``print``.
    """

    # Column layout shared by the per-order pool and ``update_driver_set``.
    DRIVER_COLUMNS = [
        "driver_id",
        "driver_lat",
        "driver_lon",
        "driver_area",
        "work_time_minutes",
    ]

    # Supported orderings of the candidate pool; anything else means "distance".
    SORT_POLICIES = ("distance", "random")

    def __init__(
        self,
        order_driver_data: pd.DataFrame,
        driver_data: pd.DataFrame,
        schedule_data: pd.DataFrame,
        acceptance_model: "BalancedRandomForestClassifier",
        driver_record: "DriverRecord",
        sort_driver_pool_policy: str = "distance",
    ):
        """
        Args:
            order_driver_data: Order-to-driver mapping frame.
            driver_data: Per-order driver snapshot frame.
            schedule_data: Driver schedule frame, forwarded unchanged to
                ``Driver.decide_acceptance``.
            acceptance_model: Model handed to every ``Driver`` created here.
            driver_record: Sink that receives ``add_driver_record`` calls for
                accepted orders.
            sort_driver_pool_policy: ``"distance"`` (nearest driver first) or
                ``"random"`` (shuffled). Unknown values fall back to
                ``"distance"``.
        """
        self.order_driver_data = order_driver_data
        self.driver_data = driver_data
        self.schedule_data = schedule_data
        # Starts empty; rows are added when drivers accept orders.
        self.update_driver_set = pd.DataFrame(columns=list(self.DRIVER_COLUMNS))
        self.model = acceptance_model
        self.driver_record = driver_record

        if sort_driver_pool_policy in self.SORT_POLICIES:
            self.sort_driver_pool_policy = sort_driver_pool_policy
        else:
            self.sort_driver_pool_policy = "distance"
        print(f"The Driver Pool is sorted by {self.sort_driver_pool_policy}")

    def get_original_driver_set(self, order: "Order") -> pd.DataFrame:
        """
        Return the snapshot rows of the drivers mapped to ``order``.

        A row of ``driver_data`` is selected when its ``order_id`` equals the
        order's id and its ``driver_id`` is listed for that order in
        ``order_driver_data``. Duplicate driver rows are kept as they are.

        Returns:
            A frame restricted to ``DRIVER_COLUMNS``.
        """
        assigned_rows = self.order_driver_data[
            self.order_driver_data["order_id"] == order.order_id
        ]
        assigned_driver_ids = assigned_rows["driver_id"].unique().tolist()

        is_assigned = self.driver_data["driver_id"].isin(assigned_driver_ids)
        is_same_order = self.driver_data["order_id"] == order.order_id
        original_driver_set = self.driver_data[is_assigned & is_same_order]

        return original_driver_set[list(self.DRIVER_COLUMNS)]

    def get_driver_pool(self, order: "Order") -> pd.DataFrame:
        """
        Build the candidate driver pool for ``order``.

        Steps:
            1. Start from the order's original driver set.
            2. Append drivers from ``update_driver_set`` whose ``driver_area``
               equals ``order.pickup_area`` and that are not in the pool yet.
            3. Overwrite pool rows with the values stored in
               ``update_driver_set`` for every driver present in both.
            4. Shuffle the pool (``"random"`` policy) or sort it by geodesic
               distance in metres to the pickup point (adds a ``distance``
               column).

        This method does not modify ``update_driver_set``.

        Returns:
            The ordered pool; it may be empty.
        """
        driver_pool = self.get_original_driver_set(order).copy()

        # Step 2: drivers known (from earlier acceptances) to be in the pickup area.
        area_matched_drivers = self.update_driver_set[
            self.update_driver_set["driver_area"] == order.pickup_area
        ].copy()
        truly_new_drivers_to_add = area_matched_drivers[
            ~area_matched_drivers["driver_id"].isin(driver_pool["driver_id"])
        ]

        if not truly_new_drivers_to_add.empty:
            driver_pool = pd.concat(
                [driver_pool, truly_new_drivers_to_add], ignore_index=True
            )
            print(
                f"Added new drivers to the pool based on matching 'driver_area' ('{order.pickup_area}'): {truly_new_drivers_to_add['driver_id'].tolist()}"
            )
        else:
            print(
                f"No new drivers from update_driver_set with matching 'driver_area' ('{order.pickup_area}') to add."
            )

        # Step 3: DataFrame.update aligns on the index, so both frames are keyed
        # by driver_id for the overwrite and the pool is un-indexed afterwards.
        driver_pool_indexed = driver_pool.set_index("driver_id")
        update_set_indexed = self.update_driver_set.set_index("driver_id")
        drivers_to_be_updated = driver_pool_indexed.index.intersection(
            update_set_indexed.index
        )
        driver_pool_indexed.update(update_set_indexed)
        driver_pool = driver_pool_indexed.reset_index()

        if not drivers_to_be_updated.empty:
            print(
                f"Using data in update_driver_set to update the following drivers in the pool (original and new area-matched): {drivers_to_be_updated.tolist()}"
            )
        else:
            print(
                "update_driver_set has no matched driver ID for existing drivers in the pool, no update from it."
            )

        # Step 4: order the candidates.
        if self.sort_driver_pool_policy == "random":
            driver_pool = driver_pool.sample(frac=1).reset_index(drop=True)
            print("Driver pool has been randomized.")
        else:
            pickup_point = (order.pickup_lat, order.pickup_lon)
            # A list comprehension is used instead of DataFrame.apply(axis=1):
            # on an empty pool, apply returns a DataFrame, which cannot be
            # assigned to a single column.
            driver_pool["distance"] = [
                geodesic((driver_lat, driver_lon), pickup_point).m
                for driver_lat, driver_lon in zip(
                    driver_pool["driver_lat"], driver_pool["driver_lon"]
                )
            ]
            driver_pool = driver_pool.sort_values(by="distance", ascending=True)
            print("Driver pool has been sorted by distance.")

        return driver_pool

    def get_driver_attampt(self, order: "Order"):
        """
        Offer ``order`` to the pool's drivers in turn until one accepts.

        Each candidate is wrapped in a ``Driver`` and asked via
        ``decide_acceptance`` with a fresh uniform random threshold drawn from
        ``np.random.random()``. On the first acceptance the driver is recorded
        in ``driver_record`` and ``update_driver_set`` is updated so that the
        driver is located at the order's drop-off point with the order's
        ``complete_time`` added to the accumulated work time.

        Returns:
            The result of the last ``decide_acceptance`` call: truthy when a
            driver accepted, falsy when every driver declined. ``False`` when
            the pool is empty.
        """
        driver_pool = self.get_driver_pool(order)
        # Bound up front so that an empty pool returns False instead of
        # raising UnboundLocalError at the final return.
        accept_order = False
        if driver_pool.empty:
            print("Driver pool is empty for this order.")

        for _, driver_info in driver_pool.iterrows():
            driver_id = driver_info["driver_id"]
            work_time_minutes = driver_info["work_time_minutes"]
            driver = Driver(
                driver_id=driver_id,
                current_lat=driver_info["driver_lat"],
                current_lon=driver_info["driver_lon"],
                current_area=driver_info["driver_area"],
                work_time_minutes=work_time_minutes,
                model=self.model,
            )

            accept_order = driver.decide_acceptance(
                order=order,
                schedule_data=self.schedule_data,
                threshold=np.random.random(),
            )
            if not accept_order:
                continue

            self.driver_record.add_driver_record(order=order, driver=driver)
            print("Order has been accepted, stop driver attampt")

            # After the delivery the driver is at the drop-off point, so the
            # position AND the area both come from the drop-off side of the
            # order (the area used to be taken from the pickup side, which
            # contradicted the stored coordinates).
            accepting_driver_data = pd.DataFrame(
                [
                    {
                        "driver_id": driver_id,
                        "driver_lat": order.dropoff_lat,
                        "driver_lon": order.dropoff_lon,
                        "driver_area": order.dropoff_area,
                        # Accumulated work time grows by the order's duration.
                        "work_time_minutes": work_time_minutes
                        + order.complete_time,
                    }
                ]
            )

            # Keep exactly one (the newest) row per driver. The empty case is
            # handled separately to avoid concatenating with an empty frame.
            if self.update_driver_set.empty:
                self.update_driver_set = accepting_driver_data
            else:
                self.update_driver_set = pd.concat(
                    [self.update_driver_set, accepting_driver_data],
                    ignore_index=True,
                ).drop_duplicates(subset=["driver_id"], keep="last")
            print(f"Updated update_driver_set with driver_id: {driver_id}")
            break

        return accept_order

    # Correctly spelled alias; the original (misspelled) name stays available
    # for existing callers.
    get_driver_attempt = get_driver_attampt

In [5]:
# Build the manager from the frames, model and record object created above.
# "random" is passed explicitly (the class default is "distance"), so the
# candidate drivers of each order are tried in shuffled order.
driver_manager = DriverManager(
    order_driver_data=order_driver_data,
    driver_data=driver_data,
    schedule_data=schedule_data,
    acceptance_model=model,
    driver_record=driver_record,
    sort_driver_pool_policy="random",
)

The Driver Pool is sorted by random


In [6]:
# Columns that must all be present (non-missing) before an Order can be built.
required_order_columns = [
    "order_id",
    "datetime",
    "pickup_area2",
    "dropoff_area2",
    "pickup_lat",
    "pickup_lon",
    "dropoff_lat",
    "dropoff_lon",
    "customer_price",
    "complete_time",
]
# Commission share passed to every Order.
commission_percent = 0.20
# The preview stops after the row carrying this index label.
last_preview_index = 10

# The loop variable is called `row` so that `order` only ever refers to an
# Order instance; later cells rely on `order` being the last Order built here.
for index, row in order_data.iterrows():
    if row[required_order_columns].isna().any():
        print()
        print(f"--- Skipping Order (Index: {index}) due to missing values ---")
    else:
        order = Order(
            order_id=row["order_id"],
            datetime_str=row["datetime"],
            pickup_area=row["pickup_area2"],
            dropoff_area=row["dropoff_area2"],
            pickup_lat=row["pickup_lat"],
            pickup_lon=row["pickup_lon"],
            dropoff_lat=row["dropoff_lat"],
            dropoff_lon=row["dropoff_lon"],
            customer_price=row["customer_price"],
            commissionPercent=commission_percent,
            complete_time=row["complete_time"],
            weather_service=weather_service,
        )
        print()
        print(f"--- Order Details (Index: {index}) ---")
        print(order)

    # Checked for skipped rows as well; otherwise a row with missing values at
    # the cut-off index would let the loop run over the whole dataset.
    if index == last_preview_index:
        break


--- Skipping Order (Index: 0) due to missing values ---

--- Order Details (Index: 1) ---
Order(
    order_id=4863452,
    datetime=2025-04-07 08:08:52,
    pickup_area=598.0,
    dropoff_area=328.0,
    pickup_lat=32.6959297,
    pickup_lon=51.7367204,
    dropoff_lat=32.6326779,
    dropoff_lon=51.6529232,
    customer_price=96000.00,
    commissionPercent=0.20,
    driver_commission=19200.00,
    platform_revenue=76800.00,
    hour_of_day=8
    weather_code=0.0
    complete_time=31.600000000000005
)

--- Order Details (Index: 2) ---
Order(
    order_id=4863453,
    datetime=2025-04-07 08:10:17,
    pickup_area=396.0,
    dropoff_area=595.0,
    pickup_lat=32.651796,
    pickup_lon=51.6078153,
    dropoff_lat=32.69834,
    dropoff_lon=51.7077533,
    customer_price=90000.00,
    commissionPercent=0.20,
    driver_commission=18000.00,
    platform_revenue=72000.00,
    hour_of_day=8
    weather_code=0.0
    complete_time=21.266666666666666
)

--- Order Details (Index: 3) ---
Order(
 

In [7]:
# Display the `order` variable left behind by the preview loop above; the
# cells below use it as their sample order.
order

Order(
    order_id=4863461,
    datetime=2025-04-07 08:38:56,
    pickup_area=522.0,
    dropoff_area=522.0,
    pickup_lat=32.6807386,
    pickup_lon=51.697661,
    dropoff_lat=32.6833418,
    dropoff_lon=51.6969515,
    customer_price=38000.00,
    commissionPercent=0.20,
    driver_commission=7600.00,
    platform_revenue=30400.00,
    hour_of_day=8
    weather_code=0.0
    complete_time=11.45
)

In [8]:
# Preview the candidate pool for the sample order. get_driver_pool does not
# modify update_driver_set; with the "random" policy the row order differs
# from run to run.
driver_manager.get_driver_pool(order=order)

No new drivers from update_driver_set with matching 'driver_area' ('522.0') to add.
update_driver_set has no matched driver ID for existing drivers in the pool, no update from it.
Driver pool has been randomized.


Unnamed: 0,driver_id,driver_lat,driver_lon,driver_area,work_time_minutes
0,7790,32.677754,51.682033,520,0.0
1,4899,32.680719,51.697595,522,0.0
2,15035,32.698978,51.701583,594,0.0
3,20135,32.663718,51.703288,443,0.0
4,20746,32.689681,51.680326,558,0.0
5,19847,32.690875,51.721958,562,0.0
6,20046,32.663442,51.702553,443,0.0
7,11902,32.693494,51.682626,558,4.35
8,19281,32.696688,51.706058,595,28.383333


In [13]:
# Offer the sample order to the pool until a driver accepts. On acceptance this
# mutates driver_manager.update_driver_set and driver_record, so re-running the
# cell changes what later calls see.
# NOTE(review): the execution count jumps from 8 to 13 and the recorded output
# already lists drivers updated from update_driver_set, so it was produced after
# earlier runs of this cell; a fresh "Restart & Run All" will not reproduce it.
driver_manager.get_driver_attampt(order=order)

No new drivers from update_driver_set with matching 'driver_area' ('522.0') to add.
Using data in update_driver_set to update the following drivers in the pool (original and new area-matched): [4899.0, 15035.0, 19847.0, 20046.0]
Driver pool has been randomized.
Driver 19281.0 is initialized with location (32.6966883, 51.7060581)
Driver 19281.0 can work at 2025-04-07 08:00.
The distance calculated by geodesic is 1936.1804859773672
Features input to the model for prediction:
{'commission': [7600.0], 'driver_distance': [1936.1804859773672], 'hour': [8], 'weather_code': [0.0], 'work_time_minutes': [np.float64(28.38333333)]}
Driver 19281.0 accept the order with probability of 0.35 and threshold 0.0800557275742032
Add driver record: Driver 19281.0 has accepted 4863461 at 2025-04-07 08:38:56
Order has been accepted, stop driver attampt
Updated update_driver_set with driver_id: 19281.0


True