In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)

from imblearn.ensemble import BalancedRandomForestClassifier
import joblib

from geopy.distance import geodesic

In [2]:
# Load the four input tables with the pyarrow CSV parser in one pass.
# The targets on the left are bound in the same order as the paths below.
schedule_data, driver_data, order_data, order_driver_data = (
    pd.read_csv(csv_path, engine="pyarrow")
    for csv_path in (
        "./data/driver_schedule.csv",
        "./data/driver_update2.csv",
        "./data/order.csv",
        "./data/order_driver.csv",
    )
)

In [3]:
# Folder that holds the persisted acceptance model. The model path is derived
# from it so the directory created here is always the one the model is read
# from and written to (previously "./out" was created while "./Out/..." was
# used, which breaks on case-sensitive filesystems).
model_output_folder = "./Out"
os.makedirs(model_output_folder, exist_ok=True)
model_path = f"{model_output_folder}/acceptance_model.pkl"

# If a saved model exists it is loaded as-is; set this to True to force
# re-training even when a saved model is present.
retrain_model: bool = False

In [4]:
# Load the persisted acceptance model when one exists (and re-training was not
# requested); otherwise train a new one, report its metrics and persist it.
if os.path.exists(model_path) and not retrain_model:
    print(f"Model found at {model_path}. Loading model...")
    # NOTE(review): joblib.load unpickles arbitrary objects; only load model
    # files that come from a trusted source.
    model = joblib.load(model_path)
    print("Model loaded successfully.")
else:
    # Build the training set in its own variable. The global
    # `order_driver_data` must stay untouched: later cells consume it, and
    # overwriting it here made their input depend on whether the model was
    # loaded or re-trained.
    training_data = order_driver_data.loc[
        (order_driver_data["status"] == 5) & (order_driver_data["outside"] == 0)
    ]
    print(training_data.shape)
    print(training_data["accept"].describe())

    # Features and target variable
    order_features = training_data[
        ["commission", "driver_distance", "hour",
            "weather_code", "work_time_minutes"]
    ]
    print(order_features.head())
    acceptance_status = training_data["accept"]

    # Train-test split (fixed seed for reproducibility)
    features_train, features_test, target_train, target_test = train_test_split(
        order_features, acceptance_status, test_size=0.2, random_state=42
    )

    # Train model (BalancedRandomForest handles imbalance natively)
    model = BalancedRandomForestClassifier(random_state=42)
    model.fit(features_train, target_train)

    # Predictions on the held-out split: hard labels and positive-class scores
    y_pred = model.predict(features_test)
    y_probs = model.predict_proba(features_test)[:, 1]

    # Evaluate model performance
    print("Precision:", precision_score(target_test, y_pred))
    print("Recall:", recall_score(target_test, y_pred))
    print("F1 Score:", f1_score(target_test, y_pred))
    print("Classification Report:\n", classification_report(target_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(target_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(target_test, y_probs))

    # Persist the trained classifier to the same path the load branch reads.
    joblib.dump(model, model_path)

Model found at ./Out/acceptance_model.pkl. Loading model...
Model loaded successfully.


In [5]:
from models import WeatherService
from models import Order
from models import DriverManager


weather_service = WeatherService(weather_csv_path="./data/weather.csv")

# init driver_manager
driver_manager = DriverManager(
    order_driver_data=order_driver_data,
    driver_data=driver_data,
    schedule_data=schedule_data,
    acceptance_model=model,
)

# Last order index (inclusive) to simulate.
MAX_ORDER_INDEX = 1000

# Order columns that must all be present for an order to be simulated.
REQUIRED_ORDER_FIELDS = [
    "order_id",
    "datetime",
    "pickup_area2",
    "dropoff_area2",
    "pickup_lat",
    "pickup_lon",
    "dropoff_lat",
    "dropoff_lon",
    "customer_price",
    "complete_time",
]

# Ids of orders that have at least one recorded offer with a non-zero `accept`.
# Computed once up front instead of filtering the whole frame for every order.
# Orders with no recorded offers are deliberately absent from the set: the
# previous per-order `.max()` returned NaN for them, and NaN is truthy, so they
# were wrongly counted as accepted.
max_accept_by_order = order_driver_data.groupby("order_id")["accept"].max()
really_accepted_order_ids = set(
    max_accept_by_order.index[max_accept_by_order.fillna(0) != 0]
)

order_count = 0
accepted_order_count = 0
real_accpted_order_count = 0

for index, order_row in order_data.iterrows():
    # Stop condition is checked first so that a skipped row can never bypass it
    # (it used to sit after the `continue` below).
    if index > MAX_ORDER_INDEX:
        break

    # Skip to the next order if any required value is missing
    if order_row[REQUIRED_ORDER_FIELDS].isna().any():
        print()
        print(f"--- Skipping Order (Index: {index}) due to missing values ---")
        continue

    order = Order(
        order_id=order_row["order_id"],
        datetime_str=order_row["datetime"],
        pickup_area=order_row["pickup_area2"],
        dropoff_area=order_row["dropoff_area2"],
        pickup_lat=order_row["pickup_lat"],
        pickup_lon=order_row["pickup_lon"],
        dropoff_lat=order_row["dropoff_lat"],
        dropoff_lon=order_row["dropoff_lon"],
        customer_price=order_row["customer_price"],
        commissionPercent=0.20,
        complete_time=order_row["complete_time"],
        weather_service=weather_service
    )
    print()
    print(f"--- Order Details (Index: {index}) ---")
    print(order)

    order_count += 1

    # Simulated outcome, as decided by the driver manager
    accept_order = driver_manager.get_driver_attampt(order=order)
    if accept_order:
        accepted_order_count += 1

    # Recorded outcome from the order/driver data
    if order.order_id in really_accepted_order_ids:
        real_accpted_order_count += 1
    print("---------------------------------")

print(f"Order Count: {order_count}")
print(f"Accept Order Count: {accepted_order_count}")
print(f"Original Accept Order Count: {real_accpted_order_count}")


--- Skipping Order (Index: 0) due to missing values ---

--- Order Details (Index: 1) ---
Order(
    order_id=4863452,
    datetime=2025-04-07 08:08:52,
    pickup_area=598.0,
    dropoff_area=328.0,
    pickup_lat=32.6959297,
    pickup_lon=51.7367204,
    dropoff_lat=32.6326779,
    dropoff_lon=51.6529232,
    customer_price=96000.00,
    commissionPercent=0.20,
    driver_commission=19200.00,
    platform_revenue=76800.00,
    hour_of_day=8
    weather_code=0.0
    complete_time=31.600000000000005
)
No new drivers from update_driver_set with matching 'driver_area' ('598.0') to add.
update_driver_set has no matched driver ID for existing drivers in the pool, no update from it.
Driver pool has been randomized.
Driver 17771.0 is initialized with location (32.6827523, 51.7405087)
Driver 17771.0 is not scheduled to work at 2025-04-07 08:00.
Driver 19281.0 is initialized with location (32.6946307, 51.7129286)
Driver 19281.0 can work at 2025-04-07 08:00.
The distance calculated by geodesi

#### Fixed threshold for the acceptance model (threshold = 0.5)

* Order Count: 928
* Accept Order Count: 748

#### Random threshold for the acceptance model (threshold drawn from (0, 1))

* Order Count: 928
* Accept Order Count: 854

#### Original (100% accept)
* Original Accept Count: 928

In [6]:
# Acceptance rate with the fixed 0.5 threshold: 748 accepted out of 928 simulated orders.
748 / 928

0.8060344827586207

In [7]:
# Share of order rows that were simulated (928) rather than skipped.
# NOTE(review): the denominator is presumably the ~1000 rows visited before the
# loop stops at index 1000 (which is 1001 rows, indices 0..1000) — TODO confirm.
928 /1000

0.928

In [8]:
# Acceptance rate with the random threshold: 854 accepted out of 928 simulated orders.
854 / 928

0.9202586206896551