In [8]:
import pandas as pd
import numpy as np

---

## Weather Data

In [9]:
# Read the weather table (datetime + weather_code) with the pyarrow CSV parser.
weather_data_path = "./data/weather.csv"
weather_data = pd.read_csv(filepath_or_buffer=weather_data_path, engine="pyarrow")
weather_data

Unnamed: 0,datetime,weather_code
0,2025-04-07 08:07:35,0.0
1,2025-04-07 08:08:52,0.0
2,2025-04-07 08:10:17,0.0
3,2025-04-07 08:12:07,0.0
4,2025-04-07 08:15:35,0.0
...,...,...
27001,2025-04-21 22:50:12,1.0
27002,2025-04-21 22:51:45,3.0
27003,2025-04-21 23:23:14,3.0
27004,2025-04-21 23:54:09,3.0


---

## Driver Data

In [10]:
# Read the initial driver table; the frame is loaded but not displayed here.
driver_data_path = "./data/driver_initial.csv"
driver_data = pd.read_csv(filepath_or_buffer=driver_data_path, engine="pyarrow")

## Order Data

In [11]:
# Read the order table with the pyarrow CSV parser and show it as the cell output.
order_data_path = "./data/order.csv"
order_data = pd.read_csv(filepath_or_buffer=order_data_path, engine="pyarrow")
order_data

Unnamed: 0,order_id,driver_id,customer_id,status,user_rate,driver_rate,datetime,updated_at,driver_commission,route,...,dropoff_lon,pickup_lat,pickup_lon,pickup_area,dropoff_area,weather_code,lat2,lng2,dropoff_area2,pickup_area2
0,4863451,,109231,6,,,2025-04-07 08:07:35,2025-04-07 08:12:01,74800.0,,...,,,,,,0.0,,,,
1,4863452,21137.0,192867,5,,,2025-04-07 08:08:52,2025-04-07 08:31:56,81600.0,,...,51.652923,32.695930,51.736720,36.0,28.0,0.0,32.70,51.74,328.0,598.0
2,4863453,21577.0,62707,5,,,2025-04-07 08:10:17,2025-04-07 08:21:42,76500.0,,...,51.707753,32.651796,51.607815,35.0,36.0,0.0,32.65,51.61,595.0,396.0
3,4863454,1599.0,4174,5,,,2025-04-07 08:12:07,2025-04-07 09:15:15,52700.0,,...,51.723003,32.672692,51.657033,36.0,36.0,0.0,32.67,51.66,445.0,479.0
4,4863455,20416.0,82566,5,,,2025-04-07 08:15:35,2025-04-07 08:37:43,208250.0,,...,51.687026,32.653523,51.719105,36.0,36.0,0.0,32.65,51.72,442.0,407.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27001,4890452,,235410,6,,,2025-04-21 22:50:12,2025-04-21 22:50:12,987000.0,,...,,,,,,,,,,
27002,4890453,,235410,6,,,2025-04-21 22:51:45,2025-04-21 23:20:01,987000.0,,...,,,,,,3.0,,,,
27003,4890454,2443.0,77236,5,,,2025-04-21 23:23:14,2025-04-21 23:23:45,405000.0,,...,51.659327,32.679519,51.659273,36.0,36.0,3.0,32.68,51.66,518.0,518.0
27004,4890455,14432.0,176262,5,,,2025-04-21 23:54:09,2025-04-21 23:54:27,450000.0,,...,,,,,,3.0,,,,


## Order-Driver Data

In [12]:
# Read the order-driver table; the frame is loaded but not displayed here.
order_driver_data_path = "./data/order_driver.csv"
order_driver_data = pd.read_csv(
    filepath_or_buffer=order_driver_data_path,
    engine="pyarrow",
)

In [13]:
# Read the second driver-update table; the frame is loaded but not displayed here.
driver_update_2_path = "./data/driver_update2.csv"
driver_update_2_data = pd.read_csv(
    filepath_or_buffer=driver_update_2_path,
    engine="pyarrow",
)

In [None]:
# Show the frame read from ./data/driver_update2.csv as the cell output.
driver_update_2_data

---

## Distance Validation Data

In [None]:
# Keep only the order-driver columns needed to cross-check the recorded
# distances: identifiers, the two recorded distance fields, the accept flag,
# and the driver's coordinates.
dist_val_columns = [
    "id",
    "order_id",
    "driver_id",
    "driver_distance",
    "distance",
    "accept",
    "driver_lat",
    "driver_lon",
]
dist_val_data = order_driver_data[dist_val_columns]

In [None]:
# Attach each order's pickup/dropoff coordinates to the order-driver rows.
order_coord_columns = ["pickup_lat", "pickup_lon", "dropoff_lat", "dropoff_lon"]

dist_val_data = pd.merge(
    # Drop coordinate columns left over from a previous run of this cell so that
    # re-executing it does not produce `pickup_lat_x` / `pickup_lat_y` duplicates
    # (which would break the distance computation that follows).
    dist_val_data.drop(columns=order_coord_columns, errors="ignore"),
    order_data[["order_id", *order_coord_columns]],
    on="order_id",
    how="left",
    # Each order-driver row must match at most one order; fail loudly instead of
    # silently multiplying rows if `order_id` is ever duplicated in `order_data`.
    validate="many_to_one",
)

In [None]:
EARTH_RADIUS_M = 6371008.8  # mean Earth radius, in metres


def haversine_distance_m(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in metres between two points.

    All arguments are latitudes/longitudes in decimal degrees and may be
    scalars, NumPy arrays or pandas Series (element-wise, NaN propagates).

    The previous planar formula ``sqrt(dlat**2 + dlon**2) * 111 km`` treated one
    degree of longitude as long as one degree of latitude. Away from the equator
    that overstates east-west distances by a factor of ``1 / cos(latitude)``;
    the haversine formula accounts for this.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_chord_sq = (
        np.sin((lat2 - lat1) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    )
    # Clamp to 1.0 so floating-point round-off can never push arcsin out of domain.
    return 2.0 * EARTH_RADIUS_M * np.arcsin(np.sqrt(np.minimum(half_chord_sq, 1.0)))


# Straight-line distance from the driver's position to the pickup point.
dist_val_data["driver_to_pickup_distance"] = haversine_distance_m(
    dist_val_data["driver_lat"],
    dist_val_data["driver_lon"],
    dist_val_data["pickup_lat"],
    dist_val_data["pickup_lon"],
)

# Straight-line distance from the pickup point to the dropoff point.
dist_val_data["pickup_to_dropoff_distance"] = haversine_distance_m(
    dist_val_data["pickup_lat"],
    dist_val_data["pickup_lon"],
    dist_val_data["dropoff_lat"],
    dist_val_data["dropoff_lon"],
)

# Straight-line distance from the driver's position directly to the dropoff point.
dist_val_data["driver_to_dropoff_distance"] = haversine_distance_m(
    dist_val_data["driver_lat"],
    dist_val_data["driver_lon"],
    dist_val_data["dropoff_lat"],
    dist_val_data["dropoff_lon"],
)

# Full trip for the driver: approach leg plus delivery leg.
dist_val_data["total_driver_trip_distance"] = (
    dist_val_data["driver_to_pickup_distance"]
    + dist_val_data["pickup_to_dropoff_distance"]
)

In [None]:
# Side-by-side view of the distances recorded in the order-driver table
# (`driver_distance`, `distance`) and the ones computed in this notebook.
dist_val_report_columns = [
    "id",
    "order_id",
    "driver_id",
    "driver_distance",
    "driver_to_pickup_distance",
    "distance",
    "driver_to_dropoff_distance",
    "total_driver_trip_distance",
]
dist_val_data[dist_val_report_columns]

In [None]:
# List every column name of the order table.
order_data.columns

In [None]:
# View each order's identifiers next to its accepted/started/completed timestamps.
order_data[
    [
        "order_id",
        "driver_id",
        "accepted_at",
        "started_at",
        "completed_at",
    ]
]

---

## Drivers handling more than one order at a time

In [None]:
# Ensure 'started_at' and 'completed_at' are datetime objects so they can be compared.
order_data["started_at"] = pd.to_datetime(order_data["started_at"])
order_data["completed_at"] = pd.to_datetime(order_data["completed_at"])

# Sort each driver's orders chronologically by start time.
sorted_orders = order_data.sort_values(by=["driver_id", "started_at"])

# Explicit schema so the result keeps its columns even when no overlap is found.
overlap_columns = [
    "driver_id",
    "order_1_id",
    "order_1_started_at",
    "order_1_completed_at",
    "order_2_id",
    "order_2_started_at",
    "order_2_completed_at",
]

# One record per order that starts while an earlier order of the same driver is
# still running.
overlapping_orders_list = []

# groupby skips rows whose driver_id is missing, so unassigned orders are ignored.
for driver_id, driver_df in sorted_orders.groupby("driver_id"):
    # Earlier order of this driver with the latest completion time seen so far.
    # Comparing only with the immediately preceding order would miss a long order
    # that spans several later ones (A 8:00-10:00, B 8:10-8:20, C 8:30-8:40:
    # C overlaps A but not B).
    latest_order = None
    trips = driver_df[["order_id", "started_at", "completed_at"]]

    for order in trips.itertuples(index=False):
        # Comparisons involving NaT evaluate to False, so orders with a missing
        # start time are never reported.
        if latest_order is not None and order.started_at < latest_order.completed_at:
            overlapping_orders_list.append(
                {
                    "driver_id": driver_id,
                    "order_1_id": latest_order.order_id,
                    "order_1_started_at": latest_order.started_at,
                    "order_1_completed_at": latest_order.completed_at,
                    "order_2_id": order.order_id,
                    "order_2_started_at": order.started_at,
                    "order_2_completed_at": order.completed_at,
                }
            )

        # Only orders with a known completion time can become the reference order.
        if pd.notna(order.completed_at) and (
            latest_order is None or order.completed_at > latest_order.completed_at
        ):
            latest_order = order

# Convert the list of overlapping orders into a DataFrame.
overlapping_df = pd.DataFrame(overlapping_orders_list, columns=overlap_columns)

if not overlapping_df.empty:
    print("Drivers with overlapping order completion times:")
    overlapping_ids = overlapping_df["driver_id"].astype(int).unique().tolist()
    print(f"The IDs of drivers with overlapping orders are: {overlapping_ids}")
else:
    print("No drivers found completing multiple orders in overlapping time periods.")


overlapping_df

In [None]:
# (rows, columns) of the order table.
order_data.shape

---

## Driver Update Data

In [None]:
# Read the driver-update table with the same pyarrow parser as every other CSV
# in this notebook, so all frames are parsed consistently.
# NOTE(review): this is the same file already loaded as `driver_update_2_data`
# earlier in the notebook — consider keeping a single variable.
driver_update2_data_path = "./data/driver_update2.csv"
driver_update2_data = pd.read_csv(driver_update2_data_path, engine="pyarrow")
driver_update2_data

In [None]:
# Number of distinct `driver_area` values; a missing value counts as its own entry.
driver_update2_data["driver_area"].nunique(dropna=False)