In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/lade_cleaned.csv")

In [3]:
print("Raw shape :", df.shape)
df.head()

Raw shape : (10000, 13)


Unnamed: 0,order_id,from_dipan_id,from_city_name,delivery_user_id,poi_lng,poi_lat,aoi_id,typecode,receipt_time,receipt_lng,receipt_lat,sign_time,ds
0,698874e11b0d210e4586274114288778,4126aff79b11158c3181cd02b5747016,杭州市,240a284909e577d08d49f5f671430187,10423520.0,-7594308.0,582d47eefe21ee13d2361d073f4e1d77,203ac3454d75e02ebb0a3c6f51d735e4,03-18 17:08:00,10425470.0,-7593416.0,03-18 18:42:00,318
1,dfbe435bab2c490251c714b463d2700d,32ee03c02fa79c1d53e8fe01156c2863,杭州市,811d6958f99eb0de70d2e97d31fb63ba,10414020.0,-7589769.0,d0f685dd6c6ced88011bbfd8a6482a00,fe76dff35bb199cdb7329eba2b918f18,03-18 16:56:00,10413470.0,-7591632.0,03-18 17:16:00,318
2,295aa7cd77ed29713e900d5d9ee843ba,30330a823baf6442fe3a3efa3f10aebc,上海市,de542215e326c9394163b4cda8c2f429,10563840.0,-7475921.0,5e577b097ec1a16bbc48f8a90fe00a1f,203ac3454d75e02ebb0a3c6f51d735e4,03-18 13:18:00,10562870.0,-7475695.0,03-18 15:03:00,318
3,f0c2180b28329574e3eaedb01316ae0f,fe4b0880307873de265970ba5d0128d2,重庆市,1afc0138069d35206aa90ce8f42107a4,8908732.0,-7688857.0,3baac3a5c7959d94917578e7646e1a5d,203ac3454d75e02ebb0a3c6f51d735e4,03-18 15:04:00,8909401.0,-7692811.0,03-18 16:12:00,318
4,476e2aec3fa5afe9716b583dc6cb5cdc,0d9b8762d4530e2945c402ddd7c0a79e,上海市,2dc3fd756fa5db1e769cf6d32ef958a3,10547660.0,-7468834.0,2bd17d1629f490497557dd3933fffc2d,fe76dff35bb199cdb7329eba2b918f18,03-18 15:38:00,10547610.0,-7471315.0,03-18 17:06:00,318


| Raw Field | Description |
|----------|-------------|
| **order_id** | Unique identifier for the delivery order. |
| **from_dipan_id** | ID of the merchant/store (origin). |
| **from_city_name** | City where the delivery takes place. |
| **delivery_user_id** | Unique identifier of the courier assigned to the order. |
| **poi_lng**, **poi_lat** | Projected-coordinate pickup location. |
| **aoi_id** | Area-of-Interest (neighborhood/zone) identifier. |
| **typecode** | Encoded category representing merchant/store type. |
| **receipt_time** | Timestamp when the order was received/accepted. |
| **receipt_lng**, **receipt_lat** | Projected-coordinate dropoff/delivery location. |
| **sign_time** | Timestamp when the courier completed the delivery. |
| **ds** | Date stamp for the order (day or dataset partition). |

In [4]:
# Parse the two time columns
df["receipt_time"] = pd.to_datetime("1900-" + df["receipt_time"], format="%Y-%m-%d %H:%M:%S")
df["sign_time"]    = pd.to_datetime("1900-" + df["sign_time"],    format="%Y-%m-%d %H:%M:%S")

# Trip duration in seconds
df["duration_sec"] = (df["sign_time"] - df["receipt_time"]).dt.total_seconds()
df["duration_sec"].describe()

count     10000.000000
mean      10228.218000
std       25030.470282
min           0.000000
25%        3120.000000
50%        5520.000000
75%        9840.000000
max      982620.000000
Name: duration_sec, dtype: float64

In [5]:
# Late flag
LATE_THRESHOLD = 9840 # based on historical 75th percentile of delivery durations in seconds
df["is_late"] = (df["duration_sec"] > LATE_THRESHOLD).astype(int)
print("Late-rate:", df["is_late"].mean())

Late-rate: 0.2488


In [6]:
# Distance in km (Euclidean)
df["distance_km"] = np.sqrt(
    (df["poi_lng"] - df["receipt_lng"])**2 +
    (df["poi_lat"] - df["receipt_lat"])**2
) / 1000.0

# time of day 
df["receipt_hour"] = df["receipt_time"].dt.hour

# One-hot encoding 
df = pd.get_dummies(df, columns=["from_city_name", "typecode"], drop_first=True)

# Target-encode high-cardinality IDs
global_late = df["is_late"].mean()

def target_encode(col, min_samples=5):
    agg = df.groupby(col)["is_late"].agg(["mean", "count"])
    smooth = (agg["mean"]*agg["count"] + global_late*min_samples) / (agg["count"] + min_samples)
    return df[col].map(smooth).fillna(global_late)

df["rider_late"] = target_encode("delivery_user_id", min_samples=3)
df["aoi_late"]   = target_encode("aoi_id",         min_samples=5)
df["dipan_late"] = target_encode("from_dipan_id",  min_samples=3)

| **Feature** | **Description** |
|------------|-----------------|
| **distance_km** | Straight-line distance (km) between pickup and dropoff, computed from projected coordinates. |
| **receipt_hour** | Hour of day the order was received (0–23), capturing demand cycles and traffic patterns. |
| **rider_late** | Target-encoded historical lateness rate of the assigned courier (smoothed). |
| **aoi_late** | Target-encoded lateness rate for the delivery area (AOI). |
| **dipan_late** | Target-encoded lateness rate for the merchant/store. |
| **from_city_name_\*** | One-hot encoded indicators for each city present in the dataset. |
| **typecode_\*** | One-hot encoded indicators representing merchant/store type categories. |

In [7]:
# Additional time-based features
df["receipt_dow"]   = df["receipt_time"].dt.dayofweek
df["is_weekend"]    = df["receipt_dow"].isin([5, 6]).astype(int)
df["receipt_month"] = df["receipt_time"].dt.month

# lunch dinner time flags
df["is_lunch_peak"]  = df["receipt_hour"].between(11, 14).astype(int)
df["is_dinner_peak"] = df["receipt_hour"].between(17, 20).astype(int)
df["is_peak"] = df["is_lunch_peak"] | df["is_dinner_peak"]

# Interaction features
df["peak_x_distance"]  = df["is_peak"]  * df["distance_km"]

| New Feature | Description |
|---------|-------------|
| **receipt_dow** | Day of week the order was received (0 = Monday, 6 = Sunday). |
| **is_weekend** | 1 if the order was received on a weekend (Saturday or Sunday). |
| **receipt_month** | Month of the year the order was received. |
| **is_lunch_peak** | 1 if receipt time is during lunch peak (11:00–14:00). |
| **is_dinner_peak** | 1 if receipt time is during dinner peak (17:00–20:00). |
| **is_peak**       |  lunch/dinner peak delivery window. |
| **peak_x_distance** | Interaction feature: peak-hour flag × distance. |

In [8]:
print(df.columns.tolist())

['order_id', 'from_dipan_id', 'delivery_user_id', 'poi_lng', 'poi_lat', 'aoi_id', 'receipt_time', 'receipt_lng', 'receipt_lat', 'sign_time', 'ds', 'duration_sec', 'is_late', 'distance_km', 'receipt_hour', 'from_city_name_杭州市', 'from_city_name_重庆市', 'typecode_203ac3454d75e02ebb0a3c6f51d735e4', 'typecode_339d14e62a5bbd67de62f461a5f7db1e', 'typecode_37b081598a86e85e8887b0539ab61824', 'typecode_4602b38053ece07a9ca5153f1df2e404', 'typecode_592363aed428fd9addffa38da2724834', 'typecode_6771c4e2ecb275c95c43f6c639a2cbad', 'typecode_73ffcbd1b26557b462b14e4dd4c57fcb', 'typecode_7a0cb6a50445e24c12a407687b28fa06', 'typecode_84c7d46d654e5a8bd329a3e8ed0293ce', 'typecode_90b3cbb280f865d62b2fd2efdd0fb0f3', 'typecode_e83a6cefa7e4bde8a8af866f3f4e90eb', 'typecode_e8b508bbdada69046e4dd74ef59ee85a', 'typecode_fe76dff35bb199cdb7329eba2b918f18', 'rider_late', 'aoi_late', 'dipan_late', 'receipt_dow', 'is_weekend', 'receipt_month', 'is_lunch_peak', 'is_dinner_peak', 'is_peak', 'peak_x_distance']


In [9]:
# Columns not needed for modeling
cols_to_drop = [
    "order_id",
    "from_dipan_id",
    "delivery_user_id",
    "aoi_id",

    "poi_lng", "poi_lat",
    "receipt_lng", "receipt_lat",

    "receipt_time", "sign_time",

    "duration_sec",
    "ds",
    "is_late",
]

# Prepare feature matrix X and target vector y
feature_cols = [c for c in df.columns if c not in cols_to_drop]

X = df[feature_cols].copy()
y = df["is_late"].values

In [10]:
X.head()

Unnamed: 0,distance_km,receipt_hour,from_city_name_杭州市,from_city_name_重庆市,typecode_203ac3454d75e02ebb0a3c6f51d735e4,typecode_339d14e62a5bbd67de62f461a5f7db1e,typecode_37b081598a86e85e8887b0539ab61824,typecode_4602b38053ece07a9ca5153f1df2e404,typecode_592363aed428fd9addffa38da2724834,typecode_6771c4e2ecb275c95c43f6c639a2cbad,...,rider_late,aoi_late,dipan_late,receipt_dow,is_weekend,receipt_month,is_lunch_peak,is_dinner_peak,is_peak,peak_x_distance
0,2.14459,17,True,False,True,False,False,False,False,False,...,0.0622,0.138222,0.149256,6,1,3,0,1,1,2.14459
1,1.943352,16,True,False,False,False,False,False,False,False,...,0.145533,0.082933,0.072284,6,1,3,0,0,0,0.0
2,0.998927,13,False,False,True,False,False,False,False,False,...,0.053314,0.207333,0.011309,6,1,3,1,0,1,0.998927
3,4.010138,15,False,True,True,False,False,False,False,False,...,0.305156,0.1555,0.369013,6,1,3,0,0,0,0.0
4,2.481613,15,False,False,False,False,False,False,False,False,...,0.183093,0.320571,0.124962,6,1,3,0,0,0,0.0


In [11]:
print(y[:5])

[0 0 0 0 0]


In [12]:
X.to_csv("data/X_train.csv", index=False)
y = df["is_late"]
y.to_csv("data/y_train.csv", index=False, header=True)