In [1]:
# install & Import Libraries
!pip install -q xgboost scikit-learn

import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

# Seed NumPy's global RNG. The np.random.* draws used later in this notebook
# to synthesise vehicle labels consume this state, so results depend on the
# cells being executed once, top to bottom, after this line.
np.random.seed(42)

In [2]:
# mount the drive
# Attach Google Drive at /content/drive; the dataset path read below and the
# model/scaler paths written at the end of the notebook are under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load Daily Labor Dataset
# Read the CSV, parse the timestamp column, and put rows in chronological
# order with a fresh 0..n-1 index (the later split is done without shuffling).
df = (
    pd.read_csv("/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/data/synthetic_daily_labor_dataset_REAL_WEATHER.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .sort_values("datetime")
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,datetime,year,month,day,area_ha,predicted_yield_kg_per_ha,daily_harvest_kg,temp,feelslike,humidity,precip,severerisk,productivity_index,pickers_needed,harvesters_needed,loaders_needed
0,2023-01-01,2023,1,1,2.49816,1768.623014,142.9,70.0,70.1,76.1,0.0,,0.85,2,2,1
1,2023-01-02,2023,1,2,4.802857,1565.066975,245.7,70.2,70.4,79.3,0.0,,0.85,3,3,1
2,2023-01-03,2023,1,3,3.927976,1691.415889,191.7,69.5,69.7,84.1,0.024,,0.85,3,2,1
3,2023-01-04,2023,1,4,3.394634,1802.733817,228.0,70.4,70.5,82.3,0.016,,0.85,3,3,1
4,2023-01-05,2023,1,5,1.624075,1952.315863,113.9,72.5,72.9,82.8,0.0,,0.85,2,2,1


In [4]:
# Transportation Capacity Assumptions
# Load each vehicle type is assumed to carry, in kg. Key order is
# tractor -> ape -> truck and is relied on when iterating the tables.
VEHICLE_CAPACITY_KG = dict(tractor=1500, ape=800, truck=3000)

# Fraction of a day's harvest assigned to each vehicle type.
VEHICLE_DISTRIBUTION = dict(tractor=0.4, ape=0.35, truck=0.25)

In [5]:
# Generate Ground-Truth Vehicle Requirements
def estimate_vehicle_need_realistic(row):
    """Synthesise the number of vehicles of each type needed for one day.

    The day's harvest (``row["daily_harvest_kg"]``) is divided between the
    vehicle types listed in ``VEHICLE_DISTRIBUTION``. Each share is jittered
    by a random allocation factor, divided by that type's capacity from
    ``VEHICLE_CAPACITY_KG``, inflated by a random inefficiency factor, and
    rounded up to a whole vehicle.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"daily_harvest_kg"`` (e.g. a DataFrame row).

    Returns
    -------
    pd.Series
        Vehicle counts indexed by vehicle type, in ``VEHICLE_DISTRIBUTION``
        order.

    Notes
    -----
    Uses NumPy's global RNG: two uniform draws per vehicle type, in the
    order allocation noise then inefficiency.
    """
    total_kg = row["daily_harvest_kg"]
    counts = {}

    for kind in VEHICLE_DISTRIBUTION:
        share = VEHICLE_DISTRIBUTION[kind]
        capacity_kg = VEHICLE_CAPACITY_KG[kind]

        # Draw order is part of the behaviour: allocation jitter first,
        # then the inefficiency multiplier.
        jitter = np.random.uniform(0.85, 1.15)
        slack = np.random.uniform(1.05, 1.30)

        share_kg = total_kg * share * jitter
        whole_vehicles = math.ceil((share_kg / capacity_kg) * slack)
        counts[kind] = max(0, whole_vehicles)

    return pd.Series(counts)

# Run the estimator once per row (one row per day), then copy each vehicle
# type's count into its own target column.
vehicle_targets = df.apply(estimate_vehicle_need_realistic, axis=1)

for vehicle_kind, target_col in (
    ("tractor", "tractors_needed"),
    ("ape", "apes_needed"),
    ("truck", "trucks_needed"),
):
    df[target_col] = vehicle_targets[vehicle_kind]

In [6]:
# Weather & Road Impact Adjustments
def transport_penalty(row):
    """Return a multiplicative transport penalty (>= 1.0) for one day.

    Starts from 1.0 and multiplies in one random factor for every rule that
    fires:

    * ``row["precip"] > 20``      -> uniform factor in [1.15, 1.35)
    * ``row["severerisk"] == 1``  -> uniform factor in [1.30, 1.60)

    A missing (NaN) value satisfies neither comparison, so it adds no penalty.
    Uses NumPy's global RNG; a draw happens only when a rule fires.

    NOTE(review): the sample rows displayed after loading show ``severerisk``
    as missing and ``precip`` values far below 20. Confirm the units/scale of
    both columns, otherwise these rules may rarely or never fire.
    """
    # (column, trigger, low, high) - evaluated in this order.
    rules = (
        ("precip", lambda value: value > 20, 1.15, 1.35),
        ("severerisk", lambda value: value == 1, 1.30, 1.60),
    )

    factor = 1.0
    for column, triggered, low, high in rules:
        if triggered(row[column]):
            factor *= np.random.uniform(low, high)

    return factor

# One penalty multiplier per row, kept as its own column.
df["transport_penalty"] = df.apply(transport_penalty, axis=1)

# Inflate each vehicle count by the row's penalty and round up to whole
# vehicles. Re-running these lines applies the penalty again on top of the
# already-penalised counts.
for vehicle_col in ("tractors_needed", "apes_needed", "trucks_needed"):
    scaled = df[vehicle_col] * df["transport_penalty"]
    df[vehicle_col] = np.ceil(scaled).astype(int)

# Irreducible Label Noise
def add_label_noise(series, noise_level=0.15):
    """Add zero-mean Gaussian noise to a count series and return integer counts.

    The noise standard deviation is ``noise_level * (series.mean() + 1)``.
    After adding the noise the values are rounded, floored at 0 and cast to
    int.

    Parameters
    ----------
    series : pd.Series
        Counts to perturb.
    noise_level : float, default 0.15
        Scale factor for the noise standard deviation.

    Returns
    -------
    Same container type as ``series``, holding non-negative integers.

    Notes
    -----
    Draws ``len(series)`` values from NumPy's global RNG.
    """
    sigma = noise_level * (series.mean() + 1)
    jitter = np.random.normal(loc=0, scale=sigma, size=len(series))

    rounded = np.round(series + jitter)
    non_negative = np.clip(rounded, 0, None)
    return non_negative.astype(int)

# Perturb the three targets in a fixed order (tractors, apes, trucks) so the
# global RNG is consumed in the same sequence on every run.
for noisy_col in ("tractors_needed", "apes_needed", "trucks_needed"):
    df[noisy_col] = add_label_noise(df[noisy_col])

# Real-World Vehicle Availability Constraints
# Upper bound on each vehicle count; labels are clamped to [0, max].
MAX_VEHICLES = dict(tractors_needed=6, apes_needed=8, trucks_needed=4)

for col, max_val in MAX_VEHICLES.items():
    df[col] = df[col].clip(lower=0, upper=max_val)

# Regime Shift
# Binary indicator: 1 for rows dated 2025-01-01 or later, 0 otherwise.
df["regime_shift"] = (df["datetime"] >= "2025-01-01").astype("int64")

# Feature / Target Selection
# Columns the model predicts.
TARGETS = ["tractors_needed", "apes_needed", "trucks_needed"]

# Model inputs. daily_harvest_kg, which the labels were generated from,
# is not among them.
FEATURES = [
    "area_ha", "predicted_yield_kg_per_ha",
    "temp", "feelslike", "humidity", "precip", "severerisk",
    "productivity_index",
    "month", "regime_shift",
]

X, y = df[FEATURES], df[TARGETS]

# Train / Validation Split
# shuffle=False preserves row order, so the validation set is the final
# block of rows (df was sorted by datetime when it was loaded).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# Feature Scaling
# Scaling statistics come from the training rows only and are then reused
# unchanged on the validation rows.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# AI Model (Multi-Output XGBoost)
# Hyper-parameters gathered in one place so they are easy to tune.
xgb_params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)
base_model = XGBRegressor(**xgb_params)

# Wrap the single-output regressor so it can be fitted against all of the
# target columns in y_train.
model = MultiOutputRegressor(base_model)

# model train
model.fit(X_train_scaled, y_train)


In [8]:
# Model Evaluation
import numpy as np

def within_one_accuracy(y_true, y_pred):
    """Fraction of predictions whose absolute error is at most one vehicle."""
    abs_error = np.abs(y_true - y_pred)
    close_enough = abs_error <= 1
    return np.mean(close_enough)

def underestimation_rate(y_true, y_pred):
    """Fraction of predictions that fall strictly below the true value."""
    too_low = y_pred < y_true
    return np.mean(too_low)

# The metrics below need validation-set predictions, but no visible cell
# assigns `y_pred`. Compute it here so this cell works after a kernel
# restart instead of relying on a variable left over from an earlier session.
# The result is a 2-D array with one column per entry in TARGETS.
y_pred = model.predict(X_val_scaled)

print("\nOperational Evaluation Metrics\n")

for i, target in enumerate(TARGETS):
    # Ground truth for this vehicle type, and the prediction rounded to a
    # whole number of vehicles.
    y_true = y_val.iloc[:, i].values
    y_hat = np.round(y_pred[:, i])

    acc = within_one_accuracy(y_true, y_hat)
    under_rate = underestimation_rate(y_true, y_hat)

    print(f"{target.upper()}")
    print(f"  Within ±1 vehicle accuracy : {acc*100:.1f}%")
    print(f"  Underestimation risk        : {under_rate*100:.1f}%")


Operational Evaluation Metrics

TRACTORS_NEEDED
  Within ±1 vehicle accuracy : 100.0%
  Underestimation risk        : 9.1%
APES_NEEDED
  Within ±1 vehicle accuracy : 100.0%
  Underestimation risk        : 5.0%
TRUCKS_NEEDED
  Within ±1 vehicle accuracy : 98.3%
  Underestimation risk        : 1.7%


In [9]:
# Aggregation
# Validation rows are everything after the training rows. Their target
# columns are replaced with the model's predictions rounded to whole vehicles,
# so df_val holds predicted (not generated) counts.
val_start = len(X_train)
df_val = df.iloc[val_start:].copy()
df_val[TARGETS] = np.round(y_pred).astype(int)

In [10]:
# weekly
# Sum the predicted vehicle counts into calendar-week buckets keyed on datetime.
weekly_transport = df_val.set_index("datetime")[TARGETS].resample("W").sum()
weekly_transport.head()

Unnamed: 0_level_0,tractors_needed,apes_needed,trucks_needed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2026-02-01,2,2,2
2026-02-08,7,7,7
2026-02-15,7,7,8
2026-02-22,7,7,8
2026-03-01,6,6,8


In [11]:
# monthly
# Sum the predicted vehicle counts into month-end buckets keyed on datetime.
# "ME" is the current month-end alias; the older "M" spelling is deprecated
# and emits a warning on the pandas version used here.
monthly_transport = df_val.resample("ME", on="datetime")[TARGETS].sum()
monthly_transport.head()

  monthly_transport = df_val.resample("M", on="datetime")[TARGETS].sum()


Unnamed: 0_level_0,tractors_needed,apes_needed,trucks_needed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2026-01-31,1,1,1
2026-02-28,28,28,32
2026-03-31,0,0,0
2026-04-30,0,0,0
2026-05-31,0,0,0


In [12]:
# Save Model & Scaler
import joblib

# Persist the fitted model and the matching feature scaler side by side;
# inference needs both, applied in the order scaler -> model.
artifacts = (
    (model, "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/transport/transport_demand_model.pkl"),
    (scaler, "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/transport/transport_feature_scaler.pkl"),
)
for fitted_object, destination in artifacts:
    joblib.dump(fitted_object, destination)

print("Transportation demand AI model saved")

Transportation demand AI model saved
