In [1]:
# install & Import Libraries
!pip install -q xgboost scikit-learn

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [2]:
# Mount Google Drive (Colab only) so the dataset and model paths used by the
# cells below resolve under /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [3]:
# Load Labor Dataset
# Read the daily labor CSV from Drive and convert the "datetime" column to a
# real datetime dtype in the same expression (needed later for resampling).
df = pd.read_csv(
    "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/data/synthetic_daily_labor_dataset_REAL_WEATHER.csv"
).assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))

df.head()


Unnamed: 0,datetime,year,month,day,area_ha,predicted_yield_kg_per_ha,daily_harvest_kg,temp,feelslike,humidity,precip,severerisk,productivity_index,pickers_needed,harvesters_needed,loaders_needed
0,2023-01-01,2023,1,1,2.49816,1768.623014,142.9,70.0,70.1,76.1,0.0,,0.85,2,2,1
1,2023-01-02,2023,1,2,4.802857,1565.066975,245.7,70.2,70.4,79.3,0.0,,0.85,3,3,1
2,2023-01-03,2023,1,3,3.927976,1691.415889,191.7,69.5,69.7,84.1,0.024,,0.85,3,2,1
3,2023-01-04,2023,1,4,3.394634,1802.733817,228.0,70.4,70.5,82.3,0.016,,0.85,3,3,1
4,2023-01-05,2023,1,5,1.624075,1952.315863,113.9,72.5,72.9,82.8,0.0,,0.85,2,2,1


In [4]:
# Feature / Target Definition
# Columns the model predicts (one regressor output per entry, in this order).
TARGETS = ["pickers_needed", "harvesters_needed", "loaders_needed"]

# Columns the model is trained on.
FEATURES = [
    # field / harvest
    "area_ha", "predicted_yield_kg_per_ha", "daily_harvest_kg",
    # weather
    "temp", "feelslike", "humidity", "precip", "severerisk",
    # productivity and calendar
    "productivity_index", "month",
]

# Design matrix and target matrix; both keep df's row index.
X, y = df[FEATURES], df[TARGETS]

In [5]:
# Train / Validation Split
# shuffle=False keeps the rows in file order: the first 80% become the training
# set and the last 20% the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# Feature Scaling
# Fit the scaler on the training rows only, then apply the same transform to
# both splits so no validation statistics leak into training.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [6]:
# Build AI Model (Multi-Output XGBoost)
# Hyperparameters for the per-target gradient-boosted regressor.
xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
base_model = XGBRegressor(**xgb_params)

# MultiOutputRegressor fits one clone of base_model per target column.
model = MultiOutputRegressor(base_model)

# Train the AI Model
model.fit(X_train_scaled, y_train)

In [9]:
# Model Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict every target for the validation rows; column i is compared with
# column i of y_val below.
y_pred = model.predict(X_val_scaled)

for i, target in enumerate(TARGETS):
    actual, predicted = y_val.iloc[:, i], y_pred[:, i]

    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)  # RMSE is reported in the same unit as the target
    r2 = r2_score(actual, predicted)

    # One header line plus three metric lines per target.
    print(
        f"{target.upper()}",
        f"  MAE : {mae:.2f} workers",
        f"  RMSE: {rmse:.2f} workers",
        f"  R²  : {r2:.3f}",
        sep="\n",
    )

PICKERS_NEEDED
  MAE : 0.13 workers
  RMSE: 0.25 workers
  R²  : 0.950
HARVESTERS_NEEDED
  MAE : 0.11 workers
  RMSE: 0.23 workers
  R²  : 0.940
LOADERS_NEEDED
  MAE : 0.00 workers
  RMSE: 0.00 workers
  R²  : 1.000


In [10]:
# Daily -> Weekly / Monthly / Yearly Aggregation
# Take the validation rows through the split's own index instead of re-deriving
# the split boundary from len(X_train); this stays aligned with y_pred even if
# the split parameters change. (df has the default unique index from read_csv.)
df_val = df.loc[X_val.index].copy()

# NOTE: in this copy the target columns are overwritten with model predictions,
# so the aggregations below sum predicted (not actual) labor demand.
# Predictions are rounded to whole workers and clipped at 0, because a
# regression output can dip slightly below zero and a negative head count is
# meaningless.
df_val[TARGETS] = np.clip(np.round(y_pred), 0, None).astype(int)

In [11]:
# Weekly
# Sum the predicted daily head counts into weekly bins ("W" = weeks labelled by
# their ending Sunday).
weekly = (
    df_val
    .set_index("datetime")[TARGETS]
    .resample("W")
    .sum()
)
weekly.head()

Unnamed: 0_level_0,pickers_needed,harvesters_needed,loaders_needed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2026-02-01,6,5,2
2026-02-08,18,15,7
2026-02-15,21,17,7
2026-02-22,17,16,7
2026-03-01,20,17,6


In [12]:
# Monthly
# Sum the predicted daily head counts per calendar month, labelled by month end.
# "ME" replaces the "M" alias, which is deprecated since pandas 2.2 and emits a
# FutureWarning there (requires pandas >= 2.2).
monthly = df_val.resample("ME", on="datetime")[TARGETS].sum()
monthly.head()

  monthly = df_val.resample("M", on="datetime")[TARGETS].sum()


Unnamed: 0_level_0,pickers_needed,harvesters_needed,loaders_needed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2026-01-31,2,2,1
2026-02-28,80,68,28
2026-03-31,0,0,0
2026-04-30,0,0,0
2026-05-31,0,0,0


In [13]:
# Yearly
# Sum the predicted daily head counts per calendar year, labelled by year end.
# "YE" replaces the "Y" alias, which is deprecated since pandas 2.2 and emits a
# FutureWarning there (requires pandas >= 2.2).
yearly = df_val.resample("YE", on="datetime")[TARGETS].sum()
yearly.head()

  yearly = df_val.resample("Y", on="datetime")[TARGETS].sum()


Unnamed: 0_level_0,pickers_needed,harvesters_needed,loaders_needed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2026-12-31,330,283,121


In [14]:
# Save Trained Model & Scaler
import joblib

# The model was fitted on scaled features, so the fitted scaler is saved next to
# it; both are needed to score new data.
for artifact, destination in (
    (model, "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/labor_demand_model.pkl"),
    (scaler, "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/labor_feature_scaler.pkl"),
):
    joblib.dump(artifact, destination)

print("Labor AI model and scaler saved")


Labor AI model and scaler saved
