In [7]:
# Imports & Seed
import pandas as pd
import numpy as np
import math

np.random.seed(42)

In [2]:
# mount the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load REAL Weather Data
weather_1 = pd.read_csv("/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/data/weather/Kandy, Sri Lanka 2023-01-01 to 2024-12-31.csv")
weather_2 = pd.read_csv("/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/data/weather/Kandy, Sri Lanka 2025-01-01 to 2026-12-31.csv")

weather = pd.concat([weather_1, weather_2], ignore_index=True)

weather["datetime"] = pd.to_datetime(weather["datetime"])
weather["year"] = weather["datetime"].dt.year
weather["month"] = weather["datetime"].dt.month
weather["day"] = weather["datetime"].dt.day

weather = weather.sort_values("datetime").reset_index(drop=True)

weather.head()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,sunset,moonphase,conditions,description,stations,source,severerisk,year,month,day
0,"Kandy, Sri Lanka",2023-01-01,82.8,58.4,70.0,84.1,58.4,70.1,61.5,76.1,...,2023-01-01T18:02:01,0.3,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,1
1,"Kandy, Sri Lanka",2023-01-02,84.5,61.6,70.2,86.4,61.6,70.4,63.1,79.3,...,2023-01-02T18:02:32,0.34,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,2
2,"Kandy, Sri Lanka",2023-01-03,82.5,61.1,69.5,84.6,61.1,69.7,64.1,84.1,...,2023-01-03T18:03:03,0.37,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,3
3,"Kandy, Sri Lanka",2023-01-04,83.1,62.7,70.4,84.9,62.7,70.5,64.3,82.3,...,2023-01-04T18:03:34,0.41,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,4
4,"Kandy, Sri Lanka",2023-01-05,85.9,65.4,72.5,89.5,65.4,72.9,66.6,82.8,...,2023-01-05T18:04:05,0.44,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,5


In [9]:
# Fixed Operational Assumptions
# Base productivity (kg / worker / day)
BASE_PRODUCTIVITY = {
    "picking": 100,
    "harvesting": 120,
    "loading": 700
}

# Harvest season (Sri Lanka)
HARVEST_MONTHS = [10, 11, 12, 1, 2]

# Daily harvest fraction (only part of annual yield harvested per day)
HARVEST_WINDOW_DAYS = 30

In [10]:
# Weather - Productivity Index
def compute_productivity_index(row):
    index = 1.0

    # Rainfall impact
    if row["precip"] > 20:
        index *= 0.6
    elif row["precip"] > 5:
        index *= 0.85

    # Severe weather risk (stop work)
    if row["severerisk"] == 1:
        index = 0.0

    # Heat stress
    if row["feelslike"] > 32:
        index *= 0.85

    # High humidity fatigue
    if row["humidity"] > 85:
        index *= 0.9

    return round(max(index, 0.0), 2)


# Filter Harvest Days & Add Productivity Index
weather = weather[weather["month"].isin(HARVEST_MONTHS)].copy()

weather["productivity_index"] = weather.apply(
    compute_productivity_index, axis=1
)

weather.head()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,moonphase,conditions,description,stations,source,severerisk,year,month,day,productivity_index
0,"Kandy, Sri Lanka",2023-01-01,82.8,58.4,70.0,84.1,58.4,70.1,61.5,76.1,...,0.3,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,1,0.85
1,"Kandy, Sri Lanka",2023-01-02,84.5,61.6,70.2,86.4,61.6,70.4,63.1,79.3,...,0.34,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,2,0.85
2,"Kandy, Sri Lanka",2023-01-03,82.5,61.1,69.5,84.6,61.1,69.7,64.1,84.1,...,0.37,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,3,0.85
3,"Kandy, Sri Lanka",2023-01-04,83.1,62.7,70.4,84.9,62.7,70.5,64.3,82.3,...,0.41,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,4,0.85
4,"Kandy, Sri Lanka",2023-01-05,85.9,65.4,72.5,89.5,65.4,72.9,66.6,82.8,...,0.44,Partially cloudy,Partly cloudy throughout the day.,"43479099999,43486099999,43476099999,4344409999...",obs,,2023,1,5,0.85


In [11]:
# Generate Synthetic Farm & Yield Parameters
weather["area_ha"] = np.random.uniform(1.0, 5.0, len(weather))
weather["predicted_yield_kg_per_ha"] = np.random.uniform(1400, 2200, len(weather))

weather["annual_yield_kg"] = (
    weather["area_ha"] * weather["predicted_yield_kg_per_ha"]
)

# Daily Harvest Estimation
def estimate_daily_harvest(annual_yield):
    base = annual_yield / HARVEST_WINDOW_DAYS
    noise = np.random.uniform(0.8, 1.2)
    return base * noise

weather["daily_harvest_kg"] = weather["annual_yield_kg"].apply(
    estimate_daily_harvest
).round(1)

# AI Training Target Generation (Labor Counts)
def estimate_labor(row):
    labor = {}

    for task, base_cap in BASE_PRODUCTIVITY.items():
        effective_cap = base_cap * row["productivity_index"]

        if effective_cap == 0:
            labor[task] = 0
        else:
            labor[task] = math.ceil(
                row["daily_harvest_kg"] / effective_cap
            )

    return pd.Series(labor)

labor_targets = weather.apply(estimate_labor, axis=1)

weather["pickers_needed"] = labor_targets["picking"]
weather["harvesters_needed"] = labor_targets["harvesting"]
weather["loaders_needed"] = labor_targets["loading"]


In [12]:
# Final AI Dataset (Features + Targets)
labor_ai_df = weather[[
    "datetime",
    "year",
    "month",
    "day",
    "area_ha",
    "predicted_yield_kg_per_ha",
    "daily_harvest_kg",
    "temp",
    "feelslike",
    "humidity",
    "precip",
    "severerisk",
    "productivity_index",
    "pickers_needed",
    "harvesters_needed",
    "loaders_needed"
]].copy()

labor_ai_df.head()


Unnamed: 0,datetime,year,month,day,area_ha,predicted_yield_kg_per_ha,daily_harvest_kg,temp,feelslike,humidity,precip,severerisk,productivity_index,pickers_needed,harvesters_needed,loaders_needed
0,2023-01-01,2023,1,1,2.49816,1768.623014,142.9,70.0,70.1,76.1,0.0,,0.85,2,2,1
1,2023-01-02,2023,1,2,4.802857,1565.066975,245.7,70.2,70.4,79.3,0.0,,0.85,3,3,1
2,2023-01-03,2023,1,3,3.927976,1691.415889,191.7,69.5,69.7,84.1,0.024,,0.85,3,2,1
3,2023-01-04,2023,1,4,3.394634,1802.733817,228.0,70.4,70.5,82.3,0.016,,0.85,3,3,1
4,2023-01-05,2023,1,5,1.624075,1952.315863,113.9,72.5,72.9,82.8,0.0,,0.85,2,2,1


In [13]:
# Save Dataset
labor_ai_df.to_csv(
    "/content/drive/MyDrive/Coffee/Dev/Phase 1/C3/Labor/data/synthetic_daily_labor_dataset_REAL_WEATHER.csv",
    index=False
)

print("Daily labor dataset saved")

Daily labor dataset saved
