In [1]:
# Cell 0: Paths & Imports
import os
from pathlib import Path

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Root of the dengue dataset. Defaults to the original studio location, but can be
# redirected with the DENGUE_DATA_DIR environment variable so the notebook also
# runs on machines with a different directory layout.
DATA = Path(os.environ.get("DENGUE_DATA_DIR", "/teamspace/studios/this_studio/data/dengue"))
INTERIM = DATA / "interim"
FEATURES = DATA / "features"
REPORTS = DATA / "reports"
MODELS = DATA / "models"
# Create the output directories up front so later cells can write without checks.
for p in [FEATURES, REPORTS, MODELS]:
    p.mkdir(parents=True, exist_ok=True)

BASE_FE = FEATURES / "feature_matrix_weekly.parquet"
# Raise explicitly instead of asserting: `assert` is skipped under `python -O`.
if not BASE_FE.exists():
    raise FileNotFoundError(f"Missing {BASE_FE}")
# Untouched base feature matrix; later cells work on copies of it.
dm0 = pd.read_parquet(BASE_FE)


In [2]:
# Cell 1: Clean categorical keys + seasonal flags
# Work on a copy so the loaded base matrix `dm0` stays untouched.
dm = dm0.copy()
dm["week_start"] = pd.to_datetime(dm["week_start"])

# --- District key: reuse an existing text id, otherwise build "district|province".
if "district_id_txt_clean" not in dm.columns:
    if "district_id_txt" in dm.columns:
        dm["district_id_txt_clean"] = dm["district_id_txt"]
    else:
        dm["district_id_txt_clean"] = dm["district"].astype(str) + "|" + dm["province"].astype(str)
dm["district_id_txt_clean"] = dm["district_id_txt_clean"].astype(str)

# --- Province code: fall back to the part after the last "|" of the district key.
if "province_code" not in dm.columns:
    dm["province_code"] = dm["district_id_txt_clean"].str.split("|").str[-1]
dm["province_code"] = dm["province_code"].astype(str)

# --- Numeric district id plus a string version ("D<number>", missing -> "D-1").
dm["district_num"] = pd.to_numeric(dm.get("district_num", np.nan), errors="coerce")
dm["district_num_cat"] = (
    "D" + dm["district_num"].fillna(-1).astype(int).astype(str)
).astype(str)

# --- Calendar parts derived from the week start date.
dm["year"] = dm["week_start"].dt.year.astype(int)
dm["weekofyear"] = dm["week_start"].dt.isocalendar().week.astype(int)
dm["month"] = dm["week_start"].dt.month.astype(int)

# --- Cyclic encoding of the week number on a 52-week circle.
dm["sin_woy"] = np.sin(2*np.pi*dm["weekofyear"]/52.0)
dm["cos_woy"] = np.cos(2*np.pi*dm["weekofyear"]/52.0)

# --- Month-based 0/1 flags: months 5-10, months 5-9, and months 11-3.
dm["is_monsoon"] = ((dm["month"] >= 5) & (dm["month"] <= 10)).astype(int)
dm["is_term1"] = ((dm["month"] >= 5) & (dm["month"] <= 9)).astype(int)
dm["is_term2"] = dm["month"].isin([11,12,1,2,3]).astype(int)


In [3]:
# Preview the cleaned frame; the rendered output shows only head and tail rows.
dm

Unnamed: 0,district_id_txt_clean,week_start,cases,ipd_rate,fatal_rate,age_mean,district_num,province_code,temperature_2m_mean,temperature_2m_min,...,vpd_kpa_roll4_sum,vpd_kpa_roll8_mean,vpd_kpa_roll8_sum,vpd_kpa_roll12_mean,vpd_kpa_roll12_sum,district_num_cat,month,is_monsoon,is_term1,is_term2
0,None|ปทุมธานี,2023-04-10,0,0.0,0.0,0.000000,1312,3,27.800000,24.414286,...,,,,,,D1312,4,0,0,0
1,None|ปทุมธานี,2023-04-17,0,0.0,0.0,0.000000,1312,3,27.800000,24.414286,...,1.966208,,,,,D1312,4,0,0,0
2,None|ปทุมธานี,2023-04-24,0,0.0,0.0,0.000000,1312,3,27.800000,24.414286,...,2.949312,,,,,D1312,4,0,0,0
3,None|ปทุมธานี,2023-05-01,1,0.0,0.0,59.841553,1308,3,27.800000,24.414286,...,3.932416,0.983104,3.932416,,,D1308,5,1,1,0
4,None|ปทุมธานี,2023-05-08,0,0.0,0.0,0.000000,1308,3,27.800000,24.414286,...,3.932416,0.983104,4.915520,,,D1308,5,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21364,ไทรน้อย|นนทบุรี,2023-12-04,0,0.0,0.0,0.000000,1205,2,27.971429,23.771429,...,4.385863,0.881510,7.052083,0.786095,9.433136,D1205,12,0,0,1
21365,ไทรน้อย|นนทบุรี,2023-12-11,3,0.0,0.0,29.638889,1205,2,28.628571,24.042857,...,4.757268,0.927452,7.419616,0.808851,9.706210,D1205,12,0,0,1
21366,ไทรน้อย|นนทบุรี,2023-12-18,0,0.0,0.0,0.000000,1205,2,25.800000,21.642857,...,4.985332,0.992610,7.940877,0.856844,10.282133,D1205,12,0,0,1
21367,ไทรน้อย|นนทบุรี,2023-12-25,0,0.0,0.0,0.000000,1205,2,26.142857,21.971429,...,5.093844,1.097686,8.781491,0.934298,11.211580,D1205,12,0,0,1


In [4]:
# Cell 2: Trend/enhanced temporal features per-district
def enrich_group(g):
    """Add lagged case and weather features to the rows of one district.

    The rows are sorted by ``week_start`` first. Every rolling / EWM feature is
    shifted by one row, so the value stored on a row only uses earlier rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single group. Must contain ``week_start`` and ``cases``;
        weather columns are used only when present.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``g`` with the additional feature columns.
    """
    g = g.sort_values("week_start").copy()
    cases = g["cases"].astype(float)
    previous = cases.shift(1)

    # Week-over-week dynamics of the case counts.
    g["cases_lag1"] = previous
    g["cases_delta1"] = cases - previous
    g["cases_delta2"] = g["cases_delta1"] - g["cases_delta1"].shift(1)
    g["cases_ratio1"] = (cases + 1.0) / (previous + 1.0)
    g["cases_logdiff1"] = np.log1p(cases) - np.log1p(previous)

    # Smoothed / windowed case statistics; at least half a window must be observed.
    for span in (2, 4, 8):
        g[f"cases_ewm{span}"] = cases.ewm(span=span, adjust=False).mean().shift(1)
        window = cases.rolling(span, min_periods=max(1, span // 2))
        g[f"cases_roll{span}_max"] = window.max().shift(1)
        g[f"cases_roll{span}_min"] = window.min().shift(1)
        g[f"cases_roll{span}_mean"] = window.mean().shift(1)
        g[f"cases_roll{span}_sum"] = window.sum().shift(1)

    # Weather: averaged columns get rolling means, accumulated columns get rolling sums.
    averaged = ("temperature_2m_mean", "relative_humidity_2m_mean", "vpd_kpa", "windspeed_10m_max")
    accumulated = ("precipitation_sum", "shortwave_radiation_sum", "rain_sum")
    for width in (2, 3, 4, 6, 8):
        least = max(1, width // 2)
        for name in averaged:
            if name not in g:
                continue
            g[f"{name}_roll{width}_mean2"] = g[name].rolling(width, min_periods=least).mean().shift(1)
        for name in accumulated:
            if name not in g:
                continue
            g[f"{name}_roll{width}_sum2"] = g[name].rolling(width, min_periods=least).sum().shift(1)

    # Dry-week indicator (missing precipitation counts as dry) and recent dry-week counts.
    if "precipitation_sum" in g:
        dry = (g["precipitation_sum"].fillna(0.0) <= 0.001).astype(int)
        g["no_rain_flag"] = dry
        for lookback in (3, 6):
            g[f"zero_rain_last{lookback}"] = g["no_rain_flag"].rolling(lookback, min_periods=1).sum().shift(1)
    return g

# Enrich each district separately. Iterating over the groups (instead of
# `groupby(...).apply(...)`) always hands `enrich_group` a frame that still
# contains `district_id_txt_clean`: `DataFrameGroupBy.apply` is deprecated for
# callbacks that operate on the grouping column, and newer pandas releases
# exclude that column, which would remove the district key from `dm`.
# Rows come out grouped by district (sorted key order) and sorted by week.
dm = pd.concat(
    [enrich_group(district_rows) for _, district_rows in dm.groupby("district_id_txt_clean")],
    ignore_index=True,
)
# Rows without a previous-week case count (at least the first row of every
# district) cannot feed the lag features, so they are dropped.
crit = ["cases_lag1"]
dm = dm.dropna(subset=[c for c in crit if c in dm.columns]).reset_index(drop=True)


In [5]:
# Cell 3: Build horizons (h=1,h=2) targets
def build_horizon(df, h):
    """Attach the ``h``-rows-ahead case count of the same district as target.

    The frame is ordered by district and week, then ``cases`` is shifted by
    ``-h`` within each district with ``groupby(...).shift``. This avoids
    ``groupby(...).apply`` on a callback that touches the grouping column
    (deprecated in pandas, and the column is excluded in newer releases).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``district_id_txt_clean``, ``week_start`` and ``cases``.
        The input frame is not modified.
    h : int
        Forecast horizon in rows (weeks when each district has one row per
        week without gaps). Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by district and week with two extra columns,
        ``target_cases`` and ``target_log1p`` (``log1p`` of the target). Rows
        whose target is unavailable (the last ``h`` rows of each district) are
        dropped and the index is reset.

    Raises
    ------
    ValueError
        If ``h`` is smaller than 1; a non-positive shift would use the current
        or a past week as the "future" target.
    """
    if h < 1:
        raise ValueError(f"h must be >= 1, got {h}")
    # sort_values returns a new frame, so the caller's data stays untouched.
    out = df.sort_values(["district_id_txt_clean", "week_start"]).reset_index(drop=True)
    out["target_cases"] = out.groupby("district_id_txt_clean", sort=False)["cases"].shift(-h)
    out["target_log1p"] = np.log1p(out["target_cases"].astype(float))
    out = out.dropna(subset=["target_cases", "target_log1p"]).reset_index(drop=True)
    return out

# One supervised frame per forecast horizon: targets 1 and 2 rows ahead.
dm_h1, dm_h2 = (build_horizon(dm, horizon) for horizon in (1, 2))


In [6]:
# Cell 4: Time split per horizon (last 26 weeks as validation)
def time_split(df, last_weeks=26):
    """Split ``df`` chronologically, keeping the last ``last_weeks`` weeks for validation.

    The cutoff is computed on the calendar: ``max(week_start)`` minus
    ``last_weeks - 1`` weeks. On a gap-free weekly grid this is the same date
    as the ``last_weeks``-th most recent unique date. Unlike counting unique
    dates, it still spans ``last_weeks`` calendar weeks when the dates do not
    form a single weekly grid (several dates inside one week, or missing weeks).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``week_start``.
    last_weeks : int, default 26
        Length of the validation window in weeks. Must be at least 1.

    Returns
    -------
    tuple
        ``(train, valid, cut)`` where ``train`` holds the rows with
        ``week_start < cut``, ``valid`` the rows with ``week_start >= cut``
        (both copies) and ``cut`` is the cutoff timestamp.

    Raises
    ------
    ValueError
        If ``last_weeks`` is smaller than 1, if ``df`` has no dates, or if the
        history is too short to leave any training rows before the cutoff.
    """
    last_weeks = int(last_weeks)
    if last_weeks < 1:
        raise ValueError(f"last_weeks must be >= 1, got {last_weeks}")
    week_starts = df["week_start"].dropna()
    if week_starts.empty:
        raise ValueError("Cannot split: no week_start values available")
    first, last = week_starts.min(), week_starts.max()
    # The validation window is [cut, last] and covers `last_weeks` weekly steps.
    cut = last - pd.Timedelta(weeks=last_weeks - 1)
    if cut <= first:
        raise ValueError(
            f"Not enough history for a {last_weeks}-week validation window "
            f"(data spans {first.date()} to {last.date()})"
        )
    train = df[df["week_start"] < cut].copy()
    valid = df[df["week_start"] >= cut].copy()
    return train, valid, cut

# Chronological train/validation split for each horizon frame.
train_h1, valid_h1, cut_h1 = time_split(dm_h1, 26)
train_h2, valid_h2, cut_h2 = time_split(dm_h2, 26)

# Show the first date that belongs to each validation set.
print("H1 valid from:", cut_h1.date().isoformat())
print("H2 valid from:", cut_h2.date().isoformat())


H1 valid from: 2023-11-27
H2 valid from: 2023-11-20


In [7]:
# Preview the horizon-2 training rows (week_start before the cutoff).
train_h2

Unnamed: 0,district_id_txt_clean,week_start,cases,ipd_rate,fatal_rate,age_mean,district_num,province_code,temperature_2m_mean,temperature_2m_min,...,vpd_kpa_roll8_mean2,windspeed_10m_max_roll8_mean2,precipitation_sum_roll8_sum2,shortwave_radiation_sum_roll8_sum2,rain_sum_roll8_sum2,no_rain_flag,zero_rain_last3,zero_rain_last6,target_cases,target_log1p
0,None|ปทุมธานี,2023-04-17,0,0.0,0.0,0.000000,1312,3,27.800000,24.414286,...,,,,,,1,1.0,1.0,1.0,0.693147
1,None|ปทุมธานี,2023-04-24,0,0.0,0.0,0.000000,1312,3,27.800000,24.414286,...,,,,,,1,2.0,2.0,0.0,0.000000
2,None|ปทุมธานี,2023-05-01,1,0.0,0.0,59.841553,1308,3,27.800000,24.414286,...,,,,,,1,3.0,3.0,0.0,0.000000
3,None|ปทุมธานี,2023-05-08,0,0.0,0.0,0.000000,1308,3,27.800000,24.414286,...,0.983104,12.957143,0.0,0.00,0.0,1,3.0,4.0,0.0,0.000000
4,None|ปทุมธานี,2023-05-15,0,0.0,0.0,0.000000,1308,3,27.800000,24.414286,...,0.983104,12.957143,0.0,0.00,0.0,1,3.0,5.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21146,ไทรน้อย|นนทบุรี,2023-10-16,2,0.5,0.0,21.802055,1205,2,27.885714,24.571429,...,0.696805,15.903571,663.3,966.78,663.3,0,0.0,0.0,0.0,0.000000
21147,ไทรน้อย|นนทบุรี,2023-10-23,4,1.0,0.0,36.649943,1205,2,27.557143,24.385714,...,0.685628,15.001786,630.7,961.35,630.7,0,0.0,0.0,0.0,0.000000
21148,ไทรน้อย|นนทบุรี,2023-10-30,0,0.0,0.0,0.000000,1205,2,27.657143,24.642857,...,0.644517,14.348214,572.3,960.73,572.3,0,0.0,0.0,0.0,0.000000
21149,ไทรน้อย|นนทบุรี,2023-11-06,0,0.0,0.0,0.000000,1205,2,27.614286,24.328571,...,0.630909,13.932143,552.7,964.95,552.7,0,0.0,0.0,0.0,0.000000


In [8]:
# Preview the horizon-2 validation rows (week_start on or after the cutoff).
valid_h2

Unnamed: 0,district_id_txt_clean,week_start,cases,ipd_rate,fatal_rate,age_mean,district_num,province_code,temperature_2m_mean,temperature_2m_min,...,vpd_kpa_roll8_mean2,windspeed_10m_max_roll8_mean2,precipitation_sum_roll8_sum2,shortwave_radiation_sum_roll8_sum2,rain_sum_roll8_sum2,no_rain_flag,zero_rain_last3,zero_rain_last6,target_cases,target_log1p
336,คลองหลวง|ปทุมธานี,2023-11-20,10,0.20,0.0,25.911324,1302,3,26.214286,22.471429,...,0.616195,11.773214,462.5,971.51,462.5,0,0.0,0.0,10.0,2.397895
337,คลองหลวง|ปทุมธานี,2023-11-27,12,0.25,0.0,17.285426,1302,3,27.628571,24.342857,...,0.707826,11.600000,328.2,1001.62,328.2,0,0.0,0.0,8.0,2.197225
338,คลองหลวง|ปทุมธานี,2023-12-04,10,0.10,0.0,23.917740,1302,3,27.614286,23.700000,...,0.803685,11.780357,214.4,1024.75,214.4,0,0.0,0.0,7.0,2.079442
339,คลองหลวง|ปทุมธานี,2023-12-11,8,0.25,0.0,20.995120,1302,3,28.328571,23.942857,...,0.847160,11.723214,174.4,1021.24,174.4,0,0.0,0.0,2.0,1.098612
645,ค่ายบางระจัน|สิงห์บุรี,2023-11-20,1,0.00,0.0,5.073973,1703,7,26.314286,21.757143,...,0.728313,13.608929,395.3,981.66,395.3,0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21151,ไทรน้อย|นนทบุรี,2023-11-20,0,0.00,0.0,0.000000,1205,2,26.685714,22.442857,...,0.662100,13.689286,425.5,960.25,425.5,0,0.0,0.0,0.0,0.000000
21152,ไทรน้อย|นนทบุรี,2023-11-27,0,0.00,0.0,0.000000,1205,2,28.185714,24.571429,...,0.764717,13.712500,331.8,990.35,331.8,0,0.0,0.0,3.0,1.386294
21153,ไทรน้อย|นนทบุรี,2023-12-04,0,0.00,0.0,0.000000,1205,2,27.971429,23.771429,...,0.881510,14.108929,221.8,1012.80,221.8,0,0.0,0.0,0.0,0.000000
21154,ไทรน้อย|นนทบุรี,2023-12-11,3,0.00,0.0,29.638889,1205,2,28.628571,24.042857,...,0.927452,13.921429,166.7,1009.09,166.7,0,0.0,0.0,0.0,0.000000


In [9]:
# Cell 5: Feature lists and categorical casting
def prepare_xy(df, label_key):
    """Split a horizon frame into a feature matrix and a float target.

    ``week_start`` and both target columns are never used as features. When
    the label is one of the two target columns, ``cases`` and ``cases_log1p``
    are excluded as well. The three identifier columns are cast to
    ``category`` when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``label_key`` and the feature columns.
    label_key : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y, feats)``: feature frame (a copy), target series as float and
        the list of feature names in column order.
    """
    excluded = {"week_start", "target_cases", "target_log1p"}
    if label_key in ("target_cases", "target_log1p"):
        excluded.update(("cases", "cases_log1p"))

    feats = [name for name in df.columns if name not in excluded]
    X = df[feats].copy()
    y = df[label_key].astype(float).copy()

    # Identifier-like columns are cast to category dtype.
    for key in ("district_id_txt_clean", "province_code", "district_num_cat"):
        if key in X.columns:
            X[key] = X[key].astype("category")
    return X, y, feats

def pack_train_valid(train_df, valid_df, label_key):
    """Build label-included training and validation frames.

    Both frames go through ``prepare_xy`` and get the float label appended as
    column ``label_key``.

    Returns
    -------
    tuple
        ``(train_pack, valid_pack, feats)`` where ``feats`` is the feature
        list derived from the training frame.
    """
    def _with_label(frame):
        # Feature copy plus the label values as one extra column.
        features, target, names = prepare_xy(frame, label_key)
        packed = features.copy()
        packed[label_key] = target.values
        return packed, names

    train_pack, feats = _with_label(train_df)
    valid_pack, _ = _with_label(valid_df)
    return train_pack, valid_pack, feats


In [10]:
# Cell 6: Train function (AutoGluon, best_quality, 3000s) + evaluation
# Training budget handed to AutoGluon's `fit(time_limit=...)` for every run.
TIME_LIMIT = 3000

def train_and_eval(train_pack, valid_pack, label_key, model_tag):
    """Fit an AutoGluon regressor and score it on the validation frame.

    Parameters
    ----------
    train_pack, valid_pack : pandas.DataFrame
        Feature frames that include the label column ``label_key``.
    label_key : str
        Name of the label column.
    model_tag : str
        Suffix used for the model directory (``MODELS/ag02_<tag>``) and the
        leaderboard file (``REPORTS/leaderboard_<tag>.csv``).

    Returns
    -------
    tuple
        ``(predictor, lb, lb_path, y_true, y_pred, model_dir)`` where ``lb``
        is the leaderboard evaluated on ``valid_pack`` and ``y_true`` /
        ``y_pred`` are NumPy arrays in the row order of ``valid_pack``.

    Notes
    -----
    Only ``train_pack`` is passed to ``fit``; ``valid_pack`` is used solely for
    the leaderboard and the returned predictions.
    NOTE(review): the internal validation score therefore comes from
    AutoGluon's own bagging rather than a time-ordered split - confirm this is
    intended when comparing ``score_val`` with ``score_test``.
    """
    model_dir = MODELS / f"ag02_{model_tag}"
    predictor = TabularPredictor(
        label=label_key,
        problem_type="regression",
        eval_metric="root_mean_squared_error",
        path=str(model_dir),
    )
    # `fit` returns the predictor, matching the original chained call.
    predictor = predictor.fit(
        train_data=train_pack,
        presets="best_quality",
        time_limit=TIME_LIMIT,
        verbosity=2,
    )

    # Leaderboard on the held-out frame, saved next to the other reports.
    lb = predictor.leaderboard(valid_pack, silent=True)
    lb_path = REPORTS / f"leaderboard_{model_tag}.csv"
    lb.to_csv(lb_path, index=False)

    holdout_features = valid_pack.drop(columns=[label_key])
    y_pred = predictor.predict(holdout_features).astype(float).to_numpy()
    y_true = valid_pack[label_key].to_numpy()
    return predictor, lb, lb_path, y_true, y_pred, model_dir

def metrics_on_cases(y_true_cases, y_pred_cases):
    """Compute regression metrics on the case-count scale.

    Inputs are converted to float NumPy arrays first, so lists work and pandas
    Series are compared by position instead of by index label.

    Parameters
    ----------
    y_true_cases, y_pred_cases : array-like
        Observed and predicted values of equal length.

    Returns
    -------
    dict
        ``RMSE``, ``MAE``, ``R2``, ``WAPE`` (percent; ``nan`` when the sum of
        absolute true values is zero) and ``RMSLE`` (negative values are
        clipped to zero before taking ``log1p``).
    """
    y_true = np.asarray(y_true_cases, dtype=float)
    y_pred = np.asarray(y_pred_cases, dtype=float)

    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Guard on the actual denominator (sum of absolute values), not the signed sum.
    abs_total = np.abs(y_true).sum()
    wape = (np.abs(y_pred - y_true).sum() / abs_total) * 100 if abs_total != 0 else np.nan

    log_gap = np.log1p(np.maximum(0.0, y_pred)) - np.log1p(np.maximum(0.0, y_true))
    rmsle = float(np.sqrt(np.mean(log_gap**2)))
    return dict(RMSE=rmse, MAE=mae, R2=r2, WAPE=wape, RMSLE=rmsle)


In [11]:
# Cell 7: H1 cases
train_pack_h1_cases, valid_pack_h1_cases, feats_h1_cases = pack_train_valid(train_h1, valid_h1, "target_cases")
pred_h1_cases, lb_h1_cases, lb_h1_cases_path, y_true_h1_cases, y_pred_h1_cases, dir_h1_cases = train_and_eval(
    train_pack_h1_cases, valid_pack_h1_cases, "target_cases", "h1_cases"
)
print("Saved leaderboard:", lb_h1_cases_path)
print(dir_h1_cases)
display(lb_h1_cases.head(20))
# Clip negative predictions to zero once; metrics and the saved file share the result.
y_pred_h1_cases_clipped = np.maximum(0.0, y_pred_h1_cases)
m_h1_cases = metrics_on_cases(y_true_h1_cases, y_pred_h1_cases_clipped)
print("[H1 | cases] Metrics:",
      f"\n - RMSE : {m_h1_cases['RMSE']:.4f}",
      f"\n - MAE  : {m_h1_cases['MAE']:.4f}",
      f"\n - R2   : {m_h1_cases['R2']:.4f}",
      f"\n - WAPE : {m_h1_cases['WAPE']:.2f}%",
      f"\n - RMSLE: {m_h1_cases['RMSLE']:.4f}")
# Save the predictions together with their row keys so they can be traced back.
# `valid_h1` has the same row order as the validation pack the predictions came
# from; `week_start` is the week of the features, not of the target.
pd.DataFrame({
    "y_true": y_true_h1_cases,
    "y_pred": y_pred_h1_cases_clipped,
    "district_id_txt_clean": valid_h1["district_id_txt_clean"].to_numpy(),
    "week_start": valid_h1["week_start"].to_numpy(),
}).to_parquet(
    FEATURES/"ag02_valid_pred_h1_cases.parquet", index=False
)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #7-Ubuntu SMP Sat Apr 20 00:58:31 UTC 2024
CPU Count:          4
Memory Avail:       12.15 GB / 15.62 GB (77.8%)
Disk Space Avail:   320.68 GB / 368.04 GB (87.1%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked 

[36m(_ray_fit pid=51712)[0m [1000]	valid_set's rmse: 1.28493
[36m(_ray_fit pid=51712)[0m [2000]	valid_set's rmse: 1.28102
[36m(_ray_fit pid=51712)[0m [3000]	valid_set's rmse: 1.27914
[36m(_ray_fit pid=51712)[0m [4000]	valid_set's rmse: 1.27881
[36m(_ray_fit pid=51712)[0m [5000]	valid_set's rmse: 1.27851


[36m(_ray_fit pid=51712)[0m 	Ran out of time, early stopping on iteration 5883. Best iteration is:
[36m(_ray_fit pid=51712)[0m 	[5875]	valid_set's rmse: 1.27839
[36m(_dystack pid=19465)[0m 	-1.2203	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=19465)[0m 	89.1s	 = Training   runtime
[36m(_dystack pid=19465)[0m 	1.14s	 = Validation runtime
[36m(_dystack pid=19465)[0m Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 126.71s of the 126.69s of remaining time.
[36m(_dystack pid=19465)[0m 	-1.2364	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=19465)[0m 	320.16s	 = Training   runtime
[36m(_dystack pid=19465)[0m 	0.89s	 = Validation runtime
[36m(_dystack pid=19465)[0m Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.00s of the -194.73s of remaining time.
[36m(_dystack pid=19465)[0m 	Ensemble Weights: {'LightGBMXT_BAG_L1': 0.409, 'CatBoost_BAG_L1': 0.364, 'LightGBM_BAG_L1': 0.136, 'LightGBMXT_BAG

Saved leaderboard: /teamspace/studios/this_studio/data/dengue/reports/leaderboard_h1_cases.csv
/teamspace/studios/this_studio/data/dengue/models/ag02_h1_cases


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_BAG_L2,-2.509079,-1.213709,root_mean_squared_error,2.581684,9.790654,1351.931091,0.054251,0.114754,18.829639,2,True,15
1,LightGBMLarge_BAG_L1,-2.513853,-1.254554,root_mean_squared_error,0.244957,1.470918,130.901315,0.244957,1.470918,130.901315,1,True,9
2,LightGBM_BAG_L1,-2.521944,-1.219807,root_mean_squared_error,0.097499,0.522917,34.498787,0.097499,0.522917,34.498787,1,True,2
3,XGBoost_BAG_L1,-2.538175,-1.227427,root_mean_squared_error,0.078413,0.363012,61.577958,0.078413,0.363012,61.577958,1,True,7
4,CatBoost_BAG_L2,-2.543113,-1.194837,root_mean_squared_error,2.58451,10.843373,1384.145703,0.057078,1.167473,51.04425,2,True,17
5,LightGBMXT_BAG_L2,-2.54533,-1.192157,root_mean_squared_error,2.946061,12.117631,1442.552919,0.418629,2.441731,109.451467,2,True,14
6,ExtraTreesMSE_BAG_L2,-2.547289,-1.203221,root_mean_squared_error,2.841459,10.599618,1363.970095,0.314026,0.923718,30.868643,2,True,18
7,CatBoost_r177_BAG_L1,-2.548257,-1.211868,root_mean_squared_error,0.036449,0.962554,112.336015,0.036449,0.962554,112.336015,1,True,10
8,CatBoost_BAG_L1,-2.558455,-1.207513,root_mean_squared_error,0.355339,1.105098,225.426921,0.355339,1.105098,225.426921,1,True,4
9,RandomForestMSE_BAG_L2,-2.560174,-1.216546,root_mean_squared_error,2.838946,10.48681,1715.554585,0.311513,0.81091,382.453133,2,True,16


[H1 | cases] Metrics: 
 - RMSE : 2.5781 
 - MAE  : 1.6321 
 - R2   : 0.4587 
 - WAPE : 69.05% 
 - RMSLE: 0.6108


In [12]:
# Cell 8: H1 log1p
train_pack_h1_log, valid_pack_h1_log, feats_h1_log = pack_train_valid(train_h1, valid_h1, "target_log1p")
pred_h1_log, lb_h1_log, lb_h1_log_path, y_true_h1_log, y_pred_h1_log, dir_h1_log = train_and_eval(
    train_pack_h1_log, valid_pack_h1_log, "target_log1p", "h1_log1p"
)
print("Saved leaderboard:", lb_h1_log_path)
print(dir_h1_log)
display(lb_h1_log.head(20))
# Back-transform from log1p space to case counts; negative results are clipped to zero.
y_true_h1 = np.expm1(y_true_h1_log)
y_pred_h1 = np.maximum(0.0, np.expm1(y_pred_h1_log))
m_h1_log = metrics_on_cases(y_true_h1, y_pred_h1)
print("[H1 | log1p→cases] Metrics:",
      f"\n - RMSE : {m_h1_log['RMSE']:.4f}",
      f"\n - MAE  : {m_h1_log['MAE']:.4f}",
      f"\n - R2   : {m_h1_log['R2']:.4f}",
      f"\n - WAPE : {m_h1_log['WAPE']:.2f}%",
      f"\n - RMSLE: {m_h1_log['RMSLE']:.4f}")
# Save the predictions together with their row keys so they can be traced back.
# `valid_h1` has the same row order as the validation pack the predictions came
# from; `week_start` is the week of the features, not of the target.
pd.DataFrame({
    "y_true": y_true_h1,
    "y_pred": y_pred_h1,
    "district_id_txt_clean": valid_h1["district_id_txt_clean"].to_numpy(),
    "week_start": valid_h1["week_start"].to_numpy(),
}).to_parquet(
    FEATURES/"ag02_valid_pred_h1_log1p.parquet", index=False
)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #7-Ubuntu SMP Sat Apr 20 00:58:31 UTC 2024
CPU Count:          4
Memory Avail:       10.57 GB / 15.62 GB (67.7%)
Disk Space Avail:   320.09 GB / 368.04 GB (87.0%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked 

Saved leaderboard: /teamspace/studios/this_studio/data/dengue/reports/leaderboard_h1_log1p.csv
/teamspace/studios/this_studio/data/dengue/models/ag02_h1_log1p


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost_BAG_L1,-0.58353,-0.350697,root_mean_squared_error,0.083164,0.283715,51.708973,0.083164,0.283715,51.708973,1,True,7
1,LightGBM_BAG_L1,-0.58453,-0.348806,root_mean_squared_error,0.099473,0.491634,34.408065,0.099473,0.491634,34.408065,1,True,2
2,RandomForestMSE_BAG_L1,-0.587957,-0.359823,root_mean_squared_error,0.209408,0.846023,288.235757,0.209408,0.846023,288.235757,1,True,3
3,LightGBM_r131_BAG_L1,-0.588605,-0.347814,root_mean_squared_error,0.267191,2.263148,98.15863,0.267191,2.263148,98.15863,1,True,12
4,ExtraTreesMSE_BAG_L1,-0.589688,-0.356307,root_mean_squared_error,0.120193,0.860317,26.429469,0.120193,0.860317,26.429469,1,True,5
5,LightGBMLarge_BAG_L1,-0.591893,-0.351536,root_mean_squared_error,0.139286,0.74438,81.263514,0.139286,0.74438,81.263514,1,True,9
6,CatBoost_BAG_L1,-0.592166,-0.345711,root_mean_squared_error,0.04449,0.999231,197.474966,0.04449,0.999231,197.474966,1,True,4
7,CatBoost_r177_BAG_L1,-0.596788,-0.345741,root_mean_squared_error,0.036313,1.081136,134.393888,0.036313,1.081136,134.393888,1,True,10
8,LightGBMXT_BAG_L2,-0.599298,-0.348114,root_mean_squared_error,1.970488,13.882079,1402.999389,0.028463,0.183401,16.961075,2,True,16
9,WeightedEnsemble_L2,-0.601689,-0.342826,root_mean_squared_error,1.215469,8.285881,866.694734,0.003287,0.00069,0.053487,2,True,15


[H1 | log1p→cases] Metrics: 
 - RMSE : 2.6341 
 - MAE  : 1.5926 
 - R2   : 0.4349 
 - WAPE : 67.38% 
 - RMSLE: 0.6036


In [13]:
# Cell 9: H2 cases
train_pack_h2_cases, valid_pack_h2_cases, feats_h2_cases = pack_train_valid(train_h2, valid_h2, "target_cases")
pred_h2_cases, lb_h2_cases, lb_h2_cases_path, y_true_h2_cases, y_pred_h2_cases, dir_h2_cases = train_and_eval(
    train_pack_h2_cases, valid_pack_h2_cases, "target_cases", "h2_cases"
)
print("Saved leaderboard:", lb_h2_cases_path)
print(dir_h2_cases)
display(lb_h2_cases.head(20))
# Clip negative predictions to zero once; metrics and the saved file share the result.
y_pred_h2_cases_clipped = np.maximum(0.0, y_pred_h2_cases)
m_h2_cases = metrics_on_cases(y_true_h2_cases, y_pred_h2_cases_clipped)
print("[H2 | cases] Metrics:",
      f"\n - RMSE : {m_h2_cases['RMSE']:.4f}",
      f"\n - MAE  : {m_h2_cases['MAE']:.4f}",
      f"\n - R2   : {m_h2_cases['R2']:.4f}",
      f"\n - WAPE : {m_h2_cases['WAPE']:.2f}%",
      f"\n - RMSLE: {m_h2_cases['RMSLE']:.4f}")
# Save the predictions together with their row keys so they can be traced back.
# `valid_h2` has the same row order as the validation pack the predictions came
# from; `week_start` is the week of the features, not of the target.
pd.DataFrame({
    "y_true": y_true_h2_cases,
    "y_pred": y_pred_h2_cases_clipped,
    "district_id_txt_clean": valid_h2["district_id_txt_clean"].to_numpy(),
    "week_start": valid_h2["week_start"].to_numpy(),
}).to_parquet(
    FEATURES/"ag02_valid_pred_h2_cases.parquet", index=False
)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #7-Ubuntu SMP Sat Apr 20 00:58:31 UTC 2024
CPU Count:          4
Memory Avail:       10.19 GB / 15.62 GB (65.2%)
Disk Space Avail:   319.47 GB / 368.04 GB (86.8%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked 

Saved leaderboard: /teamspace/studios/this_studio/data/dengue/reports/leaderboard_h2_cases.csv
/teamspace/studios/this_studio/data/dengue/models/ag02_h2_cases


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_BAG_L2,-2.451276,-1.213002,root_mean_squared_error,1.866459,10.966688,1306.861956,0.045573,0.094367,18.53282,2,True,14
1,LightGBMXT_BAG_L2,-2.455933,-1.214,root_mean_squared_error,1.916485,11.256797,1311.069164,0.095599,0.384476,22.740028,2,True,13
2,CatBoost_BAG_L2,-2.475263,-1.19969,root_mean_squared_error,1.872169,11.918627,1343.021906,0.051283,1.046306,54.69277,2,True,16
3,XGBoost_BAG_L1,-2.478274,-1.271222,root_mean_squared_error,0.073293,0.454827,64.062612,0.073293,0.454827,64.062612,1,True,7
4,CatBoost_BAG_L1,-2.482132,-1.233783,root_mean_squared_error,0.047151,0.929334,212.231642,0.047151,0.929334,212.231642,1,True,4
5,RandomForestMSE_BAG_L2,-2.484073,-1.239323,root_mean_squared_error,2.086263,11.80881,1669.903743,0.265377,0.936489,381.574608,2,True,15
6,CatBoost_r177_BAG_L1,-2.486622,-1.236913,root_mean_squared_error,0.047571,1.059083,140.278697,0.047571,1.059083,140.278697,1,True,10
7,LightGBMLarge_BAG_L1,-2.487576,-1.243622,root_mean_squared_error,0.325954,1.418852,134.245235,0.325954,1.418852,134.245235,1,True,9
8,ExtraTreesMSE_BAG_L2,-2.490568,-1.21978,root_mean_squared_error,2.09144,12.101617,1319.926661,0.270554,1.229296,31.597525,2,True,17
9,XGBoost_BAG_L2,-2.500649,-1.217288,root_mean_squared_error,1.913912,11.148768,1330.198347,0.093026,0.276447,41.869211,2,True,19


[H2 | cases] Metrics: 
 - RMSE : 2.5178 
 - MAE  : 1.5959 
 - R2   : 0.4837 
 - WAPE : 67.52% 
 - RMSLE: 0.6061


In [14]:
# Cell 10: H2 log1p
train_pack_h2_log, valid_pack_h2_log, feats_h2_log = pack_train_valid(train_h2, valid_h2, "target_log1p")
pred_h2_log, lb_h2_log, lb_h2_log_path, y_true_h2_log, y_pred_h2_log, dir_h2_log = train_and_eval(
    train_pack_h2_log, valid_pack_h2_log, "target_log1p", "h2_log1p"
)
print("Saved leaderboard:", lb_h2_log_path)
print(dir_h2_log)
display(lb_h2_log.head(20))
# Back-transform from log1p space to case counts; negative results are clipped to zero.
y_true_h2 = np.expm1(y_true_h2_log)
y_pred_h2 = np.maximum(0.0, np.expm1(y_pred_h2_log))
m_h2_log = metrics_on_cases(y_true_h2, y_pred_h2)
print("[H2 | log1p→cases] Metrics:",
      f"\n - RMSE : {m_h2_log['RMSE']:.4f}",
      f"\n - MAE  : {m_h2_log['MAE']:.4f}",
      f"\n - R2   : {m_h2_log['R2']:.4f}",
      f"\n - WAPE : {m_h2_log['WAPE']:.2f}%",
      f"\n - RMSLE: {m_h2_log['RMSLE']:.4f}")
# Save the predictions together with their row keys so they can be traced back.
# `valid_h2` has the same row order as the validation pack the predictions came
# from; `week_start` is the week of the features, not of the target.
pd.DataFrame({
    "y_true": y_true_h2,
    "y_pred": y_pred_h2,
    "district_id_txt_clean": valid_h2["district_id_txt_clean"].to_numpy(),
    "week_start": valid_h2["week_start"].to_numpy(),
}).to_parquet(
    FEATURES/"ag02_valid_pred_h2_log1p.parquet", index=False
)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #7-Ubuntu SMP Sat Apr 20 00:58:31 UTC 2024
CPU Count:          4
Memory Avail:       10.09 GB / 15.62 GB (64.6%)
Disk Space Avail:   318.88 GB / 368.04 GB (86.6%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked 

Saved leaderboard: /teamspace/studios/this_studio/data/dengue/reports/leaderboard_h2_log1p.csv
/teamspace/studios/this_studio/data/dengue/models/ag02_h2_log1p


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_r131_BAG_L1,-0.58643,-0.353199,root_mean_squared_error,0.137758,1.092363,37.767546,0.137758,1.092363,37.767546,1,True,12
1,ExtraTreesMSE_BAG_L1,-0.586717,-0.36367,root_mean_squared_error,0.134086,0.87892,26.350863,0.134086,0.87892,26.350863,1,True,5
2,LightGBMLarge_BAG_L1,-0.591645,-0.354918,root_mean_squared_error,0.173758,1.051826,88.028604,0.173758,1.051826,88.028604,1,True,9
3,RandomForestMSE_BAG_L1,-0.59412,-0.365551,root_mean_squared_error,0.245878,0.906362,282.261362,0.245878,0.906362,282.261362,1,True,3
4,XGBoost_BAG_L1,-0.596577,-0.359901,root_mean_squared_error,0.10233,0.296624,59.69176,0.10233,0.296624,59.69176,1,True,7
5,CatBoost_r177_BAG_L1,-0.599044,-0.353304,root_mean_squared_error,0.043491,0.999378,140.723111,0.043491,0.999378,140.723111,1,True,10
6,CatBoost_BAG_L1,-0.60241,-0.352915,root_mean_squared_error,0.0667,1.149796,302.980679,0.0667,1.149796,302.980679,1,True,4
7,LightGBMXT_BAG_L2,-0.603804,-0.351697,root_mean_squared_error,1.989953,11.008633,1417.234808,0.031789,0.145583,16.986959,2,True,14
8,ExtraTreesMSE_BAG_L2,-0.604436,-0.351996,root_mean_squared_error,2.210335,11.798172,1429.792129,0.252171,0.935121,29.544281,2,True,18
9,LightGBM_BAG_L1,-0.604499,-0.352706,root_mean_squared_error,0.151733,0.912739,45.350771,0.151733,0.912739,45.350771,1,True,2


[H2 | log1p→cases] Metrics: 
 - RMSE : 2.6281 
 - MAE  : 1.5830 
 - R2   : 0.4375 
 - WAPE : 66.97% 
 - RMSLE: 0.6089


In [15]:
# Cell 11: Consolidated comparison table
# One row per run: the run tag followed by its metric dictionary.
rows = [
    {"run": run_tag, **run_metrics}
    for run_tag, run_metrics in (
        ("h1_cases", m_h1_cases),
        ("h1_log1p", m_h1_log),
        ("h2_cases", m_h2_cases),
        ("h2_log1p", m_h2_log),
    )
]
cmp = pd.DataFrame(rows)

# Persist the table next to the leaderboards, then show it.
cmp_path = REPORTS / "ag02_compare_h1_h2_cases_log1p.csv"
cmp.to_csv(cmp_path, index=False)
display(cmp)
print("Saved:", cmp_path)


Unnamed: 0,run,RMSE,MAE,R2,WAPE,RMSLE
0,h1_cases,2.578065,1.632101,0.458728,69.050434,0.610785
1,h1_log1p,2.634134,1.592603,0.434928,67.379359,0.603592
2,h2_cases,2.517788,1.595916,0.483743,67.519503,0.606127
3,h2_log1p,2.628084,1.582962,0.437521,66.971467,0.608925


Saved: /teamspace/studios/this_studio/data/dengue/reports/ag02_compare_h1_h2_cases_log1p.csv


In [16]:
# Cell 12: Artifact index
# Locations of everything this notebook wrote, keyed by run tag.
index = {
    "leaderboards": {
        run_tag: str(REPORTS / f"leaderboard_{run_tag}.csv")
        for run_tag in ("h1_cases", "h1_log1p", "h2_cases", "h2_log1p")
    },
    "predictions": {
        run_tag: str(FEATURES / f"ag02_valid_pred_{run_tag}.parquet")
        for run_tag in ("h1_cases", "h1_log1p", "h2_cases", "h2_log1p")
    },
    "comparison_table": str(REPORTS / "ag02_compare_h1_h2_cases_log1p.csv"),
}
print(index)


{'leaderboards': {'h1_cases': '/teamspace/studios/this_studio/data/dengue/reports/leaderboard_h1_cases.csv', 'h1_log1p': '/teamspace/studios/this_studio/data/dengue/reports/leaderboard_h1_log1p.csv', 'h2_cases': '/teamspace/studios/this_studio/data/dengue/reports/leaderboard_h2_cases.csv', 'h2_log1p': '/teamspace/studios/this_studio/data/dengue/reports/leaderboard_h2_log1p.csv'}, 'predictions': {'h1_cases': '/teamspace/studios/this_studio/data/dengue/features/ag02_valid_pred_h1_cases.parquet', 'h1_log1p': '/teamspace/studios/this_studio/data/dengue/features/ag02_valid_pred_h1_log1p.parquet', 'h2_cases': '/teamspace/studios/this_studio/data/dengue/features/ag02_valid_pred_h2_cases.parquet', 'h2_log1p': '/teamspace/studios/this_studio/data/dengue/features/ag02_valid_pred_h2_log1p.parquet'}, 'comparison_table': '/teamspace/studios/this_studio/data/dengue/reports/ag02_compare_h1_h2_cases_log1p.csv'}
