In [1]:
import duckdb, pandas as pd, numpy as np

file_train_cls = 'data/NMAD_with_embeddings_cls.parquet'
con = duckdb.connect(file_train_cls)

# забираємо все потрібне під FABDEM
df = con.execute("""
  SELECT
    -- групування / просторові поля
    rgt, track, spot, x_merc, y_merc,
    -- таргет FABDEM
    cls_nmad_fab, nmad_fab,
    h_fab_dem, delta_fab_dem, abs_delta_fab_dem,
    fab_dem_slope, fab_dem_twi, fab_dem_2000, fab_dem_stream,
    x, y,
    -- категоріальні LULC/geomorphon/landform
    lulc_class, lulc_name, fab_dem_geomorphon, fab_dem_landform,
    -- ембедінги (64 колонок або emb_all)
    emb_001, emb_002, emb_003, emb_004, emb_005, emb_006, emb_007, emb_008,
    emb_009, emb_010, emb_011, emb_012, emb_013, emb_014, emb_015, emb_016,
    emb_017, emb_018, emb_019, emb_020, emb_021, emb_022, emb_023, emb_024,
    emb_025, emb_026, emb_027, emb_028, emb_029, emb_030, emb_031, emb_032,
    emb_033, emb_034, emb_035, emb_036, emb_037, emb_038, emb_039, emb_040,
    emb_041, emb_042, emb_043, emb_044, emb_045, emb_046, emb_047, emb_048,
    emb_049, emb_050, emb_051, emb_052, emb_053, emb_054, emb_055, emb_056,
    emb_057, emb_058, emb_059, emb_060, emb_061, emb_062, emb_063, emb_064,
    -- або emb_all якщо зберігав як масив/список
  FROM NMAD_with_embeddings_cls
""").fetchdf()

# Таргет
df["cls_nmad_fab"] = df["cls_nmad_fab"].astype(int)

# Група для spatial split
df["group_id"] = (df["rgt"].astype(int).astype(str) + "_" +
                  df["track"].astype(int).astype(str))


In [2]:
# рахуємо розмір груп
gsize = df.groupby("group_id").size().rename("grp_n")  # <-- унікальна назва!

# приєднуємо розмір груп до df (без конфлікту імен)
df = df.join(gsize, on="group_id")

MIN_TEST_N = 3000
df["is_big_group"] = df["grp_n"] >= MIN_TEST_N


In [3]:
num_cols = [
    "x", "y","h_fab_dem","delta_fab_dem","abs_delta_fab_dem",
    "fab_dem_slope","fab_dem_twi","fab_dem_2000","fab_dem_stream",
    "x_merc","y_merc"
]
cat_cols = ["lulc_class","lulc_name","fab_dem_geomorphon","fab_dem_landform"]

# ембедінги (якщо є всі 64 канали окремими колонками)
emb_cols = [f"emb_{i:03d}" for i in range(1,65) if f"emb_{i:03d}" in df.columns]


In [4]:
def collapse_rare(df, col, min_frac=0.01):
    vc = df[col].value_counts(normalize=True)
    rare = vc[vc < min_frac].index
    df[col] = df[col].where(~df[col].isin(rare), "__OTHER__")
    return df

for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype(str)
        df = collapse_rare(df, c, 0.01)


In [5]:
from sklearn.model_selection import StratifiedGroupKFold

y = df["cls_nmad_fab"].astype(int)
groups = df["group_id"]
is_big = df["is_big_group"].values

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

folds = []
big_idx = np.where(is_big)[0]
y_big   = y.iloc[big_idx]
grp_big = groups.iloc[big_idx]

for fold, (tr_big, te_big) in enumerate(sgkf.split(big_idx, y_big, groups=grp_big), 1):
    tr_idx = big_idx[tr_big]
    te_idx = big_idx[te_big]
    # усі "малі" групи — тільки в train
    tr_idx = np.concatenate([tr_idx, np.where(~is_big)[0]])
    folds.append((np.sort(tr_idx), np.sort(te_idx)))

    print(
        f"Fold {fold}: "
        f"train groups = {df.iloc[tr_idx]['group_id'].nunique()} | "
        f"test groups = {df.iloc[te_idx]['group_id'].nunique()}"
    )

# Перевірки
for k, (tr, te) in enumerate(folds, 1):
    assert set(df.iloc[tr]["group_id"]).isdisjoint(set(df.iloc[te]["group_id"]))
    print(f"Fold {k} class dist train:",
          y.iloc[tr].value_counts(normalize=True).sort_index().round(3).to_dict(),
          "| test:",
          y.iloc[te].value_counts(normalize=True).sort_index().round(3).to_dict())


Fold 1: train groups = 8 | test groups = 3
Fold 2: train groups = 8 | test groups = 3
Fold 3: train groups = 9 | test groups = 2
Fold 4: train groups = 10 | test groups = 1
Fold 5: train groups = 10 | test groups = 1
Fold 1 class dist train: {0: 0.332, 1: 0.329, 2: 0.34} | test: {0: 0.326, 1: 0.332, 2: 0.342}
Fold 2 class dist train: {0: 0.326, 1: 0.332, 2: 0.343} | test: {0: 0.338, 1: 0.326, 2: 0.337}
Fold 3 class dist train: {0: 0.331, 1: 0.333, 2: 0.336} | test: {0: 0.319, 1: 0.308, 2: 0.372}
Fold 4 class dist train: {0: 0.33, 1: 0.326, 2: 0.344} | test: {0: 0.328, 1: 0.359, 2: 0.313}
Fold 5 class dist train: {0: 0.33, 1: 0.33, 2: 0.34} | test: {0: 0.33, 1: 0.327, 2: 0.343}


In [6]:
import os, json
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier, Pool
from sklearn.decomposition import PCA
import joblib

# --- налаштування ---
SAVE_DIR = "artifacts_fabdem_cv"
os.makedirs(SAVE_DIR, exist_ok=True)
REDUCED_DIM = 32  # скільки компонент залишати з 64-ембедінгів

# --- підготовка PCA об'єкта (фітитимемо на train усередині 1-го фолду) ---
pca = PCA(n_components=REDUCED_DIM, random_state=42) if len(emb_cols) > 0 else None

all_fold_metrics = []
best_model = None
best_macro_f1 = -1.0
best_fold_id = None

# для збереження усіх тестових прогнозів по фолдах
test_preds_concat = []

for k, (tr, te) in enumerate(folds, 1):
    print(f"\n===== FOLD {k} =====")
    y_tr, y_te = y.iloc[tr], y.iloc[te]

    # числові/категоріальні
    Xn_tr = df.loc[tr, num_cols].copy()
    Xn_te = df.loc[te, num_cols].copy()
    Xc_tr = df.loc[tr, cat_cols].astype(str).copy()
    Xc_te = df.loc[te, cat_cols].astype(str).copy()

    # ембедінги + PCA (fit на train у першому фолді, далі тільки transform)
    if len(emb_cols) > 0:
        Z_tr = df.loc[tr, emb_cols].to_numpy(dtype="float32")
        Z_te = df.loc[te, emb_cols].to_numpy(dtype="float32")
        if k == 1:
            pca.fit(Z_tr)
        Z_tr = pca.transform(Z_tr)
        Z_te = pca.transform(Z_te)
        Ztr_df = pd.DataFrame(Z_tr, index=Xn_tr.index, columns=[f"pca_emb_{i:02d}" for i in range(REDUCED_DIM)])
        Zte_df = pd.DataFrame(Z_te, index=Xn_te.index, columns=[f"pca_emb_{i:02d}" for i in range(REDUCED_DIM)])
        Xn_tr = pd.concat([Xn_tr, Ztr_df], axis=1)
        Xn_te = pd.concat([Xn_te, Zte_df], axis=1)

    # фінальна матриця фіч
    X_tr = pd.concat([Xn_tr, Xc_tr], axis=1)
    X_te = pd.concat([Xn_te, Xc_te], axis=1)
    cat_idx = [X_tr.columns.get_loc(c) for c in Xc_tr.columns]

    # пули
    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    test_pool  = Pool(X_te, y_te, cat_features=cat_idx)

    # модель
    model = CatBoostClassifier(
        loss_function="MultiClass",
        iterations=3000, learning_rate=0.05, depth=8,
        l2_leaf_reg=8, auto_class_weights="Balanced",
        random_state=42, task_type="GPU",
        od_type="Iter", od_wait=100, verbose=200
    )
    model.fit(train_pool, eval_set=test_pool, use_best_model=True)

    # прогнози
    y_pred = model.predict(test_pool).astype(int).ravel()
    y_proba = model.predict_proba(test_pool)  # shape: (N, 3)
    macro_f1 = f1_score(y_te, y_pred, average="macro")
    bal_acc  = balanced_accuracy_score(y_te, y_pred)

    # per-class F1
    f1_per_class = f1_score(y_te, y_pred, average=None)  # порядок класів 0,1,2
    cm = confusion_matrix(y_te, y_pred, labels=[0,1,2])

    print(f"FOLD {k}  macro-F1={macro_f1:.3f}  balanced-acc={bal_acc:.3f}")
    print("Per-class F1:", {c: round(v,3) for c, v in zip([0,1,2], f1_per_class)})
    print("Confusion matrix (rows=true, cols=pred):\n", cm)

    # зберегти метрики фолду
    all_fold_metrics.append({
        "fold": k,
        "macro_f1": float(macro_f1),
        "balanced_acc": float(bal_acc),
        "f1_class_0": float(f1_per_class[0]),
        "f1_class_1": float(f1_per_class[1]),
        "f1_class_2": float(f1_per_class[2]),
        "cm_00": int(cm[0,0]), "cm_01": int(cm[0,1]), "cm_02": int(cm[0,2]),
        "cm_10": int(cm[1,0]), "cm_11": int(cm[1,1]), "cm_12": int(cm[1,2]),
        "cm_20": int(cm[2,0]), "cm_21": int(cm[2,1]), "cm_22": int(cm[2,2]),
    })

    # трекінг найкращої моделі
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        best_model = model
        best_fold_id = k

    # збережемо тестові прогнози цього фолду з координатами — для карти та аналізу
    out_te = df.loc[te, ["x_merc","y_merc","rgt","track","spot"]].copy()
    out_te["y_true"] = y_te.values
    out_te["y_pred"] = y_pred
    out_te["prob_low"]  = y_proba[:,0]
    out_te["prob_mid"]  = y_proba[:,1]
    out_te["prob_high"] = y_proba[:,2]
    out_te["fold"] = k
    test_preds_concat.append(out_te)

# --- підсумки по фолдах ---
metrics_df = pd.DataFrame(all_fold_metrics)
metrics_df.to_csv(os.path.join(SAVE_DIR, "cv_metrics.csv"), index=False)

print("\n===== CV SUMMARY =====")
for col in ["macro_f1", "balanced_acc", "f1_class_0", "f1_class_1", "f1_class_2"]:
    mu, sd = metrics_df[col].mean(), metrics_df[col].std()
    print(f"{col}: {mu:.3f} ± {sd:.3f}")

print(f"\nBest fold = {best_fold_id}  (macro-F1={best_macro_f1:.3f})")

# збереження найкращої моделі та (за потреби) PCA + маніфест колонок
best_model.save_model(os.path.join(SAVE_DIR, "catboost_fabdem_best.cbm"))
if pca is not None:
    joblib.dump(pca, os.path.join(SAVE_DIR, "pca_embeddings.pkl"))

columns_manifest = {
    "numeric": list(X_tr.select_dtypes(include=[np.number]).columns),
    "categorical": list(Xc_tr.columns),
    "target": "cls_nmad_fab",
    "pca_dim": REDUCED_DIM if pca is not None else 0
}
json.dump(columns_manifest, open(os.path.join(SAVE_DIR, "columns_manifest.json"), "w"), ensure_ascii=False, indent=2)

# усі тестові прогнози разом (для швидкої перевірки на карті)
test_preds_df = pd.concat(test_preds_concat, axis=0).reset_index(drop=True)
test_preds_df.to_parquet(os.path.join(SAVE_DIR, "cv_test_predictions.parquet"))
print("\nSaved:")
print(" - CV metrics ->", os.path.join(SAVE_DIR, "cv_metrics.csv"))
print(" - Best model ->", os.path.join(SAVE_DIR, "catboost_fabdem_best.cbm"))
print(" - Columns manifest ->", os.path.join(SAVE_DIR, "columns_manifest.json"))
print(" - PCA (optional) ->", os.path.join(SAVE_DIR, "pca_embeddings.pkl"))
print(" - Test preds ->", os.path.join(SAVE_DIR, "cv_test_predictions.parquet"))



===== FOLD 1 =====
0:	learn: 1.0026285	test: 1.0027050	best: 1.0027050 (0)	total: 81.7ms	remaining: 4m 5s
200:	learn: 0.0125199	test: 0.0138228	best: 0.0138228 (200)	total: 5.82s	remaining: 1m 20s
400:	learn: 0.0112413	test: 0.0136391	best: 0.0136367 (371)	total: 12.1s	remaining: 1m 18s
bestTest = 0.01361867171
bestIteration = 486
Shrink model to first 487 iterations.
FOLD 1  macro-F1=0.993  balanced-acc=0.993
Per-class F1: {0: np.float64(0.994), 1: np.float64(0.989), 2: np.float64(0.995)}
Confusion matrix (rows=true, cols=pred):
 [[222388   1661      0]
 [  1136 226724    557]
 [     0   1687 233476]]

===== FOLD 2 =====
0:	learn: 1.0027753	test: 1.0027642	best: 1.0027642 (0)	total: 26.2ms	remaining: 1m 18s
200:	learn: 0.0158536	test: 0.0162352	best: 0.0162352 (200)	total: 5.83s	remaining: 1m 21s
400:	learn: 0.0143062	test: 0.0159153	best: 0.0159153 (400)	total: 12.4s	remaining: 1m 20s
600:	learn: 0.0132718	test: 0.0158759	best: 0.0158735 (583)	total: 19.2s	remaining: 1m 16s
bestTest