
# Pasture Biomass Prediction — **Stacked Ensemble (Offline, Kaggle No-Internet)**


- Base models: `RandomForestRegressor`, `ExtraTreesRegressor`, `Ridge` (scaled), `GradientBoostingRegressor` (wrapped in `MultiOutputRegressor`)


In [None]:

# =====================
# 1) Packages and settings
# =====================
import os, glob, time, math, random
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

import matplotlib.pyplot as plt

# Single seed used for Python's and NumPy's global RNGs and, further down,
# as the base `random_state` of the K-fold splitter and the models.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Longest image side (in pixels) kept by load_image_rgb; larger images are
# downscaled so that their longest side equals this value.
IMG_MAX_SIDE = 512
# Target names; their order defines the column order of the Y matrix and of
# every prediction array built from it.
TARGETS = ["Dry_Green_g","Dry_Dead_g","Dry_Clover_g","GDM_g","Dry_Total_g"]


In [None]:

# =====================
# 2) Data path
# =====================

def guess_base_dir(kaggle_root='/kaggle/input', dataset='csiro-biomass'):
    """Locate the directory that holds ``train.csv`` (and ``test.csv``).

    Args:
        kaggle_root: Directory under which datasets are mounted.
        dataset: Name of the expected dataset sub-directory.

    Returns:
        ``<kaggle_root>/<dataset>`` when both CSV files exist there; otherwise
        the directory of the first (alphabetically) ``train.csv`` found
        anywhere below ``kaggle_root``; otherwise ``'.'``.
    """
    expected = os.path.join(kaggle_root, dataset)
    required = ('train.csv', 'test.csv')
    if all(os.path.exists(os.path.join(expected, name)) for name in required):
        # Return the dataset directory itself (the old code returned '.',
        # which pointed callers at the working directory instead).
        return expected
    # glob order is arbitrary; sort so the fallback choice is deterministic.
    cands = sorted(glob.glob(os.path.join(kaggle_root, '**', 'train.csv'), recursive=True))
    if cands:
        return os.path.dirname(cands[0])
    return '.'

# Dataset root; hard-coded to the Kaggle input mount of this competition.
BASE_DIR = '/kaggle/input/csiro-biomass'
print('BASE_DIR =', BASE_DIR)

TRAIN_CSV = os.path.join(BASE_DIR, 'train.csv')
TEST_CSV  = os.path.join(BASE_DIR, 'test.csv')

# Fail fast with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this validation.
if not os.path.exists(TRAIN_CSV):
    raise FileNotFoundError(f"train.csv олдсонгүй: {TRAIN_CSV}")
if not os.path.exists(TEST_CSV):
    raise FileNotFoundError(f"test.csv олдсонгүй: {TEST_CSV}")

# Load both tables and print their shape plus the first rows as a sanity check.
train_df = pd.read_csv(TRAIN_CSV)
print('train_df:', train_df.shape)
print(train_df.head(3))

test_df = pd.read_csv(TEST_CSV)
print('test_df:', test_df.shape)
print(test_df.head(3))


In [None]:

# =====================
# 3) Img feature
# =====================

def resolve_image_path(image_path: str, base_dir=None):
    """Resolve a CSV ``image_path`` value to a file path on disk.

    Candidates are tried in this order and the first existing one is returned:
      1. ``<base_dir>/<image_path>``
      2. ``<base_dir>/train/<basename>``
      3. ``<base_dir>/test/<basename>``
      4. ``image_path`` as given (relative to the working directory / absolute)

    Args:
        image_path: Path as stored in the CSV.
        base_dir: Dataset root; defaults to the module-level ``BASE_DIR``.

    Returns:
        The first existing candidate, or candidate 1 when none exists (the
        caller is expected to check existence and report the missing file).
    """
    root = BASE_DIR if base_dir is None else base_dir
    primary = os.path.join(root, image_path)
    if os.path.exists(primary):
        return primary
    # Fallback variations: look for the file itself inside train/ and test/.
    # (Previously the bare directory was returned, which is never an image.)
    filename = os.path.basename(image_path)
    for subdir in ('train', 'test'):
        candidate = os.path.join(root, subdir, filename)
        if os.path.exists(candidate):
            return candidate
    if os.path.exists(image_path):
        return image_path
    return primary


def load_image_rgb(path: str, max_side=IMG_MAX_SIDE):
    """Read an image file as a float32 RGB array scaled to [0, 1].

    The image is converted to RGB. If its longest side exceeds ``max_side``
    it is resized with bilinear resampling so that the longest side becomes
    ``max_side`` (aspect ratio kept, sizes rounded to the nearest integer).

    Returns:
        ``np.float32`` array obtained from the PIL image, divided by 255.
    """
    with Image.open(path) as handle:
        picture = handle.convert('RGB')
        width, height = picture.size
        longest = max(width, height)
        if longest > max_side:
            factor = max_side / float(longest)
            new_size = (int(round(width * factor)), int(round(height * factor)))
            picture = picture.resize(new_size, resample=Image.BILINEAR)
        pixels = np.asarray(picture).astype(np.float32) / 255.0
    return pixels


def rgb_to_hsv_np(rgb):
    """Convert an RGB array (last axis = R, G, B) to HSV.

    Args:
        rgb: Floating-point array whose last axis has length 3. Values in
            [0, 1] give hue in [0, 1), saturation in [0, 1], value in [0, 1].

    Returns:
        Array of the same shape with the last axis holding (h, s, v).
        Achromatic pixels (max == min) get hue 0 and saturation 0.
    """
    r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    cmax = np.max(rgb, axis=-1)
    cmin = np.min(rgb, axis=-1)
    # No epsilon here: adding one made every pixel look chromatic, which gave
    # gray pixels a hue of 4/6 and pure black a saturation of 1.
    delta = cmax - cmin
    chromatic = delta > 0

    h = np.zeros_like(cmax)
    # Hue sector depends on which channel is the maximum; where two channels
    # tie for the maximum the formulas agree, so assignment order is harmless.
    idx = (cmax == r) & chromatic
    h[idx] = ((g[idx] - b[idx]) / delta[idx]) % 6
    idx = (cmax == g) & chromatic
    h[idx] = ((b[idx] - r[idx]) / delta[idx]) + 2
    idx = (cmax == b) & chromatic
    h[idx] = ((r[idx] - g[idx]) / delta[idx]) + 4
    h = h / 6.0

    s = np.zeros_like(cmax)
    # Guard the division explicitly instead of biasing it with an epsilon.
    sat = chromatic & (cmax > 0)
    s[sat] = delta[sat] / cmax[sat]

    v = cmax
    return np.stack([h, s, v], axis=-1)


def basic_stats(x, prefix):
    """Summarise an array with mean, std, min, percentiles and max.

    The input is flattened and non-finite entries are dropped; if nothing is
    left, a single 0.0 is used so every statistic is defined.

    Returns:
        Dict with keys ``{prefix}_mean``, ``_std``, ``_min``,
        ``_p01/_p05/_p25/_p50/_p75/_p95/_p99`` and ``_max`` (in that order),
        all Python floats.
    """
    values = x.reshape(-1)
    values = values[np.isfinite(values)]
    if values.size == 0:
        values = np.array([0.0])

    feats = {
        f'{prefix}_mean': float(np.mean(values)),
        f'{prefix}_std': float(np.std(values)),
        f'{prefix}_min': float(np.min(values)),
    }
    feats.update(
        (f'{prefix}_p{q:02d}', float(np.percentile(values, q)))
        for q in (1, 5, 25, 50, 75, 95, 99)
    )
    feats[f'{prefix}_max'] = float(np.max(values))
    return feats


def channel_hist(x, bins=16, prefix='r'):
    """Density histogram of the finite values of ``x`` over the range [0, 1].

    Returns:
        Dict ``{prefix}_hist_{i}`` -> float for ``i`` in ``range(bins)``.
        When ``x`` has no finite values every bin is 0.0.
    """
    flat = x.reshape(-1)
    finite = flat[np.isfinite(flat)]
    if finite.size:
        density, _edges = np.histogram(finite, bins=bins, range=(0.0, 1.0), density=True)
    else:
        density = np.zeros(bins, dtype=np.float32)
    return {f'{prefix}_hist_{k}': float(d) for k, d in enumerate(density)}


def laplacian_var(gray):
    """Variance of a 4-neighbour discrete Laplacian of a 2-D array.

    Neighbours are taken with ``np.roll``, so the image wraps around at its
    borders (the first row/column is a neighbour of the last).
    """
    # (axis, shift) pairs in the order: up, down, left, right.
    offsets = ((0, -1), (0, 1), (1, 1), (1, -1))
    lap = -4.0 * gray
    for axis, shift in offsets:
        lap = lap + np.roll(gray, shift, axis=axis)
    return float(np.var(lap))


def gradient_stats(gray, prefix='grad'):
    """Statistics of the gradient of a 2-D array.

    Returns:
        The ``basic_stats`` summary of the gradient magnitude under
        ``prefix``, plus ``{prefix}_mean_gx`` / ``{prefix}_mean_gy``: the mean
        absolute gradient along axis 1 and axis 0 respectively.
    """
    d_axis1 = np.gradient(gray, axis=1)
    d_axis0 = np.gradient(gray, axis=0)
    magnitude = np.sqrt(d_axis1 * d_axis1 + d_axis0 * d_axis0)

    feats = basic_stats(magnitude, prefix)
    feats[f'{prefix}_mean_gx'] = float(np.mean(np.abs(d_axis1)))
    feats[f'{prefix}_mean_gy'] = float(np.mean(np.abs(d_axis0)))
    return feats


def vegetation_indices(rgb):
    """Per-pixel colour indices (exg, vari, ndi, cive, tgi) and their stats.

    Each index map is summarised with ``basic_stats`` under its own name.
    Two fractions are added: ``green_dom_frac`` (pixels where G exceeds both
    R and B) and ``exg_pos_frac`` (pixels where the exg index is positive).
    """
    r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    eps = 1e-6  # keeps the ratio denominators away from exactly zero

    # Insertion order fixes the order of the resulting feature keys.
    index_maps = {
        'exg': 2*g - r - b,
        'vari': (g - r) / (g + r - b + eps),
        'ndi': (g - r) / (g + r + eps),
        'cive': 0.441*r - 0.811*g + 0.385*b + 18.787,
        'tgi': -0.5*(190*(r-g) - 120*(r-b)),
    }

    feats = {}
    for name, values in index_maps.items():
        feats.update(basic_stats(values, name))

    feats['green_dom_frac'] = float(np.mean((g > r) & (g > b)))
    feats['exg_pos_frac'] = float(np.mean(index_maps['exg'] > 0))
    return feats


def extract_features_from_image(rgb):
    """Build the full hand-crafted feature dict for one RGB image array.

    Features, in insertion order: image size and aspect ratio; per-channel
    R/G/B statistics; per-channel 16-bin histograms; H/S/V statistics;
    gradient statistics and Laplacian variance of the gray image; colour
    index statistics from ``vegetation_indices``.

    Args:
        rgb: Array of shape (H, W, C) whose first three channels are R, G, B.
    """
    height, width, _ = rgb.shape
    feats = {
        'img_h': float(height),
        'img_w': float(width),
        'img_ratio': float(width / max(height, 1)),
    }

    planes = {'r': rgb[..., 0], 'g': rgb[..., 1], 'b': rgb[..., 2]}
    for name, plane in planes.items():
        feats.update(basic_stats(plane, name))
    for name, plane in planes.items():
        feats.update(channel_hist(plane, 16, name))

    hsv = rgb_to_hsv_np(rgb)
    for k, name in enumerate(('h', 's', 'v')):
        feats.update(basic_stats(hsv[..., k], name))

    # Weighted sum of the three channels used as the gray-level image.
    gray = 0.299*planes['r'] + 0.587*planes['g'] + 0.114*planes['b']
    feats.update(gradient_stats(gray, 'grad'))
    feats['lap_var'] = laplacian_var(gray)

    feats.update(vegetation_indices(rgb))
    return feats

# Maps a resolved image path to the feature dict computed for it.
_FEATURE_CACHE = {}

def compute_image_features(image_path: str):
    """Return the feature dict for an image, computing it at most once.

    Args:
        image_path: Image path as stored in the CSV; resolved with
            ``resolve_image_path``.

    Returns:
        A fresh dict copy of the cached features, so callers may add keys
        (e.g. ``image_path``) without altering the cache entry.

    Raises:
        FileNotFoundError: If the resolved path does not exist and no cached
            entry is available for it.
    """
    abspath = resolve_image_path(image_path)
    cached = _FEATURE_CACHE.get(abspath)
    if cached is None:
        if not os.path.exists(abspath):
            raise FileNotFoundError(f"Зураг олдсонгүй: {abspath}")
        cached = extract_features_from_image(load_image_rgb(abspath))
        _FEATURE_CACHE[abspath] = cached
    # Copy on the way out: handing back the cached object let callers mutate
    # the shared entry.
    return dict(cached)


In [None]:

# =====================
# 4) Train matrices
# =====================

# Compute features once per distinct training image.
uniq_images = train_df[['image_path']].drop_duplicates().reset_index(drop=True)
n_train_images = len(uniq_images)
rows = []
start = time.time()
for i, path in enumerate(uniq_images['image_path'], start=1):
    if i % 100 == 0:
        print(f"Features: {i}/{n_train_images}")
    feats = compute_image_features(path)
    feats['image_path'] = path
    rows.append(feats)
print(f"Feature extraction finished in {time.time()-start:.1f}s for {n_train_images} images")

features_df = pd.DataFrame(rows)

# Long -> wide: one row per image, one column per target name (mean of
# duplicates); targets absent from the data become all-NaN columns.
piv = train_df.pivot_table(
    index='image_path',
    columns='target_name',
    values='target',
    aggfunc='mean',
).reset_index()
absent_targets = [t for t in TARGETS if t not in piv.columns]
for t in absent_targets:
    piv[t] = np.nan

full_df = features_df.merge(piv, how='inner', on='image_path')
print('full_df:', full_df.shape)

# Every column that is neither the key nor a target is a model feature.
non_feature_cols = {'image_path', *TARGETS}
feature_cols = [c for c in full_df.columns if c not in non_feature_cols]

Y = full_df[TARGETS].values
# Models are trained on log1p of the targets clipped at zero.
Y_log = np.log1p(np.clip(Y, a_min=0, a_max=None))
X = full_df[feature_cols].values.astype(np.float32)
print('X:', X.shape, 'Y:', Y.shape)


In [None]:

# =====================
# 5) 5-Fold OOF — Base models & Meta blender (Stacking)
# =====================

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

n = X.shape[0]
M = 4  # number of base models
# Out-of-fold predictions of every base model, in log1p space:
# axis 0 = sample, axis 1 = target, axis 2 = base model (rf, et, ridge, gbr).
OOF_LOG = np.zeros((n, len(TARGETS), M), dtype=np.float32)

# Settings shared by the two tree ensembles inside the folds.
fold_forest_kwargs = dict(n_estimators=500, max_depth=None, min_samples_leaf=2, n_jobs=-1)

for fold_idx, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
    print(f"\n===== Fold {fold_idx}/{n_splits} =====")
    X_tr, Y_tr = X[tr_idx], Y_log[tr_idx]
    X_va, Y_va = X[va_idx], Y_log[va_idx]

    # Base-1: RandomForest (multioutput)
    rf = RandomForestRegressor(random_state=SEED+fold_idx, **fold_forest_kwargs)
    pred_rf = rf.fit(X_tr, Y_tr).predict(X_va)

    # Base-2: ExtraTrees (multioutput)
    et = ExtraTreesRegressor(random_state=SEED+100+fold_idx, **fold_forest_kwargs)
    pred_et = et.fit(X_tr, Y_tr).predict(X_va)

    # Base-3: Ridge on standardised features
    rg = Pipeline(steps=[
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('ridge', Ridge(alpha=1.0, random_state=SEED+fold_idx)),
    ])
    pred_rg = rg.fit(X_tr, Y_tr).predict(X_va)

    # Base-4: GradientBoosting, one regressor per target via the wrapper
    gbr_base = GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        subsample=1.0,
        random_state=SEED+fold_idx,
    )
    gbr = MultiOutputRegressor(gbr_base)
    pred_gb = gbr.fit(X_tr, Y_tr).predict(X_va)

    # Store this fold's validation predictions, one slot per base model.
    for m, fold_pred in enumerate((pred_rf, pred_et, pred_rg, pred_gb)):
        OOF_LOG[va_idx, :, m] = fold_pred

    # Quick fold metric: plain average of the four models in log space,
    # scored as RMSE on the original scale.
    pred_avg = np.expm1((pred_rf + pred_et + pred_rg + pred_gb) / 4.0)
    truth_va = np.expm1(Y_va)
    fold_rmse = [
        math.sqrt(mean_squared_error(truth_va[:, j], pred_avg[:, j]))
        for j in range(len(TARGETS))
    ]
    print('Fold simple-avg mean RMSE:', np.mean(fold_rmse))

# Train meta blender (one Ridge per target) on OOF base predictions
def _make_meta_blender():
    """Return a fresh, unfitted scaler + Ridge pipeline for one target."""
    return Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('ridge', Ridge(alpha=1.0, random_state=SEED))
    ])


META = []  # fitted (scaler, ridge) pipeline per target, used at inference
OOF_final = np.zeros((n, len(TARGETS)), dtype=np.float32)

for j, t in enumerate(TARGETS):
    Z = OOF_LOG[:, j, :]  # (n_samples, M)
    y_j = Y_log[:, j]

    # Cross-fit the blender for the reported metric: each row is predicted by
    # a blender that was not fitted on it. Fitting and predicting on the same
    # rows (as before) reports an in-sample error for the meta layer.
    for meta_tr, meta_va in kf.split(Z):
        fold_meta = _make_meta_blender().fit(Z[meta_tr], y_j[meta_tr])
        OOF_final[meta_va, j] = np.expm1(fold_meta.predict(Z[meta_va]))

    # The blender kept for test-time stacking is fitted on all rows.
    META.append(_make_meta_blender().fit(Z, y_j))

# OOF metrics for meta-stacked predictions
oof_rmse = []
for j, t in enumerate(TARGETS):
    rmse = math.sqrt(mean_squared_error(np.expm1(Y_log[:, j]), OOF_final[:, j]))
    oof_rmse.append(rmse)
    print(f"OOF {t}: RMSE = {rmse:.4f}")
print(f"OOF mean RMSE (Stacked) = {np.mean(oof_rmse):.4f}")


In [None]:

# =====================
# 6) Full-fit base models on all training data + test inference (stacked)
# =====================

# 6.1 Refit every base model on the entire training set
final_forest_kwargs = dict(n_estimators=700, max_depth=None, min_samples_leaf=2, n_jobs=-1)

rf_final = RandomForestRegressor(random_state=SEED, **final_forest_kwargs)
rf_final.fit(X, Y_log)

et_final = ExtraTreesRegressor(random_state=SEED+111, **final_forest_kwargs)
et_final.fit(X, Y_log)

# Ridge on standardised features
rg_final = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('ridge', Ridge(alpha=1.0, random_state=SEED)),
])
rg_final.fit(X, Y_log)

# One gradient-boosting regressor per target via the multi-output wrapper
gbr_final_base = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=3,
    subsample=1.0,
    random_state=SEED,
)
gbr_final = MultiOutputRegressor(gbr_final_base)
gbr_final.fit(X, Y_log)

print('Full-fit base models trained.')

# 6.2 Compute features once per distinct test image
uniq_test = test_df[['image_path']].drop_duplicates().reset_index(drop=True)
n_test_images = len(uniq_test)
rows = []
start = time.time()
for i, path in enumerate(uniq_test['image_path'], start=1):
    if i % 100 == 0:
        print(f"Test features: {i}/{n_test_images}")
    feats = compute_image_features(path)
    feats['image_path'] = path
    rows.append(feats)
print(f"Test feature extraction in {time.time()-start:.1f}s for {n_test_images} images")

feat_test = pd.DataFrame(rows)
# Any training feature missing from the test frame is added as a 0.0 column
# so the matrix has exactly the training column layout.
absent_cols = [col for col in feature_cols if col not in feat_test.columns]
for c in absent_cols:
    feat_test[c] = 0.0
X_test = feat_test[feature_cols].values.astype(np.float32)

# 6.3 Base-model predictions in log1p space (same model order as in OOF_LOG)
log_rf, log_et, log_rg, log_gb = (
    model.predict(X_test)
    for model in (rf_final, et_final, rg_final, gbr_final)
)

# 6.4 Feed the base predictions through each target's meta blender
Z_test_all = np.stack([log_rf, log_et, log_rg, log_gb], axis=2)  # (n_test, n_targets, M)
final_log = np.zeros_like(log_rf)

for j in range(len(TARGETS)):
    blender = META[j]
    final_log[:, j] = blender.predict(Z_test_all[:, j, :])  # input is (n_test, M)

# Back to the original scale, then floor at zero for safety.
pred_test = np.clip(np.expm1(final_log), a_min=0.0, a_max=None)

# Wide predictions: one row per test image, one column per target.
pred_df = pd.DataFrame(pred_test, columns=TARGETS)
pred_df['image_path'] = feat_test['image_path']

# 6.5 Map to submission rows
sub = test_df.merge(pred_df, how='left', on='image_path')

# For every submission row pick the prediction column named by its
# target_name; names outside TARGETS fall back to the last target column.
target_pos = {t: k for k, t in enumerate(TARGETS)}
picked = (
    sub['target_name']
    .map(target_pos)
    .fillna(len(TARGETS) - 1)
    .astype(int)
    .to_numpy()
)
values = sub[TARGETS].to_numpy()[np.arange(len(sub)), picked].tolist()

submission = pd.DataFrame({'sample_id': sub['sample_id'], 'target': values})

out_path = 'submission.csv'
submission.to_csv(out_path, index=False)
print('Saved:', out_path, 'shape =', submission.shape)
print(submission.head(10))


In [None]:

# =====================
# 7) Quick visualization (optional)
# =====================
# Show up to six randomly sampled test images in a 2x3 grid. This is a
# best-effort preview: any failure is reported and the cell is skipped.
try:
    samp = test_df.sample(n=min(6, len(test_df)), random_state=SEED)
    fig, axes = plt.subplots(2, 3, figsize=(12, 8))
    axes = axes.ravel()
    for ax, p in zip(axes, samp['image_path'].tolist()[:6]):
        im = load_image_rgb(resolve_image_path(p), max_side=512)
        ax.imshow(im)
        ax.set_title(os.path.basename(p))
        ax.axis('off')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print('Viz skipped:', e)
