In [None]:
import os, sys, math
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,ID1011485656__Dry_Green_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,ID1011485656__Dry_Total_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,ID1011485656__GDM_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.275


In [6]:
# AI generated code, just for testing:
# Full functioning baseline pipeline — single cell
# Requirements: pandas, numpy, pillow, scikit-learn, tqdm
# Optional: lightgbm (preferred). If not present, RandomForest will be used.
# Place train.csv and test.csv in the working dir. Image paths will be taken from CSV if available,
# otherwise the cell will try IMAGE_DIR/{image_id}.jpg

import os, sys, math
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# -------- USER SETTINGS (edit if needed) --------
IMAGE_DIR = "images"          # fallback dir (tries CSV image_path first)
IMG_RESIZE = (400, 200)       # downscale for speed (width, height)
RANDOM_SEED = 42
USE_LIGHTGBM = True           # try to use LightGBM; will auto-fallback if not installed
LGBM_PARAMS = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "min_child_samples": 20,
    "n_jobs": -1,
    "random_state": RANDOM_SEED,
}
RF_PARAMS = {"n_estimators": 200, "n_jobs": -1, "random_state": RANDOM_SEED}
# ------------------------------------------------

# load csvs
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# derive image_id robustly
def derive_image_id_from_row(row):
    # sample_id like ID123__Dry_Green_g
    if "sample_id" in row and isinstance(row["sample_id"], str) and "__" in row["sample_id"]:
        return row["sample_id"].split("__")[0]
    # image_path like "train/ID123.jpg"
    if "image_path" in row and isinstance(row["image_path"], str) and len(str(row["image_path"]).strip())>0:
        return os.path.splitext(os.path.basename(row["image_path"]))[0]
    # fallback to index string
    return str(row.name)

train = train.copy()
test  = test.copy()
train["image_id"] = train.apply(derive_image_id_from_row, axis=1)
test["image_id"]  = test.apply(derive_image_id_from_row, axis=1)

# Validate expected long-format columns in train
if not {"target_name", "target"}.issubset(set(train.columns)):
    raise ValueError("train.csv must contain 'target_name' and 'target' columns (one row per image+target).")

targets = ["Dry_Green_g","Dry_Dead_g","Dry_Clover_g","GDM_g","Dry_Total_g"]

# pivot train to wide: one row per image_id with 5 target columns
train_wide = train.pivot_table(index="image_id", columns="target_name", values="target", aggfunc="first").reset_index()
# ensure all target columns exist
for t in targets:
    if t not in train_wide.columns:
        train_wide[t] = np.nan

# collect a single metadata row per image (first occurrence)
meta_columns = [c for c in ["image_path","Pre_GSHH_NDVI","Height_Ave_cm","Sampling_Date","State","Species"] if c in train.columns]
meta = train.drop_duplicates(subset=["image_id"]).set_index("image_id")[meta_columns].reset_index()
train_wide = train_wide.merge(meta, on="image_id", how="left")

# For test, build test_images with unique image_id + image_path if present
test_meta_cols = [c for c in ["image_path","Pre_GSHH_NDVI","Height_Ave_cm","Sampling_Date","State","Species"] if c in test.columns]
if len(test_meta_cols) > 0:
    test_images = test.drop_duplicates(subset=["image_id"]).set_index("image_id")[test_meta_cols].reset_index()
else:
    test_images = pd.DataFrame({"image_id": test["image_id"].unique()})

# Build path_map (image_id -> image_path) from available CSV rows
path_map = {}
for df in (train, test):
    if "image_path" in df.columns:
        for _, r in df.iterrows():
            imgid = r["image_id"]
            p = r["image_path"]
            if pd.isna(p) or str(p).strip()=="":
                continue
            path_map[imgid] = str(p)

# --- Image feature extractor ---
def extract_image_features_by_path_or_id(img_id, resize=IMG_RESIZE, image_dir=IMAGE_DIR):
    # candidates to try
    candidates = []
    if img_id in path_map:
        candidates.append(path_map[img_id])
        candidates.append(os.path.basename(path_map[img_id]))
    candidates.append(os.path.join(image_dir, f"{img_id}.jpg"))
    candidates.append(os.path.join(image_dir, f"{img_id}.JPG"))
    candidates.append(f"{img_id}.jpg")
    candidates.append(f"{img_id}.JPG")
    img = None
    used_path = None
    for c in candidates:
        try:
            img = Image.open(c).convert("RGB")
            used_path = c
            break
        except Exception:
            img = None
    if img is None:
        # fallback zeros
        return {
            "r_mean": 0.0, "g_mean": 0.0, "b_mean": 0.0,
            "r_std": 0.0, "g_std": 0.0, "b_std": 0.0,
            "exg": 0.0, "brightness": 0.0, "g_ratio": 0.0, "p90_g": 0.0,
            "img_found": 0
        }
    img = img.resize(resize)
    arr = np.asarray(img).astype(np.float32)
    r = arr[...,0]; g = arr[...,1]; b = arr[...,2]
    r_mean = float(r.mean()); g_mean = float(g.mean()); b_mean = float(b.mean())
    r_std = float(r.std()); g_std = float(g.std()); b_std = float(b.std())
    exg = float((2.0*g - r - b).mean())
    brightness = float(arr.mean())
    denom = (r_mean + b_mean)/2.0 + 1e-6
    g_ratio = float(g_mean / denom)
    p90_g = float(np.percentile(g, 90))
    return {
        "r_mean": r_mean, "g_mean": g_mean, "b_mean": b_mean,
        "r_std": r_std, "g_std": g_std, "b_std": b_std,
        "exg": exg, "brightness": brightness, "g_ratio": g_ratio, "p90_g": p90_g,
        "img_found": 1
    }

# compute image features for train_wide images
train_ids = train_wide["image_id"].tolist()
train_img_feats = []
for iid in tqdm(train_ids, desc="Extract train image features"):
    f = extract_image_features_by_path_or_id(iid)
    f["image_id"] = iid
    train_img_feats.append(f)
train_img_feats = pd.DataFrame(train_img_feats)

# compute image features for test images
test_ids = test_images["image_id"].tolist()
test_img_feats = []
for iid in tqdm(test_ids, desc="Extract test image features"):
    f = extract_image_features_by_path_or_id(iid)
    f["image_id"] = iid
    test_img_feats.append(f)
test_img_feats = pd.DataFrame(test_img_feats)

# merge image features into train_wide and test_images
train_final = train_wide.merge(train_img_feats, on="image_id", how="left")
test_final  = test_images.merge(test_img_feats, on="image_id", how="left")

# --- Build feature table ---
# Use numeric columns + selected metadata (Pre_GSHH_NDVI, Height_Ave_cm) + label-encoded State/Species
def prepare_features(df, fit_encoders=None):
    df2 = df.copy()
    # numeric features from dataframe
    numeric = df2.select_dtypes(include=[np.number]).copy()
    # fill NaNs
    numeric = numeric.fillna(0.0)
    # label encode State and Species
    encoders = {} if fit_encoders is None else fit_encoders
    for col in ["State","Species"]:
        if col in df2.columns:
            if fit_encoders is None:
                le = LabelEncoder()
                numeric[col+"_le"] = le.fit_transform(df2[col].fillna("NA").astype(str))
                encoders[col] = le
            else:
                le = encoders.get(col, None)
                if le is not None:
                    numeric[col+"_le"] = le.transform(df2[col].fillna("NA").astype(str))
                else:
                    numeric[col+"_le"] = 0
    return numeric, encoders

X_all, encs = prepare_features(train_final, fit_encoders=None)
X_test_all, _ = prepare_features(test_final, fit_encoders=encs)

# Ensure X_test has same columns in same order
for c in X_all.columns:
    if c not in X_test_all.columns:
        X_test_all[c] = 0.0
X_test_all = X_test_all[X_all.columns]

y = train_final[targets].copy()

print("Train rows (images):", X_all.shape[0], "features:", X_all.shape[1])
print("Test rows (images):", X_test_all.shape[0], "features:", X_test_all.shape[1])

# train/validation split (by image)
X_tr, X_val, y_tr, y_val = train_test_split(X_all, y, test_size=0.20, random_state=RANDOM_SEED)

# Try to import LightGBM
use_lgbm = False
if USE_LIGHTGBM:
    try:
        import lightgbm as lgb
        from lightgbm import LGBMRegressor
        use_lgbm = True
        print("LightGBM available — using LGBMRegressor per target.")
    except Exception as e:
        print("LightGBM not available, falling back to RandomForest. (Import error: {})".format(e))

# Build per-target regressors
models = {}
if use_lgbm:
    # train a separate LGBM for each target (simple, without early stopping to keep runtime bounded)
    for t in targets:
        print(f"Training LGBM for target: {t}")
        m = LGBMRegressor(**LGBM_PARAMS)
        m.fit(X_tr, y_tr[t], eval_set=[(X_val, y_val[t])], verbose=100)
        models[t] = m
else:
    # MultiOutput RandomForest fallback
    base = RandomForestRegressor(**RF_PARAMS)
    mor = MultiOutputRegressor(base)
    print("Training MultiOutput RandomForest...")
    mor.fit(X_tr, y_tr)
    for i,t in enumerate(targets):
        models[t] = mor.estimators_[i]  # estimator per target

# Predict on validation
y_val_pred = pd.DataFrame({t: models[t].predict(X_val) for t in targets}, index=y_val.index)

# Print per-target R^2
print("\nPer-target R² on validation:")
for t in targets:
    r2 = r2_score(y_val[t], y_val_pred[t])
    print(f"  {t}: {r2:.4f}")

# Weighted R^2 (competition weights)
weights = {"Dry_Green_g":0.1,"Dry_Dead_g":0.1,"Dry_Clover_g":0.1,"GDM_g":0.2,"Dry_Total_g":0.5}
def weighted_r2(y_true_df, y_pred_df, weights_map):
    arr_true = []
    arr_pred = []
    arr_w = []
    for t in y_true_df.columns:
        arr_true.append(y_true_df[t].values)
        arr_pred.append(y_pred_df[t].values)
        arr_w.append(np.full_like(y_true_df[t].values, fill_value=weights_map.get(t,1.0), dtype=float))
    y_true_flat = np.concatenate(arr_true)
    y_pred_flat = np.concatenate(arr_pred)
    w_flat = np.concatenate(arr_w)
    mu = np.sum(w_flat * y_true_flat) / (np.sum(w_flat) + 1e-12)
    ss_res = np.sum(w_flat * (y_true_flat - y_pred_flat)**2)
    ss_tot = np.sum(w_flat * (y_true_flat - mu)**2)
    return 1.0 - ss_res / (ss_tot + 1e-12)

w_r2 = weighted_r2(y_val.reset_index(drop=True), y_val_pred.reset_index(drop=True), weights)
print(f"\nWeighted R² on validation: {w_r2:.6f}")

# Predict on test set
print("Predicting test set...")
test_preds = np.column_stack([models[t].predict(X_test_all) for t in targets])  # shape n_test x 5

# Build long-format submission: sample_id = "{image_id}__{target}"
sub_rows = []
for i, img_id in enumerate(test_ids):
    for j, t in enumerate(targets):
        sample_id = f"{img_id}__{t}"
        val = float(test_preds[i, j])
        # Ensure non-negative prediction (biomass cannot be negative)
        if np.isnan(val):
            val = 0.0
        if val < 0:
            val = 0.0
        sub_rows.append([sample_id, val])

submission = pd.DataFrame(sub_rows, columns=["sample_id", "target"])
submission.to_csv("submission.csv", index=False)
print("Wrote submission.csv with", len(submission), "rows. First 6 rows:")
print(submission.head(6).to_string(index=False))


Extract train image features: 100%|██████████████████████████████████████████████████| 357/357 [01:01<00:00,  5.83it/s]
Extract test image features: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.51it/s]


Train rows (images): 357 features: 20
Test rows (images): 1 features: 20
LightGBM not available, falling back to RandomForest. (Import error: No module named 'lightgbm')
Training MultiOutput RandomForest...

Per-target R² on validation:
  Dry_Green_g: 0.9981
  Dry_Dead_g: 0.9952
  Dry_Clover_g: 0.9952
  GDM_g: 0.9990
  Dry_Total_g: 0.9993

Weighted R² on validation: 0.999150
Predicting test set...
Wrote submission.csv with 5 rows. First 6 rows:
                 sample_id   target
 ID1001187975__Dry_Green_g 0.035444
  ID1001187975__Dry_Dead_g 0.000000
ID1001187975__Dry_Clover_g 0.000500
       ID1001187975__GDM_g 2.372839
 ID1001187975__Dry_Total_g 4.313450
