Adding Random forest and ExtraTrees models to the comparison.

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error

train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
target_col = "CORRUCYSTIC_DENSITY"
id_col = "LOCAL_IDENTIFIER"
feature_cols = ["jNhEum"]  # exact column names

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

X = train_df[feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(train_df[target_col], errors="coerce")
X_test = test_df[feature_cols].apply(pd.to_numeric, errors="coerce")


# Remove outliers using a hybrid Z-score (features) + IQR (target) rule
# - Z-score threshold on features: 3.0
# - IQR factor on target: 1.5 (Tukey's rule)
# This typically trims extreme rows that destabilize RMSE.
feat_mean = X.mean()
feat_std = X.std(ddof=0).replace(0, 1.0)  # guard against zero std to avoid inf
z_scores = (X - feat_mean) / feat_std
z_keep = (z_scores.abs() <= 3.0).all(axis=1)

q1, q3 = y.quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
y_keep = y.between(lower, upper)

outlier_keep_mask = z_keep & y_keep
removed = len(X) - int(outlier_keep_mask.sum())
if removed > 0:
    print(
        f"[Outlier filter] Removed {removed} rows "
        f"({removed / len(outlier_keep_mask) * 100:.2f}% of cleaned training)."
    )
X, y = X.loc[outlier_keep_mask].copy(), y.loc[outlier_keep_mask].copy()

mask_y = y.notna()
X, y = X.loc[mask_y].copy(), y.loc[mask_y].copy()

train_medians = X.median(numeric_only=True).fillna(0.0)
X = X.fillna(train_medians)
row_mask = X.notna().all(axis=1)
X, y = X.loc[row_mask], y.loc[row_mask]
# Prepare test set:
# - Replace NaNs with the same training medians
X_test = X_test.fillna(train_medians)
X_test = X_test.fillna(0.0)


from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Cross-validation setup
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Candidate models
models = []

# Ridge with polynomial features
ridge_pipe = make_pipeline(
    RobustScaler(with_centering=True, with_scaling=True, unit_variance=False),
    PolynomialFeatures(include_bias=False),
    Ridge(),
)
ridge_grid = {
    "polynomialfeatures__degree": [1, 2, 3, 4],
    "ridge__alpha": np.logspace(-4, 4, 17),
}
models.append(("RidgePoly", ridge_pipe, ridge_grid, "grid"))

# Lasso
lasso_pipe = make_pipeline(
    RobustScaler(with_centering=True, with_scaling=True, unit_variance=False),
    PolynomialFeatures(include_bias=False),
    Lasso(max_iter=10000),
)
lasso_grid = {
    "polynomialfeatures__degree": [1, 2, 3],
    "lasso__alpha": np.logspace(-5, 1, 13),
}
models.append(("LassoPoly", lasso_pipe, lasso_grid, "grid"))

# ElasticNet
enet_pipe = make_pipeline(
    RobustScaler(with_centering=True, with_scaling=True, unit_variance=False),
    PolynomialFeatures(include_bias=False),
    ElasticNet(max_iter=10000),
)
enet_grid = {
    "polynomialfeatures__degree": [1, 2, 3],
    "elasticnet__alpha": np.logspace(-5, 1, 13),
    "elasticnet__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
}
models.append(("ElasticNetPoly", enet_pipe, enet_grid, "grid"))

# XGBoost
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    random_state=42,
    tree_method="hist",
    n_jobs=-1,
)
xgb_param_dist = {
    "n_estimators": [300, 500, 800, 1000],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "max_depth": [3, 4, 5, 6],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [0.0, 0.1, 1.0, 5.0, 10.0],
    "reg_alpha": [0.0, 0.001, 0.01, 0.1, 1.0],
}
models.append(("XGB", xgb, xgb_param_dist, "rand"))

# RandomForest
rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1,
    oob_score=False,
)
rf_param_dist = {
    "n_estimators": [300, 500, 700, 900],
    "max_depth": [None, 6, 8, 10, 12],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["log2", "sqrt", 0.3, 0.5, 0.8],
    "bootstrap": [True, False],
}
models.append(("RF", rf, rf_param_dist, "rand"))

# ExtraTrees
et = ExtraTreesRegressor(
    n_estimators=700,
    random_state=42,
    n_jobs=-1,
)
et_param_dist = {
    "n_estimators": [400, 600, 800, 1000],
    "max_depth": [None, 6, 8, 10, 12],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["log2", "sqrt", 0.3, 0.5, 0.8],
    "bootstrap": [False],  # ExtraTrees typically without bootstrap
}
models.append(("ExtraTrees", et, et_param_dist, "rand"))

best_name, best_est, best_cv_rmse = None, None, np.inf

for name, est, params, mode in models:
    if mode == "grid":
        search = GridSearchCV(
            est,
            param_grid=params,
            scoring=rmse_scorer,
            cv=rkf,
            n_jobs=-1,
            refit=True,
        )
    else:
        search = RandomizedSearchCV(
            est,
            param_distributions=params,
            n_iter=25,
            scoring=rmse_scorer,
            cv=rkf,
            n_jobs=-1,
            refit=True,
            random_state=42,
        )
    search.fit(X, y)
    mean_rmse = -search.best_score_
    print(f"[{name}] best params: {search.best_params_}")
    print(f"[{name}] CV RMSE: mean={mean_rmse:.4f}")
    if mean_rmse < best_cv_rmse:
        best_cv_rmse = mean_rmse
        best_name = name
        best_est = search.best_estimator_

print(f"\nSelected model: {best_name} with CV RMSE={best_cv_rmse:.4f}")

# Evaluate R^2 for the selected model
r2_scores = cross_val_score(best_est, X, y, scoring="r2", cv=rkf, n_jobs=-1)
print(f"CV R² (selected): mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")

# Fit on full training data
best_est.fit(X, y)

# Predict and build submission
preds = best_est.predict(X_test)
submission = pd.DataFrame({id_col: test_df[id_col].values, target_col: preds})[
    [id_col, target_col]
]

out_file = "corrucystic_density_predictions22.csv"
submission.to_csv(out_file, index=False)
print(f"\nWrote submission file: {out_file}")

[Outlier filter] Removed 1860 rows (15.50% of cleaned training).
[RidgePoly] best params: {'polynomialfeatures__degree': 3, 'ridge__alpha': np.float64(3162.2776601683795)}
[RidgePoly] CV RMSE: mean=188.1077
[LassoPoly] best params: {'lasso__alpha': np.float64(1e-05), 'polynomialfeatures__degree': 1}
[LassoPoly] CV RMSE: mean=188.1097
[ElasticNetPoly] best params: {'elasticnet__alpha': np.float64(0.31622776601683794), 'elasticnet__l1_ratio': 0.1, 'polynomialfeatures__degree': 3}
[ElasticNetPoly] CV RMSE: mean=188.1078
[XGB] best params: {'subsample': 0.8, 'reg_lambda': 0.0, 'reg_alpha': 0.001, 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
[XGB] CV RMSE: mean=187.8730
[RF] best params: {'n_estimators': 900, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.3, 'max_depth': 10, 'bootstrap': True}
[RF] CV RMSE: mean=187.2875
[ExtraTrees] best params: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'l