Train several candidate models (Ridge, ElasticNet, and XGBoost), tune each with a grid search, and select the best one via cross-validated RMSE.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

# Input files and column configuration.
train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
target_col = "CORRUCYSTIC_DENSITY"
id_col = "LOCAL_IDENTIFIER"
# Exact column names. The second name is a raw string because "\!" is not a
# valid escape sequence: a plain literal only works by accident and emits a
# warning on current Python versions. The runtime value (backslash included)
# is unchanged.
# NOTE(review): confirm the CSV header really contains the backslash and that
# it is not an export artifact.
feature_cols = ["jNhEum", r"T\!", "b1oRb13", "~7*"]

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# Coerce features and target to numeric; unparseable entries become NaN.
X = train_df[feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(train_df[target_col], errors="coerce")
X_test = test_df[feature_cols].apply(pd.to_numeric, errors="coerce")

# Drop training rows whose target is missing; they cannot be learned from.
mask_y = y.notna()
X, y = X.loc[mask_y].copy(), y.loc[mask_y].copy()

# Impute missing features with per-column training medians. A column that is
# entirely NaN has no median, so it falls back to 0.0.
train_medians = X.median(numeric_only=True).fillna(0.0)
X = X.fillna(train_medians)

# Defensive: no NaN should remain after the median fill, but drop any row
# that still has one so the estimators never see missing values.
row_mask = X.notna().all(axis=1)
X, y = X.loc[row_mask], y.loc[row_mask]

# Prepare the test set with the *training* medians (no statistics are taken
# from the test data), then zero-fill anything still missing.
X_test = X_test.fillna(train_medians)
X_test = X_test.fillna(0.0)

# Modelling imports: polynomial features, linear models (Ridge, ElasticNet), grid search, and XGBoost
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor


# Define RMSE and scorer
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``
    called with its default settings.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Scorer built on rmse(). greater_is_better=False marks it as a loss, so
# scikit-learn reports the negated value and "higher is better" still holds
# during model selection.
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)

# Repeated K-fold splitter: 5 folds x 3 repeats, seeded for reproducibility.
rkf = RepeatedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=42,
)

# Registry of (name, estimator, param_grid) triples to be grid-searched.
candidates = []

# Candidate: standardize -> polynomial expansion -> Ridge regression.
ridge_pipe = make_pipeline(
    *[
        StandardScaler(with_mean=True, with_std=True),
        PolynomialFeatures(include_bias=False),
        Ridge(),
    ]
)
# Search polynomial degree 1-3 and 13 log-spaced alphas from 1e-3 to 1e3.
ridge_grid = dict(
    polynomialfeatures__degree=[1, 2, 3],
    ridge__alpha=np.logspace(-3, 3, 13),
)
candidates.append(("ridge_poly", ridge_pipe, ridge_grid))

# Candidate: standardize -> polynomial expansion -> ElasticNet regression.
# max_iter is raised to 10000 for the ElasticNet solver.
enet_pipe = make_pipeline(
    *[
        StandardScaler(with_mean=True, with_std=True),
        PolynomialFeatures(include_bias=False),
        ElasticNet(max_iter=10000),
    ]
)
# Degree is capped at 2 here (Ridge goes up to 3) to keep the expanded
# feature set modest; alpha spans 9 log-spaced values from 1e-3 to 1e1.
enet_grid = dict(
    polynomialfeatures__degree=[1, 2],
    elasticnet__alpha=np.logspace(-3, 1, 9),
    elasticnet__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)
candidates.append(("elasticnet_poly", enet_pipe, enet_grid))

# Candidate: gradient-boosted trees, used directly (no scaler or polynomial
# expansion in front of it). The constructor values for n_estimators,
# max_depth, learning_rate, subsample, colsample_bytree and reg_lambda are
# only starting points; the grid below overrides each of them.
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=600,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    # Single-threaded on purpose: the grid search that consumes this estimator
    # already parallelizes across fits with n_jobs=-1. Letting every one of
    # those workers also start a thread per core oversubscribes the CPU and
    # slows the search down. Results are unaffected.
    n_jobs=1,
)
# 3 * 3 * 3 * 2 * 2 * 3 = 324 parameter combinations.
xgb_grid = {
    "n_estimators": [400, 800, 1200],
    "max_depth": [3, 4, 6],
    "learning_rate": [0.05, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "reg_lambda": [0.5, 1.0, 2.0],
}
candidates.append(("xgb", xgb, xgb_grid))

# Running record of the best candidate seen so far (lowest CV RMSE).
best_name = None
best_search = None
best_rmse = np.inf

for name, est, grid_params in candidates:
    search_kwargs = dict(
        estimator=est,
        param_grid=grid_params,
        scoring=rmse_scorer,
        cv=rkf,
        n_jobs=-1,
        refit=True,
    )
    gs = GridSearchCV(**search_kwargs)
    gs.fit(X, y)

    # The scorer reports negated RMSE, so flip the sign back for display
    # and comparison.
    mean_rmse = -gs.best_score_
    print(f"[{name}] best params: {gs.best_params_}")
    print(f"[{name}] CV RMSE: {mean_rmse:.4f}")

    # Only a strictly better score replaces the incumbent; ties keep the
    # earlier candidate.
    if not (mean_rmse < best_rmse):
        continue
    best_name, best_search, best_rmse = name, gs, mean_rmse

# Winning estimator, already refit on all of X, y because refit=True.
best_pipe = best_search.best_estimator_
print(f"\nSelected model: {best_name} with CV RMSE={best_rmse:.4f}")

# Cross-validated R² of the selected model on the same repeated K-fold splits.
# cross_val_score fits clones, so best_pipe itself is left untouched here.
r2_scores = cross_val_score(best_pipe, X, y, scoring="r2", cv=rkf, n_jobs=-1)
print(f"CV R² (best): mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")

# Fit on full cleaned training data. GridSearchCV(refit=True) has already done
# this for best_estimator_; the explicit call repeats it so the final training
# step is visible here.
best_pipe.fit(X, y)

# Inspect learned coefficients if linear with polynomial features
if best_name in {"ridge_poly", "elasticnet_poly"}:
    poly = best_pipe.named_steps["polynomialfeatures"]
    # With default scikit-learn output settings StandardScaler hands
    # PolynomialFeatures a bare array, so calling get_feature_names_out()
    # without arguments yields generic "x0", "x1", ... labels. Passing the
    # real column names makes the coefficient table readable.
    feature_names = poly.get_feature_names_out(list(X.columns))
    # The final pipeline step is named after the estimator class in lowercase.
    est = best_pipe.named_steps[
        "ridge" if best_name == "ridge_poly" else "elasticnet"
    ]
    coefs = pd.Series(est.coef_, index=feature_names, name="coefficient")
    print("\nTop-weighted polynomial features (after standardization):")
    # Rank by absolute magnitude so strong negative weights are shown too.
    print(coefs.sort_values(key=np.abs, ascending=False).head(20))

# Predict on the prepared test features and assemble the submission table:
# one identifier column followed by the predicted target.
preds = best_pipe.predict(X_test)
test_ids = test_df[id_col].values
submission = pd.DataFrame({id_col: test_ids, target_col: preds})
submission = submission[[id_col, target_col]]  # pin the column order

# Write the predictions as CSV without the DataFrame index.
out_file = "corrucystic_density_predictions16.csv"
submission.to_csv(out_file, index=False)
print(f"\nWrote submission file: {out_file}")