Update of attempt_8: replace ridge regression with a random forest regressor and tune its hyperparameters with GridSearchCV.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

# --- Configuration -----------------------------------------------------------
train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
target_col = "CORRUCYSTIC_DENSITY"
id_col = "LOCAL_IDENTIFIER"
feature_cols = ["jNhEum"]  # exact column names

# --- Load and coerce to numeric ----------------------------------------------
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# errors="coerce" turns non-numeric entries into NaN so they can be handled
# as ordinary missing values below.
X = train_df[feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(train_df[target_col], errors="coerce")
X_test = test_df[feature_cols].apply(pd.to_numeric, errors="coerce")

# Drop training rows with a missing feature OR a missing target.
# Checking the target here (not only the features) keeps rows without a
# usable label out of the feature mean/std used for the z-scores, and stops
# them from being counted as "outliers" by the IQR rule further down.
row_mask = X.notna().all(axis=1) & y.notna()
X, y = X.loc[row_mask], y.loc[row_mask]

# --- Outlier removal ---------------------------------------------------------
# Remove outliers using a hybrid Z-score (features) + IQR (target) rule
# - Z-score threshold on features: 3.0
# - IQR factor on target: 1.5 (Tukey's rule)
# This typically trims extreme rows that destabilize RMSE.
feat_mean = X.mean()
feat_std = X.std(ddof=0).replace(0, 1.0)  # guard against zero std to avoid inf
z_scores = (X - feat_mean) / feat_std
z_keep = (z_scores.abs() <= 3.0).all(axis=1)

q1, q3 = y.quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
y_keep = y.between(lower, upper)

# A row survives only if it passes both the feature rule and the target rule.
outlier_keep_mask = z_keep & y_keep
removed = len(X) - int(outlier_keep_mask.sum())
if removed > 0:
    print(
        f"[Outlier filter] Removed {removed} rows "
        f"({removed / len(outlier_keep_mask) * 100:.2f}% of cleaned training)."
    )
X, y = X.loc[outlier_keep_mask].copy(), y.loc[outlier_keep_mask].copy()

if X.empty:
    # Fail here with a clear message instead of deep inside model fitting.
    raise ValueError(
        "No training rows left after missing-value and outlier filtering."
    )

# --- Test-set imputation -----------------------------------------------------
# The training data has no NaNs left at this point, so only the test set
# needs imputing. Fill with the medians of the filtered training features;
# 0.0 is the fallback for any feature whose median is itself undefined.
train_medians = X.median(numeric_only=True).fillna(0.0)
X_test = X_test.fillna(train_medians).fillna(0.0)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Thin wrapper that takes the square root of scikit-learn's
    ``mean_squared_error`` computed on the same two arguments.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# greater_is_better=False makes the scorer return the negated RMSE, so the
# grid search can keep maximizing its score while we minimize the error.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Base estimator. Every tunable value below is overridden by param_grid during
# the search; they are still kept valid so the estimator can be fit on its own.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,  # was 1: an integer here must be >= 2 in scikit-learn
    min_samples_leaf=1,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)

# Fixed random_state => the same folds are produced each time this splitter
# is used, so the RMSE search and the R² evaluation see identical splits.
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

param_grid = {
    "n_estimators": [100, 300],
    "max_depth": [None, 2, 4],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", 0.75],
}

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=rkf,
    n_jobs=-1,
    refit=True,  # best configuration is refit on all of X, y after the search
    verbose=0,
)

grid.fit(X, y)
best_rf = grid.best_estimator_
best_rmse = -grid.best_score_  # undo the scorer's negation to get a plain RMSE
print(f"[rf] best params: {grid.best_params_}")
print(f"[rf] CV RMSE: mean={best_rmse:.4f}")

# Also print CV R²
r2_scores = cross_val_score(best_rf, X, y, scoring="r2", cv=rkf, n_jobs=-1)
print(f"[rf] CV R²: mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")

# Predict on the test set. No extra fit is needed: refit=True already trained
# best_rf on the full training data, and cross_val_score only fits clones of
# the estimator, so best_rf is still that fully trained model.
preds = best_rf.predict(X_test)

# Dict insertion order already yields the [id_col, target_col] column order.
submission = pd.DataFrame({id_col: test_df[id_col].values, target_col: preds})

out_file = "corrucystic_density_predictions16.csv"
submission.to_csv(out_file, index=False)
print(f"Wrote submission file: {out_file}")