Instead of plain linear regression, I tried ridge regression on polynomial features, tuning the polynomial degree and the regularization strength (alpha) via grid search.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

# --- Configuration -----------------------------------------------------------
# Input files and the column names this script relies on.
train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
target_col = "CORRUCYSTIC_DENSITY"
id_col = "LOCAL_IDENTIFIER"
# Feature names must match the CSV header exactly; "T\\!" is the three
# characters T, backslash, exclamation mark.
feature_cols = ["T\\!", "b1oRb13", "~7*"]

# --- Load --------------------------------------------------------------------
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# --- Coerce to numeric -------------------------------------------------------
# errors="coerce" turns any value that cannot be parsed as a number into NaN,
# so malformed cells are handled by the NaN clean-up instead of raising here.
X = train_df.loc[:, feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(train_df[target_col], errors="coerce")
X_test = test_df.loc[:, feature_cols].apply(pd.to_numeric, errors="coerce")

# --- Clean the training data -------------------------------------------------
# A row without a usable target cannot be trained on, so drop those first.
mask_y = y.notna()
X = X.loc[mask_y].copy()
y = y.loc[mask_y].copy()

# Impute missing feature values with per-column medians computed on the
# remaining training rows. A median that is itself NaN falls back to 0.0.
train_medians = X.median(numeric_only=True).fillna(0.0)
X = X.fillna(train_medians)

# Safety net: discard any training row that still contains a NaN.
row_mask = ~X.isna().any(axis=1)
X = X.loc[row_mask]
y = y.loc[row_mask]

# --- Clean the test data -----------------------------------------------------
# Use the *training* medians so the test set is imputed with the same values,
# then fill anything still missing with 0.0 as a final fallback.
X_test = X_test.fillna(train_medians).fillna(0.0)

# Model pipeline: add polynomial features and ridge regularization
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Modelling pipeline; the steps run in this order:
#   "poly"   - polynomial expansion of the input features (no bias column)
#   "scaler" - centre and scale the expanded features
#   "ridge"  - ridge regression on the scaled features
# The step names are the prefixes used when addressing parameters, e.g.
# "poly__degree" or "ridge__alpha".
pipe = Pipeline(
    [
        ("poly", PolynomialFeatures(include_bias=False)),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("ridge", Ridge()),
    ]
)


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between *y_true* and *y_pred*.

    The value is the square root of scikit-learn's ``mean_squared_error``
    for the same arguments.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# RMSE is an error (lower is better). greater_is_better=False makes the scorer
# report the negated value, so "higher score is better" still holds for
# model selection.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Cross-validation: 5 folds repeated 3 times (15 fits per candidate), seeded
# for reproducibility.
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Search space: 3 polynomial degrees x 13 log-spaced alphas from 1e-3 to 1e3.
param_grid = {
    "poly__degree": [1, 2, 3],
    "ridge__alpha": np.logspace(start=-3, stop=3, num=13),
}

# Exhaustive search over the grid. refit=True retrains the best candidate on
# all of the data passed to fit(); n_jobs=-1 uses every available core.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=rkf,
    n_jobs=-1,
    refit=True,
)

# Run the grid search; candidates are ranked by (negated) RMSE.
grid.fit(X, y)

# The scorer reports negative RMSE, so flip the sign back for display.
best_pipe = grid.best_estimator_
best_rmse = -grid.best_score_
print(f"Best params: {grid.best_params_}")
print(f"CV RMSE (best): mean={best_rmse:.4f}")

# Secondary metric: cross-validated R² of the winning pipeline, evaluated
# with the same splitter.
r2_scores = cross_val_score(
    estimator=best_pipe,
    X=X,
    y=y,
    scoring="r2",
    cv=rkf,
    n_jobs=None,
)
print(f"CV R² (best): mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")

# Train on the full cleaned training set. GridSearchCV(refit=True) has already
# done this, so the call is redundant and kept only to make the step explicit.
best_pipe.fit(X, y)

# Pull the fitted steps out of the pipeline by name.
ridge_est = best_pipe["ridge"]
poly = best_pipe["poly"]

# Ridge coefficients line up one-to-one with the polynomial-expanded features
# and are expressed on the standardized scale (the scaler sits between the
# two steps).
feature_names = poly.get_feature_names_out(feature_cols)
coefs = pd.Series(ridge_est.coef_, index=feature_names, name="coefficient")

# Show the 20 terms with the largest absolute weight.
print("\nTop-weighted polynomial features (after standardization):")
print(coefs.sort_values(key=np.abs, ascending=False).iloc[:20])

# Score the cleaned test features with the fitted pipeline.
preds = best_pipe.predict(X_test)

# Two-column submission: identifier first, predicted target second.
submission = pd.DataFrame(
    {
        id_col: test_df[id_col].values,
        target_col: preds,
    }
).loc[:, [id_col, target_col]]

# Write without the DataFrame index so the file holds only the two columns.
out_file = "corrucystic_density_predictions8.csv"
submission.to_csv(out_file, index=False)
print(f"\nWrote submission file: {out_file}")