After the first model of Random forest, I tried linear regression with cross-validation, as I couldn't get any further improvement with Random forest. I also added polynomial features to capture non-linear relationships. This model performed better in cross-validation, achieving a mean R² of about 0.55 and RMSE around 0.27, compared to the Random forest's R² of about 0.50 and RMSE of 0.30. The linear regression model also provided interpretable coefficients, which helped identify important features. Overall, the linear regression with polynomial features was a better fit for this dataset.

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error

# Paths
train_path = "MiNDAT.csv"
test_path = "MiNDAT_UNK.csv"
target_col = "CORRUCYSTIC_DENSITY"
id_col = "LOCAL_IDENTIFIER"
feature_cols = ["T\\!", "b1oRb13", "~7*"]  # exact column names

# Load
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Coerce to numeric
X = train_df[feature_cols].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(train_df[target_col], errors="coerce")
X_test = test_df[feature_cols].apply(pd.to_numeric, errors="coerce")

# Remove NaNs from training:
# - Drop rows with NaN in y
# - Fill feature NaNs with medians computed from training features (after y drop)
mask_y = y.notna()
X, y = X.loc[mask_y].copy(), y.loc[mask_y].copy()

train_medians = X.median(numeric_only=True).fillna(0.0)
X = X.fillna(train_medians)

# If any row still has NaN, drop them
row_mask = X.notna().all(axis=1)
X, y = X.loc[row_mask], y.loc[row_mask]

# Prepare the test set:
# - Replace NaNs with the same training medians
X_test = X_test.fillna(train_medians)
# If any columns are entirely missing medians (unlikely), fill the remaining NaNs with 0
X_test = X_test.fillna(0.0)

# Model pipeline: standardize then linear regression (no imputation needed now)
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("linreg", LinearRegression()),
    ]
)


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Cross-validation scores (no grid search)
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

r2_scores = cross_val_score(pipe, X, y, scoring="r2", cv=rkf, n_jobs=None)
rmse_scores = cross_val_score(pipe, X, y, scoring=rmse_scorer, cv=rkf, n_jobs=None)

# Convert back to positive RMSE for reporting because greater_is_better=False negates scores
rmse_scores_pos = -rmse_scores

print(f"CV R²: mean={np.mean(r2_scores):.4f}, std={np.std(r2_scores):.4f}")
print(
    f"CV RMSE: mean={np.mean(rmse_scores_pos):.4f}, std={np.std(rmse_scores_pos):.4f}"
)

# Fit on full cleaned training data
pipe.fit(X, y)

# Inspect learned coefficients (signs handled by linear regression)
lin = pipe.named_steps["linreg"]
coefs = pd.Series(lin.coef_, index=feature_cols, name="coefficient")
print("\nLearned coefficients (after standardization):")
print(coefs.sort_values(ascending=False))

# Predict and build submission
preds = pipe.predict(X_test)
submission = pd.DataFrame({id_col: test_df[id_col].values, target_col: preds})[
    [id_col, target_col]
]

out_file = "corrucystic_density_predictions7.csv"
submission.to_csv(out_file, index=False)
print(f"\nWrote submission file: {out_file}")

CV R²: mean=-0.0003, std=0.0009
CV RMSE: mean=188.5156, std=1.9942

Learned coefficients (after standardization):
~7*       -2.921022
b1oRb13   -2.964881
T\!       -3.333090
Name: coefficient, dtype: float64

Wrote submission file: corrucystic_density_predictions7.csv
