After grid search failed repeatedly, I decided to tune the hyperparameters of the RandomForestRegressor model manually.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Train a RandomForestRegressor on MiNDAT.csv and write predictions for
# MiNDAT_UNK.csv to corrucystic_density_predictions5.csv.
TARGET_COL = "CORRUCYSTIC_DENSITY"
ID_COL = "LOCAL_IDENTIFIER"

train = pd.read_csv("MiNDAT.csv")
test = pd.read_csv("MiNDAT_UNK.csv")

# A row without a label carries no supervision signal. Imputing the target
# would teach the model to predict the median for those rows, so they are
# dropped rather than filled.
train = train.dropna(subset=[TARGET_COL])

# Features are every column except the target and the row identifier,
# restricted to the numeric dtypes the model can consume directly.
X_train = train.drop(columns=[TARGET_COL, ID_COL])
X_train = X_train.select_dtypes(include=["float64", "int64"])

# Median (rather than mean) imputation is used because it is robust to skewed
# columns. The medians are computed once, from the training features only,
# and reused for the test set so no test statistics leak into preprocessing.
# Only feature columns are imputed: the identifier is never filled with a
# statistic.
feature_medians = X_train.median()
X_train = X_train.fillna(feature_medians)

y_train = train[TARGET_COL]

# Give the test matrix exactly the training columns, in the same order.
# A feature column absent from the test file is created and filled with 0;
# NaNs in columns that do exist are filled with the training medians.
X_test = test.drop(columns=[ID_COL])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.fillna(feature_medians)

# The scaler is fitted on the training features only and then applied to both
# sets.
# NOTE(review): tree ensembles split on per-feature thresholds, so scaling is
# not expected to change the fit; it is kept so the *_scaled names remain
# available.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Manually tuned hyperparameters.
# Previous setting: n_estimators=300, max_depth=12, min_samples_split=4.
model = RandomForestRegressor(
    n_estimators=400,  # Increased trees for better stability
    max_depth=15,  # Slightly deeper trees for complexity
    min_samples_split=4,
    min_samples_leaf=2,  # Added to avoid overfitting small leaf nodes
    random_state=42,  # Fixed seed for reproducible runs
    n_jobs=-1,  # Use all available cores
)

model.fit(X_train_scaled, y_train)

preds = model.predict(X_test_scaled)

# Build the output positionally: preds[i] belongs to the i-th row of `test`.
# Plain arrays are passed so pandas does not align the two columns on index
# labels, which would misplace values if `test` ever had a non-default index.
output = pd.DataFrame(
    {
        "LOCAL_IDENTIFIER": test[ID_COL].to_numpy(),
        "CORRUCYSTIC_DENSITY": preds,
    }
)

output.to_csv("corrucystic_density_predictions5.csv", index=False)