# 1) Using only a random forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Baseline: impute missing numeric values, fit a default random forest on the
# numeric features, and write one prediction per test row to CSV.

# Load only necessary columns
usecols = None  # Set to list of needed columns if known
train = pd.read_csv("MiNDAT.csv", usecols=usecols)

# The training list has to name the target column. Passing that same list to the
# test read would raise if the test file does not contain the target, so a list of
# names is turned into a callable filter, which simply keeps the requested columns
# that are present. Any other kind of `usecols` value is passed through unchanged.
if isinstance(usecols, (list, tuple)) and all(isinstance(c, str) for c in usecols):
    wanted_columns = set(usecols)
    test_usecols = lambda column: column in wanted_columns
else:
    test_usecols = usecols
test = pd.read_csv("MiNDAT_UNK.csv", usecols=test_usecols)

# Fill missing numeric values with the training-set column means. The means are
# computed once so that train and test are imputed with exactly the same values.
# NOTE(review): this also fills missing values in the target column with its mean;
# confirm that is intended rather than dropping unlabelled rows.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)
test = test.fillna(train_means)

# Numeric features only; any NaN still left (e.g. an all-NaN column, whose mean is
# itself NaN) is replaced with 0.
X_train = (
    train.drop(columns=["CORRUCYSTIC_DENSITY", "LOCAL_IDENTIFIER"])
    .select_dtypes(include=["float64", "int64"])
    .fillna(0)
)
y_train = train["CORRUCYSTIC_DENSITY"]

# Same columns, in the same order, as the training matrix.
X_test = test.drop(columns=["LOCAL_IDENTIFIER"])[X_train.columns].fillna(0)

# Train model
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict and format output. The prediction Series is given the test frame's index
# explicitly so each value stays attached to its own identifier row.
preds = model.predict(X_test)
output = pd.DataFrame(
    {
        "LOCAL_IDENTIFIER": test["LOCAL_IDENTIFIER"],
        "CORRUCYSTIC_DENSITY": pd.Series(preds, index=test.index)
        .round(6)
        .astype(str),
    }
)

output.to_csv("corrucystic_density_predictions1.csv", index=False)

# 2) Using a standard scaler with a random forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Second experiment: same preparation as before, but the features are standardised
# and the forest uses explicit hyper-parameters.

# Read the labelled training rows and the rows to predict
train = pd.read_csv("MiNDAT.csv")
test = pd.read_csv("MiNDAT_UNK.csv")

# Replace missing numeric values in both frames with training-set column means
train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(train.mean(numeric_only=True))

# Feature matrix: every 64-bit numeric column except the target and the identifier;
# anything still missing becomes 0
candidate_features = train.drop(columns=["CORRUCYSTIC_DENSITY", "LOCAL_IDENTIFIER"])
X_train = candidate_features.select_dtypes(include=["float64", "int64"]).fillna(0)
y_train = train["CORRUCYSTIC_DENSITY"]

# Test matrix restricted to the training columns, in the same order
X_test = test.drop(columns=["LOCAL_IDENTIFIER"])[X_train.columns].fillna(0)

# Standardise using statistics learned from the training features only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a larger, depth-limited forest
forest_settings = {
    "n_estimators": 300,
    "max_depth": 12,
    "min_samples_split": 4,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train_scaled, y_train)

# Predict, round to six decimals, and store the values as strings next to the IDs
preds = model.predict(X_test_scaled)
formatted_preds = pd.Series(preds).round(6).astype(str)
output = pd.DataFrame(
    {
        "LOCAL_IDENTIFIER": test["LOCAL_IDENTIFIER"],
        "CORRUCYSTIC_DENSITY": formatted_preds,
    }
)

output.to_csv("corrucystic_density_predictions2.csv", index=False)

# 3) Using grid search on top of the previous adjustments.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import numpy as np

# Third experiment: tune the forest with a cross-validated grid search, keep the
# most important 80% of features, refit on those, and write the predictions.

# Read the labelled training rows and the rows to predict.
train = pd.read_csv("MiNDAT.csv")
test = pd.read_csv("MiNDAT_UNK.csv")

# Fill missing numeric values with the training-set column means. The means are
# computed once so that train and test are imputed with exactly the same values.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)
test = test.fillna(train_means)

# Numeric features only; any NaN still left is replaced with 0.
X_train = (
    train.drop(columns=["CORRUCYSTIC_DENSITY", "LOCAL_IDENTIFIER"])
    .select_dtypes(include=["float64", "int64"])
    .fillna(0)
)
y_train = train["CORRUCYSTIC_DENSITY"]

# Same columns, in the same order, as the training matrix.
X_test = test.drop(columns=["LOCAL_IDENTIFIER"])[X_train.columns].fillna(0)

# Standardise using statistics learned from the training features only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3 x 3 x 3 = 27 candidates, each evaluated with 5-fold cross-validation.
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [10, 12, 15],
    "min_samples_split": [2, 4, 6],
}
# Parallelise at the search level only. Asking for all cores in both the forest and
# the search nests two layers of parallelism and oversubscribes the machine, so the
# searched estimator is kept single-threaded. The seed is fixed, so the number of
# jobs is not expected to change the fitted trees.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=1), param_grid, cv=5, n_jobs=-1
)
grid.fit(X_train_scaled, y_train)

# Rank features by the importance assigned by the best model found by the search
# (most important first) and keep the top 80%, but never fewer than one feature.
importances = grid.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
top_n = max(1, int(0.8 * len(indices)))
selected_indices = indices[:top_n]
X_train_selected = X_train_scaled[:, selected_indices]
X_test_selected = X_test_scaled[:, selected_indices]

# Build a fresh forest from the winning parameters instead of refitting
# `grid.best_estimator_` in place, so `grid` still matches the data it was fitted on.
best_model = RandomForestRegressor(random_state=42, n_jobs=-1, **grid.best_params_)
best_model.fit(X_train_selected, y_train)
preds = best_model.predict(X_test_selected)

# The prediction Series is given the test frame's index explicitly so each value
# stays attached to its own identifier row.
output = pd.DataFrame(
    {
        "LOCAL_IDENTIFIER": test["LOCAL_IDENTIFIER"],
        "CORRUCYSTIC_DENSITY": pd.Series(preds, index=test.index)
        .round(6)
        .astype(str),
    }
)
output.to_csv("corrucystic_density_predictions3.csv", index=False)

The grid search did not complete: with `cv=5` over 27 parameter combinations (135 forest fits), it was too expensive for the computer to run.