In [24]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import sys

print("Python version:", sys.version)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("scikit-learn version:", sklearn.__version__)
print("matplotlib version:", matplotlib.__version__)


Python version: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
pandas version: 2.3.1
numpy version: 2.3.1
scikit-learn version: 1.7.1
matplotlib version: 3.10.3


In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score
)
from sklearn.utils import resample
# import ace_tools as tools

In [20]:
# Reload with correct dataset
df = pd.read_csv("RCS_data_06_23_2025.csv")

# Define survey columns
survey_columns = [c for c in df.columns if c.startswith("RCS_Inferred_")]

# Base excluded columns (fixed exclusions)
base_excluded_cols = [
    "Survey_2017", "Survey_2021", "Survey_2022", "Survey_2023", "Survey_2024",
    "Water_Presence", "water_saturation"
]+ survey_columns

# Dynamic columns to exclude one-by-one
dynamic_excluded_candidates = [
    "Assumed_Presence", "Inferred_Phrag_Presence", "Phragmites_Presence",
    "Xmax", "MaxEnt_Phragmites", "Xmin", "GridSquare", "MaxEnt_RCS_cold_native""MaxEnt_RCS_cold_native","Ymin","Ymax","MaxEnt_Nipponaclerda", "MaxEnt_RCS_native","NRI_RISK_SCORE", "NRI_HRCN_RISKS",
    "NRI_EAL_SCORE","Yellowthroat_breeding",
    "NRI_SOVI_SCORE",                         
    "NRI_RESL_SCORE",                         
    "NRI_RFLD_RISKS",                         
    "NRI_TRND_RISKS",                         
    "Wren_breeding",                          
    "Wren_postbreeding_migration"  
]

# Append survey columns to dynamic exclusion
all_dynamic_excluded = dynamic_excluded_candidates 

# Define target
target_col = "RCS_Inferred_2021"
df = df[df[target_col] != -50].copy()
df["Presence"] = (df[target_col] > 0).astype(int)

# Parameters
n_iterations = 10
base_seed = 42

# Store results
results = []

for i in range(1, len(all_dynamic_excluded) + 1):
    current_excluded = base_excluded_cols + all_dynamic_excluded[:i]
    feature_cols = [c for c in df.columns if c not in current_excluded + ["Presence", target_col]]

    if not feature_cols:
        continue

    aucs, precisions, recalls, f1s = [], [], [], []

    for j in range(n_iterations):
        seed = base_seed + j
        df_majority = df[df.Presence == 0]
        df_minority = df[df.Presence == 1]
        df_maj_down = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=seed)
        df_balanced = pd.concat([df_maj_down, df_minority]).sample(frac=1, random_state=seed)

        X = df_balanced[feature_cols]
        y = df_balanced["Presence"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

        rf = RandomForestClassifier(n_estimators=100, random_state=seed)
        rf.fit(X_train, y_train)

        y_prob = rf.predict_proba(X_test)[:, 1]
        y_pred = rf.predict(X_test)

        aucs.append(roc_auc_score(y_test, y_prob))
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))

    feat_imp = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)

    result_entry = {
        "Excluded_Features": all_dynamic_excluded[:i],
        "Num_Features": len(feature_cols),
        "AUC_Mean": np.mean(aucs),
        "AUC_Std": np.std(aucs),
        "Precision_Mean": np.mean(precisions),
        "Recall_Mean": np.mean(recalls),
        "F1_Mean": np.mean(f1s),
    }

    # Add top 15 features dynamically
    for rank in range(15):
        feat_name = feat_imp.index[rank] if rank < len(feat_imp) else None
        feat_value = feat_imp.iloc[rank] if rank < len(feat_imp) else None
        result_entry[f"Top_Feature_{rank+1}"] = feat_name
        result_entry[f"Importance_{rank+1}"] = feat_value

    results.append(result_entry)

results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("2021_excluded_feature_impact_summary.csv", index=False)


                                    Excluded_Features  Num_Features  AUC_Mean  \
0                                  [Assumed_Presence]            30  1.000000   
1         [Assumed_Presence, Inferred_Phrag_Presence]            29  0.810569   
2   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            28  0.810656   
3   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            27  0.806039   
4   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            26  0.806678   
5   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            25  0.801829   
6   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            24  0.799768   
7   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            24  0.799768   
8   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            23  0.799274   
9   [Assumed_Presence, Inferred_Phrag_Presence, Ph...            22  0.780314   
10  [Assumed_Presence, Inferred_Phrag_Presence, Ph...            21  0.771835   
11  [Assumed_Presence, Infer