In [None]:
# Feature Selection & Preparation
import pandas as pd

# Predictor columns fed to the model (adjust as needed for your dataset).
# Categorical predictors are kept in their own list because they are
# one-hot encoded below; everything else is used as-is.
numeric_predictors = ['lon', 'lat', 'year', 'geoUncertaintyInM', 'areaInM2']
categorical_predictors = ['region', 'country']
predictors = numeric_predictors + categorical_predictors

# Target variable (1 for presence, 0 for absence/background).
target_col = 'presence'      # Replace with the correct label column if named differently

# Fail early, with a readable message, if an expected column is absent.
missing_cols = [c for c in predictors + [target_col] if c not in pa_chunk.columns]
if missing_cols:
    raise KeyError(f"pa_chunk is missing expected columns: {missing_cols}")

# Work on a new frame restricted to the chosen predictors. pa_chunk itself is
# left unmodified, so this cell can be re-run without the categorical columns
# having already been consumed by a previous one-hot encoding. Identifier and
# unused columns (e.g. 'surveyId', 'county', 'district') are excluded simply
# by not being listed in `predictors`.
pa_features = pa_chunk[predictors].copy()

# Fill any remaining missing numerical values with the column median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split, so held-out rows influence the imputed values. Consider
# fitting the imputation on the training split only.
impute_cols = ['areaInM2', 'geoUncertaintyInM']
pa_features[impute_cols] = pa_features[impute_cols].fillna(pa_features[impute_cols].median())

# One-hot encode categorical variables (first level dropped as the baseline).
X = pd.get_dummies(pa_features, columns=categorical_predictors, drop_first=True)
y = pa_chunk[target_col]

In [None]:
# Split Training and Test Data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Train the Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the forest: 200 trees, class weights set to 'balanced',
# and a fixed seed so repeated runs give the same model.
forest_settings = {
    'n_estimators': 200,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf = RandomForestClassifier(**forest_settings)
rf.fit(X_train, y_train)


In [None]:
# Evaluate the Model
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Score the held-out rows: probabilities for the AUC, hard labels for the
# report and confusion matrix.
# NOTE(review): column 1 of predict_proba is used as the positive class;
# this assumes rf.classes_ is ordered [0, 1] - confirm against the labels.
y_pred_prob = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

# Each entry pairs a heading with the metric to compute; metrics are evaluated
# and printed one at a time, in this order.
report_sections = (
    ('AUC:', lambda: roc_auc_score(y_test, y_pred_prob)),
    ('Classification Report:\n', lambda: classification_report(y_test, y_pred)),
    ('Confusion Matrix:\n', lambda: confusion_matrix(y_test, y_pred)),
)
for heading, compute_metric in report_sections:
    print(heading, compute_metric())


In [None]:
# Visualize Predictions (Example: Scatter plot of predicted probabilities)
import matplotlib.pyplot as plt

# Plot every test location at its lon/lat, coloured by the predicted
# class-1 probability (y_pred_prob), using the explicit figure/axes interface.
fig, ax = plt.subplots()
suitability = ax.scatter(X_test['lon'], X_test['lat'], c=y_pred_prob, cmap='viridis', alpha=0.5)
fig.colorbar(suitability, ax=ax, label='Habitat Suitability')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Ecological Niche Map')
plt.show()
