In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import numpy as np

print("--- Starting ML Model Training (Step 2: XGBoost) ---")

# Load the Master Dataset
try:
    df_master = pd.read_csv('../data/master_dataset_state_year.csv')
    print(f"Loaded master dataset with {len(df_master)} rows.")
except FileNotFoundError:
    print("ERROR: Could not find 'master_dataset_state_year.csv'.")
    raise SystemExit()

# Prepare Data for Modeling
df_model = df_master.dropna().copy()
print(f"Prepared data: Using {len(df_model)} complete rows for modeling.")

# Define Features (X) and Target (y)
y = df_model['loss_pct']
X = df_model.drop(columns=['year', 'state', 'loss_pct'])

print(f"Features (X) selected: {list(X.columns)}")

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data split into training (80%) and testing (20%) sets.")

# Train the XGBoost Model
print("Training the XGBoost model...")
# Prevent the model from overfitting
model_xgb = xgb.XGBRegressor(
    n_estimators=500, 
    early_stopping_rounds=10, 
    learning_rate=0.05,
    random_state=42
)

# Find the best stopping point
model_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print("Model training complete.")

# Evaluate the model
print("\n--- ✅ XGBoost MODEL EVALUATION ---")
y_pred = model_xgb.predict(X_test)
r2_xgb = r2_score(y_test, y_pred)
mae_xgb = mean_absolute_error(y_test, y_pred)

print(f"R-squared (R²): {r2_xgb:.3f}")
print(f"Mean Absolute Error (MAE): {mae_xgb:.3f} (percentage points)")

print("\n--- Baseline Model Comparison ---")
print(f"Baseline R²: 0.488  |  New XGBoost R²: {r2_xgb:.3f}")
print(f"Baseline MAE: 2.115 |  New XGBoost MAE: {mae_xgb:.3f}")

# Show Feature Importance
print("\n--- XGBoost FEATURE IMPORTANCE ---")
importances = model_xgb.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("The most important factors (according to XGBoost):")
print(importance_df)

--- Starting ML Model Training (Step 2: XGBoost) ---
Loaded master dataset with 378 rows.
Prepared data: Using 249 complete rows for modeling.
Features (X) selected: ['inventory_colonies', 'stressor_disease_pct', 'stressor_other_pct', 'stressor_pesticides_pct', 'stressor_pests_pct', 'stressor_unknown_pct', 'stressor_varroa_mites_pct', 'annual_avg_temp', 'annual_total_precip', 'annual_avg_ndvi', 'annual_total_sightings']
Data split into training (80%) and testing (20%) sets.
Training the XGBoost model...
Model training complete.

--- ✅ XGBoost MODEL EVALUATION ---
R-squared (R²): 0.374
Mean Absolute Error (MAE): 2.403 (percentage points)

--- Baseline Model Comparison ---
Baseline R²: 0.488  |  New XGBoost R²: 0.374
Baseline MAE: 2.115 |  New XGBoost MAE: 2.403

--- XGBoost FEATURE IMPORTANCE ---
The most important factors (according to XGBoost):
                      Feature  Importance
2          stressor_other_pct    0.246529
6   stressor_varroa_mites_pct    0.105621
9             an