In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
import matplotlib.pyplot as plt

# Load the data and create engineered features (same as your notebook)
df = pd.read_csv('../../data/data.csv')
df[["observation_date", "date_of_introduction"]] = df[["observation_date", "date_of_introduction"]].apply(
    pd.to_datetime
)
df['days_since_introduction'] = (df['observation_date'] - df['date_of_introduction']).dt.days
df['initial_growth_potential'] = df['initial_female_count'] * df['leaf_area_cm2']

# Define features and target
features = [
    'initial_female_count',
    'leaf_area_cm2',
    'population_density',
    'days_since_introduction',
    'initial_growth_potential'
]
target = 'cumulative_mite_count'

X = df[features]
y = df[target]

# Split data (using the same random_state for a fair comparison)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Create and Train the Baseline LightGBM Model ---
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train, y_train)

# --- Make Predictions and Evaluate ---
y_pred_lgb = lgb_model.predict(X_test)
r2_lgb = r2_score(y_test, y_pred_lgb)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = root_mean_squared_error(y_test, y_pred_lgb)

print("--- LightGBM Regressor Baseline Performance ---")
print(f"Mean Absolute Error (MAE): {mae_lgb:.2f}")
print(f"Mean Squared Error (MSE): {mse_lgb:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lgb:.2f}")
print(f"R-squared (R²): {r2_lgb:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 3028, number of used features: 5
[LightGBM] [Info] Start training from score 158.143659
--- LightGBM Regressor Baseline Performance ---
Mean Absolute Error (MAE): 33.94
Mean Squared Error (MSE): 3464.76
Root Mean Squared Error (RMSE): 58.86
R-squared (R²): 0.8253


In [3]:
# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 3, 5, 7],
    'num_leaves': [20, 31, 40, 50],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Randomized search
lgb_random = RandomizedSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_distributions=param_grid_lgb,
    n_iter=100,  # You can adjust the number of iterations
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the RandomizedSearchCV
lgb_random.fit(X_train, y_train)

# Get the best model
best_lgb_model = lgb_random.best_estimator_

# Print the best parameters
print("Best parameters found for LightGBM:")
print(lgb_random.best_params_)

# Evaluate the tuned model
y_pred_tuned_lgb = best_lgb_model.predict(X_test)
r2_tuned_lgb = r2_score(y_test, y_pred_tuned_lgb)
mae_tuned_lgb = mean_absolute_error(y_test, y_pred_tuned_lgb)
mse_tuned_lgb = mean_squared_error(y_test, y_pred_tuned_lgb)
rmse_tuned_lgb = root_mean_squared_error(y_test, y_pred_tuned_lgb)

print("\n--- Tuned LightGBM Performance ---")
print(f"Mean Absolute Error (MAE): {mae_tuned_lgb:.2f}")
print(f"Mean Squared Error (MSE): {mse_tuned_lgb:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_tuned_lgb:.2f}")
print(f"R-squared (R²): {r2_tuned_lgb:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 3028, number of used features: 5
[LightGBM] [Info] Start training from score 158.143659
Best parameters found for LightGBM:
{'subsample': 0.8, 'num_leaves': 20, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 1.0}

--- Tuned LightGBM Performance ---
Mean Absolute Error (MAE): 34.09
Mean Squared Error (MSE): 3322.20
Root Mean Squared Error (RMSE): 57.64
R-squared (R²): 0.8325
