In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, uniform

# ============================================
# 1. Load DATA
# ============================================
df = pd.read_csv("../data/cleaned/final_merged_dataset.csv")

target = "Cases"
features = [
    'State', 'Crime_Type', 'Year', 'unemployment_rate', 'poverty_rate', 'per_capita_income',
    'inflation_rate', 'population_density', 'gender_ratio', 'literacy_rate',
    'youth_population_percent', 'urbanization_rate', 'human_development_index',
    'police_stations_per_district', 'conviction_rate', 'police_personnel_per_100k',
    'alcohol_consumption_per_capita'
]

numerical_cols = [
    'Year', 'unemployment_rate', 'poverty_rate', 'per_capita_income',
    'inflation_rate', 'population_density', 'gender_ratio', 'literacy_rate',
    'youth_population_percent', 'urbanization_rate', 'human_development_index',
    'police_stations_per_district', 'conviction_rate',
    'police_personnel_per_100k', 'alcohol_consumption_per_capita'
]

categorical_cols = ['State', 'Crime_Type']

# time-split
train_df = df[df["Year"] <= 2020]
test_df  = df[df["Year"] > 2020]

y_train = train_df[target]
y_test  = test_df[target]

# ============================================
# 2. Encoding + Scaling
# ============================================
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

X_train = preprocessor.fit_transform(train_df[features])
X_test  = preprocessor.transform(test_df[features])

print("Shapes:", X_train.shape, X_test.shape)

# ============================================
# 3. FAST RANDOMIZED TUNING
# ============================================

print("\nTuning XGB...")
xgb_model = XGBRegressor(random_state=42)

xgb_params = {
    "n_estimators": randint(150, 500),
    "max_depth": randint(3, 8),
    "learning_rate": uniform(0.01, 0.05),
    "subsample": uniform(0.7, 0.3),
    "colsample_bytree": uniform(0.7, 0.3)
}

xgb_search = RandomizedSearchCV(
    xgb_model, xgb_params, n_iter=15, scoring='r2',
    cv=3, random_state=42, n_jobs=-1
)
xgb_search.fit(X_train, y_train)
xgb_best = xgb_search.best_estimator_
print("Best XGB:", xgb_search.best_params_)

print("\nTuning LGBM...")
lgb_model = LGBMRegressor(random_state=42)

lgb_params = {
    "n_estimators": randint(150, 500),
    "num_leaves": randint(20, 80),
    "learning_rate": uniform(0.01, 0.05),
    "subsample": uniform(0.7, 0.3),
    "colsample_bytree": uniform(0.7, 0.3),
}

lgb_search = RandomizedSearchCV(
    lgb_model, lgb_params, n_iter=15, scoring='r2',
    cv=3, random_state=42, n_jobs=-1
)
lgb_search.fit(X_train, y_train)
lgb_best = lgb_search.best_estimator_
print("Best LGBM:", lgb_search.best_params_)

print("\nTuning RF...")
rf_model = RandomForestRegressor(random_state=42)

rf_params = {
    "n_estimators": randint(200, 600),
    "max_depth": randint(10, 30),
    "min_samples_leaf": randint(1, 4)
}

rf_search = RandomizedSearchCV(
    rf_model, rf_params, n_iter=12, scoring='r2',
    cv=3, random_state=42, n_jobs=-1
)
rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_
print("Best RF:", rf_search.best_params_)

# ============================================
# 4. PREDICT BASE MODELS
# ============================================

pred_xgb_train = xgb_best.predict(X_train)
pred_lgb_train = lgb_best.predict(X_train)
pred_rf_train  = rf_best.predict(X_train)

pred_xgb_test = xgb_best.predict(X_test)
pred_lgb_test = lgb_best.predict(X_test)
pred_rf_test  = rf_best.predict(X_test)

# ============================================
# 5. HYBRID WEIGHT SEARCH (Very Fast)
# ============================================

print("\nSearching best weights...")

weights = np.arange(0.1, 1.0, 0.1)

best_r2 = -999
best_combo = None
best_test_pred = None

for wx in weights:
    for wl in weights:
        wr = 1 - (wx + wl)
        if wr <= 0: 
            continue

        pred_t = wx*pred_xgb_train + wl*pred_lgb_train + wr*pred_rf_train
        r2 = r2_score(y_train, pred_t)

        if r2 > best_r2:
            best_r2 = r2
            best_combo = (wx, wl, wr)
            best_test_pred = wx*pred_xgb_test + wl*pred_lgb_test + wr*pred_rf_test

# ============================================
# 6. FINAL METRICS
# ============================================
hybrid_r2 = r2_score(y_test, best_test_pred)
hybrid_rmse = np.sqrt(np.mean((y_test - best_test_pred)**2))
hybrid_mae = np.mean(np.abs(y_test - best_test_pred))

print("\n============================")
print("ðŸ”¥ Best Hybrid Weights:")
print("XGB :", round(best_combo[0],2))
print("LGBM:", round(best_combo[1],2))
print("RF  :", round(best_combo[2],2))
print("============================")
print("Hybrid RÂ²   :", hybrid_r2)
print("Hybrid RMSE :", hybrid_rmse)
print("Hybrid MAE  :", hybrid_mae)
print("============================")

# ============================================
# 7. TRAIN METRICS FOR ALL MODELS
# ============================================

from sklearn.metrics import mean_squared_error, mean_absolute_error

def get_metrics(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae

print("\n================ TRAIN ERRORS ================")

# XGBoost
xgb_r2, xgb_rmse, xgb_mae = get_metrics(y_train, pred_xgb_train)
print("\nðŸ“Œ XGBoost Train Metrics")
print("RÂ²   :", xgb_r2)
print("RMSE :", xgb_rmse)
print("MAE  :", xgb_mae)

# LightGBM
lgb_r2, lgb_rmse, lgb_mae = get_metrics(y_train, pred_lgb_train)
print("\nðŸ“Œ LightGBM Train Metrics")
print("RÂ²   :", lgb_r2)
print("RMSE :", lgb_rmse)
print("MAE  :", lgb_mae)

# Random Forest
rf_r2, rf_rmse, rf_mae = get_metrics(y_train, pred_rf_train)
print("\nðŸ“Œ Random Forest Train Metrics")
print("RÂ²   :", rf_r2)
print("RMSE :", rf_rmse)
print("MAE  :", rf_mae)

# Hybrid
hybrid_train_pred = (
    best_combo[0] * pred_xgb_train +
    best_combo[1] * pred_lgb_train +
    best_combo[2] * pred_rf_train
)

hyb_tr_r2, hyb_tr_rmse, hyb_tr_mae = get_metrics(y_train, hybrid_train_pred)

print("\nðŸ”¥ Hybrid Model Train Metrics")
print("RÂ²   :", hyb_tr_r2)
print("RMSE :", hyb_tr_rmse)
print("MAE  :", hyb_tr_mae)
print("=============================================")

Shapes: (6300, 59) (630, 59)

Tuning XGB...
Best XGB: {'colsample_bytree': np.float64(0.8834959481464842), 'learning_rate': np.float64(0.01035331526098587), 'max_depth': 3, 'n_estimators': 198, 'subsample': np.float64(0.8574323980775167)}

Tuning LGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2797
[LightGBM] [Info] Number of data points in the train set: 6300, number of used features: 59
[LightGBM] [Info] Start training from score 2511.639524
Best LGBM: {'colsample_bytree': np.float64(0.8703100983459974), 'learning_rate': np.float64(0.01156566462277793), 'n_estimators': 367, 'num_leaves': 63, 'subsample': np.float64(0.9818496824692566)}

Tuning RF...
Best RF: {'max_depth': 21, 'min_samples_leaf': 3, 'n_estimators': 443}





Searching best weights...

ðŸ”¥ Best Hybrid Weights:
XGB : 0.1
LGBM: 0.1
RF  : 0.8
Hybrid RÂ²   : 0.7847280395040487
Hybrid RMSE : 3256.465781789071
Hybrid MAE  : 976.934608631893


ðŸ“Œ XGBoost Train Metrics
RÂ²   : 0.6522330045700073
RMSE : 4815.791316076726
MAE  : 1488.5692138671875

ðŸ“Œ LightGBM Train Metrics
RÂ²   : 0.8404410038130484
RMSE : 3262.001333680626
MAE  : 827.0891438850267

ðŸ“Œ Random Forest Train Metrics
RÂ²   : 0.880835801807937
RMSE : 2819.0084918764833
MAE  : 617.6956034890508

ðŸ”¥ Hybrid Model Train Metrics
RÂ²   : 0.8641112371049979
RMSE : 3010.3379875919504
MAE  : 695.7081712066962
