In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import os

# Load data
filename = os.path.join(os.getcwd(), "data", "claims_data.csv")
df = pd.read_csv(filename)
top_features = ['cat80', 'cat79', 'cat87', 'cat57', 'cat12', 'cat10', 'cat89', 'cat7', 'cat81', 
                'cat2', 'cat72', 'cat11', 'cat1', 'cat13', 'cat9', 'cat90', 'cat3', 'cat16', 
                'cat23', 'cat36', 'cat73', 'cat91', 'cat40', 'cat28', 'cat82', 'cat6', 'cat76',
                'cat50', 'cat5', 'cat4', 'cat94', 'cat14', 'cat38', 'cat24', 'cat25', 'cat85',
                'cat41', 'cat8', 'cat29', 'cat17', 'cat75', 'cat45', 'cat99', 'cat71', 'cat65', 
                'cat78', 'cat66', 'cat86', 'cat84', 'cat26']

# Separate features and target
X = df[top_features]
y = df.iloc[:, -1]

# 1. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Log-transform target
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# 3. Identify numeric & categorical columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# 4. Preprocessing
preprocessor = ColumnTransformer([
    ('num', FunctionTransformer(np.log1p, validate=False), num_cols),
    ('cat', TargetEncoder(), cat_cols)
])

# 5. Base XGBoost model
xgb_base = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

# 6. Combine preprocessing + model in a pipeline
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', xgb_base)
])

# 7. Expanded hyperparameter tuning
param_grid = {
    'regressor__n_estimators': [300, 500, 700, 1000],
    'regressor__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'regressor__max_depth': [3, 4, 6, 8, 10],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
    'regressor__min_child_weight': [1, 3, 5, 7],
    'regressor__gamma': [0, 0.1, 0.2, 0.5]
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=20,  # run 20 random combinations
    scoring='neg_root_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train_log)

best_params = search.best_params_
print("\nBest Parameters Found:")
print(best_params)

Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Parameters Found:
{'regressor__subsample': 0.8, 'regressor__n_estimators': 1000, 'regressor__min_child_weight': 3, 'regressor__max_depth': 4, 'regressor__learning_rate': 0.05, 'regressor__gamma': 0.5, 'regressor__colsample_bytree': 0.8}


In [5]:
# 8. Preprocess data for final training
X_train_prep = search.best_estimator_.named_steps['preprocess'].transform(X_train)
X_test_prep  = search.best_estimator_.named_steps['preprocess'].transform(X_test)

# 9. Extract best hyperparameters and rebuild final model
final_params = {
    k.replace('regressor__',''): v
    for k, v in best_params.items()
    if k.replace('regressor__','') not in ['n_estimators', 'early_stopping_rounds']
}

# Add or override parameters for final training
final_params.update({
    'n_estimators': 5000,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 1.0
})

# Initialize final XGBoost model
final_model = XGBRegressor(**final_params)

# 10. Fit final model with early stopping
final_model.fit(
    X_train_prep, y_train_log,
    eval_set=[(X_test_prep, y_test_log)],
    verbose=False
)

# 11. Predict & revert log-transform
y_pred_log = final_model.predict(X_test_prep)
y_pred = np.expm1(y_pred_log)

# 12. Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
mae  = mean_absolute_error(y_test, y_pred)

print(f"\nFinal RMSE: {rmse:.4f}")
print(f"Final R²:   {r2:.4f}")
print(f"Final MAE:  {mae:.4f}")

# 13. Plot early stopping curve
evals_result = final_model.evals_result()
if 'validation_0' in evals_result and 'validation_1' in evals_result:
    plt.figure(figsize=(8,5))
    plt.plot(evals_result['validation_0']['rmse'], label='Train RMSE')
    plt.plot(evals_result['validation_1']['rmse'], label='Test RMSE')
    plt.xlabel('Boosting Round')
    plt.ylabel('RMSE')
    plt.title('Early Stopping Curve')
    plt.legend()
    plt.grid(True)
    plt.show()



Final RMSE: 2116.1671
Final R²:   0.4512
Final MAE:  1245.4641
