In [1]:
import pandas as pd


pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',None)
pd.set_option('display.max_rows',None)

import  warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("C:/POC_Project_LTI/uk_pmi_claims_200k.csv")

In [3]:
df.columns

Index(['Client Name', 'Client Identifier', 'Scheme Category/ Section Name',
       'Scheme Category/ Section Name Identifier', 'Status of Member',
       'Claimant Unique ID', 'Claimant Year of Birth', 'Claimant Gender',
       'Short Post Code', 'Unique Member Reference', 'Contract Start Date',
       'Contract End Date', 'Claim ID', 'Incurred Date', 'Paid Date',
       'Condition Code', 'Impairment Code', 'Condition Category',
       'Treatment Type', 'Claim Type', 'Ancillary Service Type',
       'Treatment Location', 'Provider Type', 'Admission Date',
       'Discharge Date', 'Calculate Length of Service', 'Claim Amount',
       'Amount Paid'],
      dtype='object')

In [4]:
df['Claimant Year of Birth'] = pd.to_datetime(df['Claimant Year of Birth'], errors='coerce')

In [5]:
df['Claimant Age'] = 2026 - df['Claimant Year of Birth'].dt.year

In [6]:
df_final = df[['Claimant Age','Claimant Gender','Claim Amount']]

In [7]:
df_final.sample(3)

Unnamed: 0,Claimant Age,Claimant Gender,Claim Amount
59414,56,Female,322.4
199248,56,Female,679.53
89751,56,Male,230.83


In [8]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Claimant Age     200000 non-null  int32  
 1   Claimant Gender  200000 non-null  object 
 2   Claim Amount     200000 non-null  float64
dtypes: float64(1), int32(1), object(1)
memory usage: 3.8+ MB


In [9]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import clone
import joblib
import matplotlib.pyplot as plt

# Optional libraries (must be installed)
try:
    import xgboost as xgb
except Exception:
    xgb = None
try:
    import lightgbm as lgb
except Exception:
    lgb = None
try:
    from catboost import CatBoostRegressor, Pool
except Exception:
    CatBoostRegressor = None
try:
    import shap
except Exception:
    shap = None
try:
    from lime.lime_tabular import LimeTabularExplainer
except Exception:
    LimeTabularExplainer = None
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
    from scikeras.wrappers import KerasRegressor
except Exception:
    tf = None
    KerasRegressor = None

print("Libraries availability: xgboost={}, lightgbm={}, catboost={}, shap={}, lime={}, tensorflow={}."
      .format(xgb is not None, lgb is not None, CatBoostRegressor is not None, shap is not None, LimeTabularExplainer is not None, tf is not None))


Libraries availability: xgboost=True, lightgbm=False, catboost=True, shap=True, lime=True, tensorflow=True.


In [None]:

import sklearn
from sklearn.preprocessing import OneHotEncoder
df = df_final
print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist())

# Set these column names according to your file:
TARGET = "Claim Amount"   # target column
# if gender and age are named differently, adjust below:
EXPECTED_COLS = ["Claimant Gender", "Claimant Age", TARGET]

for c in EXPECTED_COLS:
    if c not in df.columns:
        print(f"Warning: expected column '{c}' not found in dataset. Found columns: {df.columns.tolist()}")


# Drop rows where target is missing
df = df.dropna(subset=[TARGET])
# Optionally: remove negative or impossible claims
df = df[df[TARGET] >= 0]

# Split features and target
X = df.drop(columns=[TARGET])
y = df[TARGET].values

# Log-transform the target to reduce skew (common for claim amounts)
y_trans = np.log1p(y)  # later we will inverse with expm1

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# If 'gender' is present but encoded as numbers, treat as categorical
if "gender" in numeric_cols:
    numeric_cols.remove("gender")
    categorical_cols.append("gender")

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

sk_version = tuple(map(int, sklearn.__version__.split(".")[:2]))

if sk_version >= (1, 2):
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
else:
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
], remainder="drop")  # drop any other columns


X_train, X_test, y_train, y_test = train_test_split(X, y_trans, test_size=0.2, random_state=42)
print("Train/test shapes:", X_train.shape, X_test.shape)

# Fit preprocessing on training
preprocessor.fit(X_train)
X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

def get_feature_names(preprocessor):
    num_feats = numeric_cols
    cat_feats = []
    if categorical_cols:
        # Get categories from the fitted OneHotEncoder
        cat_pipeline = preprocessor.named_transformers_["cat"]
        ohe = cat_pipeline.named_steps["onehot"]
        ohe_feature_names = []
        if hasattr(ohe, "get_feature_names_out"):
            ohe_feature_names = list(ohe.get_feature_names_out(categorical_cols))
        else:
            # fallback
            for i, col in enumerate(categorical_cols):
                cats = ohe.categories_[i]
                ohe_feature_names.extend([f"{col}_{c}" for c in cats])
        cat_feats = ohe_feature_names
    return num_feats + cat_feats

feature_names = get_feature_names(preprocessor)
print("Processed feature count:", len(feature_names))

def evaluate_and_print(model_name, model, X_raw, y_test_trans):
    """
    model_name: str label
    model: either a Pipeline that accepts raw X, or an estimator that expects preprocessed arrays
    X_raw: raw dataframe (not preprocessed)  --> function will preprocess if needed
    y_test_trans: transformed target (e.g. log1p)
    """

    if hasattr(model, "named_steps"):
        y_pred_trans = model.predict(X_raw)
    else:
   
        try:
            X_pre = preprocessor.transform(X_raw)
        except Exception as e:

            X_pre = X_raw
        y_pred_trans = model.predict(X_pre)


    y_pred = np.expm1(y_pred_trans)
    y_true = np.expm1(y_test_trans)


    mae = mean_absolute_error(y_true, y_pred)
    # avoid using squared= argument to support older sklearn versions
    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_true, y_pred)

    print(f"{model_name}: MAE={mae:.4f}, RMSE={rmse:.4f}, R2={r2:.4f}")
    return {"model": model_name, "mae": mae, "rmse": rmse, "r2": r2, "y_true": y_true, "y_pred": y_pred}

results = []


cv = KFold(n_splits=3, shuffle=True, random_state=42)
RSCV_KWARGS = dict(cv=cv, n_iter=20, scoring="neg_mean_absolute_error", n_jobs=-1, verbose=1, random_state=42)


gb_pipeline = Pipeline(steps=[("pre", preprocessor),
                              ("model", GradientBoostingRegressor(random_state=42))])

gb_param_dist = {
    "model__n_estimators": [100, 200, 500],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__max_depth": [3, 5, 8],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__min_samples_split": [2, 5, 10]
}

print("\nTuning GradientBoostingRegressor...")
gb_search = RandomizedSearchCV(gb_pipeline, gb_param_dist, **RSCV_KWARGS)
gb_search.fit(X_train, y_train)
best_gb = gb_search.best_estimator_
print("GB best params:", gb_search.best_params_)
results.append(evaluate_and_print("GradientBoosting", best_gb, X_test, y_test))


if xgb is not None:
    xgb_pipeline = Pipeline(steps=[("pre", preprocessor),
                                   ("model", xgb.XGBRegressor(objective="reg:squarederror", tree_method="auto", random_state=42))])
    xgb_param_dist = {
        "model__n_estimators": [100, 200, 500],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__max_depth": [3, 5, 8],
        "model__subsample": [0.6, 0.8, 1.0],
        "model__colsample_bytree": [0.6, 0.8, 1.0]
    }
    print("\nTuning XGBoost...")
    xgb_search = RandomizedSearchCV(xgb_pipeline, xgb_param_dist, **RSCV_KWARGS)
    xgb_search.fit(X_train, y_train)
    best_xgb = xgb_search.best_estimator_
    print("XGB best params:", xgb_search.best_params_)
    results.append(evaluate_and_print("XGBoost", best_xgb, X_test, y_test))
else:
    print("XGBoost not available; skipping.")


if lgb is not None:
    lgb_pipeline = Pipeline(steps=[("pre", preprocessor),
                                   ("model", lgb.LGBMRegressor(random_state=42))])
    lgb_param_dist = {
        "model__n_estimators": [100, 200, 500],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__max_depth": [-1, 5, 8],
        "model__num_leaves": [31, 63, 127],
        "model__subsample": [0.6, 0.8, 1.0]
    }
    print("\nTuning LightGBM...")
    lgb_search = RandomizedSearchCV(lgb_pipeline, lgb_param_dist, **RSCV_KWARGS)
    lgb_search.fit(X_train, y_train)
    best_lgb = lgb_search.best_estimator_
    print("LGB best params:", lgb_search.best_params_)
    results.append(evaluate_and_print("LightGBM", best_lgb, X_test, y_test))
else:
    print("LightGBM not available; skipping.")


if CatBoostRegressor is not None:
 
    cat_indices = [X_train.columns.get_loc(c) for c in categorical_cols if c in X_train.columns]

    # Fit directly with Pool (CatBoost handles categorical features)
    cat_model = CatBoostRegressor(verbose=0, random_state=42)
    cat_param_dist = {
        "iterations": [200, 500],
        "learning_rate": [0.01, 0.05, 0.1],
        "depth": [4, 6, 8],
        "l2_leaf_reg": [1, 3, 7],
    }

    cat_pipeline = Pipeline(steps=[("pre", preprocessor),
                                   ("model", CatBoostRegressor(verbose=0, random_state=42))])
    # Map params names
    cat_param_dist_wrapped = {
        "model__iterations": cat_param_dist["iterations"],
        "model__learning_rate": cat_param_dist["learning_rate"],
        "model__depth": cat_param_dist["depth"],
        "model__l2_leaf_reg": cat_param_dist["l2_leaf_reg"],
    }
    print("\nTuning CatBoost...")
    cat_search = RandomizedSearchCV(cat_pipeline, cat_param_dist_wrapped, **RSCV_KWARGS)
    cat_search.fit(X_train, y_train)
    best_cat = cat_search.best_estimator_
    print("CatBoost best params:", cat_search.best_params_)
    results.append(evaluate_and_print("CatBoost", best_cat, X_test, y_test))
else:
    print("CatBoost not available; skipping.")


if tf is not None:
    try:
        from scikeras.wrappers import KerasRegressor
    except ImportError:
        raise ImportError("Please install SciKeras: pip install scikeras")

    print("\nTuning Neural Network (Keras via SciKeras)...")

    def build_model(n_hidden=1, n_neurons=32, learning_rate=0.001, dropout=0.0, input_dim=None):
        model = Sequential()
        model.add(Dense(n_neurons, activation="relu", input_shape=(input_dim,)))
        for _ in range(n_hidden - 1):
            model.add(Dense(n_neurons, activation="relu"))
            if dropout > 0:
                model.add(Dropout(dropout))
        model.add(Dense(1, activation="linear"))
        model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=["mae"])
        return model

    input_dim = X_train_prep.shape[1]
    keras_reg = KerasRegressor(
        model=build_model,
        model__input_dim=input_dim,
        epochs=50,
        batch_size=32,
        verbose=0
    )

    nn_pipeline = Pipeline(steps=[("pre", preprocessor),
                                  ("model", keras_reg)])

    nn_param_dist = {
        "model__model__n_hidden": [1, 2, 3],
        "model__model__n_neurons": [32, 64, 128],
        "model__model__learning_rate": [1e-3, 1e-4],
        "model__model__dropout": [0.0, 0.2],
        "model__epochs": [50, 100],
        "model__batch_size": [32, 64]
    }

    nn_search = RandomizedSearchCV(
        nn_pipeline,
        nn_param_dist,
        n_iter=5,
        cv=3,
        scoring="neg_mean_absolute_error",
        verbose=1,
        n_jobs=1,
        random_state=42
    )
    nn_search.fit(X_train, y_train)
    best_nn = nn_search.best_estimator_
    print("NN best params:", nn_search.best_params_)
    results.append(evaluate_and_print("NeuralNetwork", best_nn, X_test, y_test))
else:
    print("TensorFlow not available; skipping neural network.")

print("\nSummary of results:")
for r in results:
    print(r["model"], f"MAE={r['mae']:.4f}", f"RMSE={r['rmse']:.4f}", f"R2={r['r2']:.4f}")

# Save best models to disk
os.makedirs("models", exist_ok=True)
# For each trained model variable, save with joblib
to_save = {
    "best_gb.pkl": best_gb if 'best_gb' in globals() else None,
    "best_xgb.pkl": best_xgb if 'best_xgb' in globals() else None,
    "best_lgb.pkl": best_lgb if 'best_lgb' in globals() else None,
    "best_cat.pkl": best_cat if 'best_cat' in globals() else None,
    "best_nn.pkl": best_nn if 'best_nn' in globals() else None,
}
for fname, m in to_save.items():
    if m is not None:
        joblib.dump(m, os.path.join("models", fname))
        print("Saved", fname)


if shap is not None:

    tree_model = None
    tree_name = None
    for name, var in [("LightGBM", globals().get("best_lgb")),
                      ("XGBoost", globals().get("best_xgb")),
                      ("CatBoost", globals().get("best_cat")),
                      ("GradientBoosting", globals().get("best_gb"))]:
        if var is not None:
            tree_model = var
            tree_name = name
            break

    if tree_model is not None:
        print("\nRunning SHAP TreeExplainer on", tree_name)
        underlying_model = None
        if hasattr(tree_model, "named_steps"):
            # pipeline
            underlying_model = tree_model.named_steps["model"]
            # Build preprocessed training matrix to explain
            X_train_for_shap = preprocessor.transform(X_train)
            X_test_for_shap = preprocessor.transform(X_test)
        else:
            underlying_model = tree_model
            X_train_for_shap = X_train_prep
            X_test_for_shap = X_test_prep

        try:
            explainer = shap.TreeExplainer(underlying_model)
            shap_values = explainer.shap_values(X_test_for_shap)
            print("SHAP values computed.")
            # Summary plot (global)
            shap.summary_plot(shap_values, X_test_for_shap, feature_names=feature_names, show=True)
        except Exception as e:
            print("SHAP TreeExplainer failed:", e)
    else:
        # fallback: KernelExplainer on one model (slower)
        print("No tree model available for TreeExplainer. Attempting KernelExplainer on first available model.")
        any_model = None
        if 'best_gb' in globals():
            any_model = best_gb
        elif 'best_xgb' in globals():
            any_model = best_xgb
        elif 'best_lgb' in globals():
            any_model = best_lgb
        elif 'best_cat' in globals():
            any_model = best_cat
        elif 'best_nn' in globals():
            any_model = best_nn

        if any_model is not None and shap is not None:
    
            def pred_func(x):

                if hasattr(any_model, "predict"):
                    # if any_model is a pipeline (expects raw), wrap appropriately:
                    if hasattr(any_model, "named_steps"):
                        return any_model.predict(x)
                    else:
                        return any_model.predict(x)
                else:
                    raise RuntimeError("Model has no predict method")
            try:
                explainer = shap.KernelExplainer(pred_func, shap.sample(X_train_prep, min(100, X_train_prep.shape[0])))
                shap_values = explainer.shap_values(X_test_prep[:50])
                shap.summary_plot(shap_values, X_test_prep[:50], feature_names=feature_names, show=True)
            except Exception as e:
                print("SHAP KernelExplainer failed:", e)
else:
    print("\nSHAP not installed; skipping SHAP analysis.")

if LimeTabularExplainer is not None:
    print("\nLIME explanations for 3 test instances (if available):")
    # Use the processed training array and a prediction function mapping raw -> predicted continuous claim
    explainer = LimeTabularExplainer(training_data=np.array(X_train_prep),
                                     mode="regression",
                                     feature_names=feature_names,
                                     verbose=False)

    # Choose 3 instances from the test set to explain
    n_local = min(3, X_test_prep.shape[0])
    for i in range(n_local):
        idx = i
        exp = explainer.explain_instance(X_test_prep[idx], 
                                         lambda z: best_gb.predict(z) if 'best_gb' in globals() else best_xgb.predict(z),
                                         num_features=min(10, len(feature_names)))
        print(f"--- LIME explanation for test instance {i} ---")
        print(exp.as_list())
else:
    print("\nLIME not installed; skipping LIME explanations.")


if results:
    best = sorted(results, key=lambda x: x["mae"])[0]
    y_true = best["y_true"]
    y_pred = best["y_pred"]
    plt.figure(figsize=(6,6))
    plt.scatter(y_true, y_pred, alpha=0.3)
    plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], linestyle="--")
    plt.xlabel("Actual claim amount")
    plt.ylabel("Predicted claim amount")
    plt.title(f"{best['model']} predicted vs actual")
    plt.tight_layout()
    plt.show()





Initial shape: (200000, 3)
Columns: ['Claimant Age', 'Claimant Gender', 'Claim Amount']
Numeric cols: []
Categorical cols: ['Claimant Gender']
Train/test shapes: (160000, 2) (40000, 2)
Processed feature count: 3

Tuning GradientBoostingRegressor...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
GB best params: {'model__subsample': 0.8, 'model__n_estimators': 100, 'model__min_samples_split': 2, 'model__max_depth': 3, 'model__learning_rate': 0.05}
GradientBoosting: MAE=5424.8277, RMSE=20792.8523, R2=-0.0542

Tuning XGBoost...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
XGB best params: {'model__subsample': 0.8, 'model__n_estimators': 200, 'model__max_depth': 8, 'model__learning_rate': 0.1, 'model__colsample_bytree': 1.0}
XGBoost: MAE=5424.7143, RMSE=20792.9797, R2=-0.0542
LightGBM not available; skipping.

Tuning CatBoost...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
CatBoost best params: {'model__learning_rate': 0.01, 'model__l2_leaf_reg'