In [None]:

# Install required libraries
!pip install pandas numpy scikit-learn imbalanced-learn xgboost



In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder,StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Load dataset (upload your file to Colab and update the path)
data = pd.read_csv('/content/carclaims.csv')

# Convert 'FraudFound' to binary (Yes=1, No=0)
data['FraudFound'] = data['FraudFound'].map({'Yes': 1, 'No': 0})

# Separate features and target
X = data.drop('FraudFound', axis=1)
y = data['FraudFound']

# Handle categorical variables with one-hot encoding
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check initial class distribution
print("Initial Class Distribution:")
print(y_train.value_counts())

Initial Class Distribution:
FraudFound
0    10148
1      646
Name: count, dtype: int64


In [None]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("\nAfter SMOTE:")
print(pd.Series(y_train_smote).value_counts())

# Apply ADASYN
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print("\nAfter ADASYN:")
print(pd.Series(y_train_adasyn).value_counts())


After SMOTE:
FraudFound
0    10148
1    10148
Name: count, dtype: int64

After ADASYN:
FraudFound
0    10148
1     9931
Name: count, dtype: int64


In [None]:
# Define base models
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Balanced Bagging": BalancedBaggingClassifier(estimator=RandomForestClassifier(n_estimators=10),
                                                  n_estimators=10, random_state=42)
}

In [None]:
# Define k-fold cross-validation (reduced to 3 folds for speed)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Function to evaluate model with cross-validation
def evaluate_model(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold, name, balancing_method="SMOTE"):
    # Apply balancing within the fold
    if balancing_method == "SMOTE":
        smote = SMOTE(random_state=42)
        X_bal, y_bal = smote.fit_resample(X_train_fold, y_train_fold)
    elif balancing_method == "ADASYN":
        adasyn = ADASYN(random_state=42)
        X_bal, y_bal = adasyn.fit_resample(X_train_fold, y_train_fold)
    else:  # For Balanced Bagging or no balancing
        X_bal, y_bal = X_train_fold, y_train_fold

    # Train on balanced fold data
    model.fit(X_bal, y_bal)

    # Evaluate on validation fold (original data)
    y_pred_val = model.predict(X_val_fold)
    y_pred_proba_val = model.predict_proba(X_val_fold)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val_fold)

    return {
        "Model": name,
        "Balancing": balancing_method,
        "Accuracy": accuracy_score(y_val_fold, y_pred_val),
        "Precision": precision_score(y_val_fold, y_pred_val),
        "Recall": recall_score(y_val_fold, y_pred_val),
        "F1-Score": f1_score(y_val_fold, y_pred_val),
        "AUC-PR": roc_auc_score(y_val_fold, y_pred_proba_val)
    }

# Evaluate models with SMOTE only (for speed; add ADASYN later if needed)
results = []
for name, model in models.items():
    fold_results = []
    for train_index, val_index in skf.split(X_train, y_train):
        # Convert indices to integer type for NumPy array indexing
        train_index = train_index.astype(int)
        val_index = val_index.astype(int)

        # Ensure X_train is a NumPy array for integer indexing
        X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
        X_train_fold, X_val_fold = X_train_np[train_index], X_train_np[val_index]

        # Ensure y_train is a pandas Series or NumPy array and index properly
        y_train_series = y_train if isinstance(y_train, pd.Series) else pd.Series(y_train)
        y_train_fold, y_val_fold = y_train_series.iloc[train_index], y_train_series.iloc[val_index]

        # Evaluate with appropriate balancing method
        if name == "Balanced Bagging":
            result = evaluate_model(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold, name, "None")
        else:
            result = evaluate_model(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold, name, "SMOTE")
        fold_results.append(result)

    # Average results across folds
    avg_result = {
        "Model": name,
        "Balancing": "None" if name == "Balanced Bagging" else "SMOTE",
        "Accuracy": np.mean([r["Accuracy"] for r in fold_results]),
        "Precision": np.mean([r["Precision"] for r in fold_results]),
        "Recall": np.mean([r["Recall"] for r in fold_results]),
        "F1-Score": np.mean([r["F1-Score"] for r in fold_results]),
        "AUC-PR": np.mean([r["AUC-PR"] for r in fold_results])
    }
    results.append(avg_result)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nCross-Validation Results (Averaged Over Folds):")
print(results_df)


Cross-Validation Results (Averaged Over Folds):
                 Model Balancing  Accuracy  Precision    Recall  F1-Score  \
0  Logistic Regression     SMOTE  0.695016   0.134737  0.755434  0.228685   
1        Random Forest     SMOTE  0.939596   0.375000  0.004644  0.009121   
2              XGBoost     SMOTE  0.948027   0.727076  0.210515  0.326479   
3                  SVM     SMOTE  0.905688   0.198281  0.188831  0.193177   
4     Balanced Bagging      None  0.716972   0.149381  0.794050  0.251400   

     AUC-PR  
0  0.791316  
1  0.798214  
2  0.935311  
3  0.758967  
4  0.820920  


In [None]:
# Train models on full SMOTE-balanced data and evaluate on test set
test_results = {}
for name, model in models.items():
    if name == "Balanced Bagging":
        model.fit(X_train, y_train)  # Balanced Bagging uses original data
    else:
        model.fit(X_train_smote, y_train_smote)  # Others use SMOTE data

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)

    test_results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "AUC-PR": roc_auc_score(y_test, y_pred_proba)
    }
    print(f"\n{name} Test Set Results:")
    for metric, value in test_results[name].items():
        print(f"{metric}: {value:.4f}")

# Convert to DataFrame
test_results_df = pd.DataFrame(test_results).T
print("\nTest Set Comparison:")
print(test_results_df)


Logistic Regression Test Set Results:
Accuracy: 0.6747
Precision: 0.1292
Recall: 0.7726
F1-Score: 0.2214
AUC-PR: 0.7927

Random Forest Test Set Results:
Accuracy: 0.9401
Precision: 0.5000
Recall: 0.0108
F1-Score: 0.0212
AUC-PR: 0.8058

XGBoost Test Set Results:
Accuracy: 0.9589
Precision: 0.8537
Recall: 0.3791
F1-Score: 0.5250
AUC-PR: 0.9685

SVM Test Set Results:
Accuracy: 0.8962
Precision: 0.1896
Recall: 0.2238
F1-Score: 0.2053
AUC-PR: 0.7795

Balanced Bagging Test Set Results:
Accuracy: 0.7198
Precision: 0.1555
Recall: 0.8303
F1-Score: 0.2620
AUC-PR: 0.8384

Test Set Comparison:
                     Accuracy  Precision    Recall  F1-Score    AUC-PR
Logistic Regression  0.674665   0.129227  0.772563  0.221417  0.792735
Random Forest        0.940121   0.500000  0.010830  0.021201  0.805822
XGBoost              0.958928   0.853659  0.379061  0.525000  0.968544
SVM                  0.896239   0.189602  0.223827  0.205298  0.779503
Balanced Bagging     0.719844   0.155510  0.830325  0.2

In [None]:
# Assume `model` is your trained model and `df_scaled` is your input
prediction = model.predict(df_scaled)[0]  # 0 or 1
confidence = model.predict_proba(df_scaled)[0][1]  # Probability of class '1' (fraud)


NameError: name 'df_scaled' is not defined

In [None]:
import pickle

# Save all models to a single pickle file
with open('trained_models.pkl', 'wb') as f:
    pickle.dump(models, f)


In [None]:
# Define ensemble
ensemble = VotingClassifier(
    estimators=[
        ('lr', models["Logistic Regression"]),
        ('rf', models["Random Forest"]),
        ('xgb', models["XGBoost"]),
        ('svm', models["SVM"]),
        ('bb', models["Balanced Bagging"])
    ],
    voting='soft'
)

# Train ensemble on SMOTE-balanced data
ensemble.fit(X_train_smote, y_train_smote)

# Evaluate on test set
y_pred_ensemble = ensemble.predict(X_test)
y_pred_proba_ensemble = ensemble.predict_proba(X_test)[:, 1]

ensemble_results = {
    "Accuracy": accuracy_score(y_test, y_pred_ensemble),
    "Precision": precision_score(y_test, y_pred_ensemble),
    "Recall": recall_score(y_test, y_pred_ensemble),
    "F1-Score": f1_score(y_test, y_pred_ensemble),
    "AUC-PR": roc_auc_score(y_test, y_pred_proba_ensemble)
}

print("\nEnsemble (Voting Classifier) Test Set Results:")
for metric, value in ensemble_results.items():
    print(f"{metric}: {value:.4f}")


Ensemble (Voting Classifier) Test Set Results:
Accuracy: 0.9406
Precision: 0.5227
Recall: 0.0830
F1-Score: 0.1433
AUC-PR: 0.8839


In [None]:
# Define ensemble with weights
ensemble_weighted = VotingClassifier(
    estimators=[
        ('lr', models["Logistic Regression"]),
        ('rf', models["Random Forest"]),
        ('xgb', models["XGBoost"]),
        ('svm', models["SVM"]),
        ('bb', models["Balanced Bagging"])
    ],
    voting='soft',
    weights=[0.1, 0.3, 0.5, 0.05, 0.05]  # Higher weight for XGBoost (0.5), RF (0.3)
)

# Train ensemble on SMOTE-balanced data
ensemble_weighted.fit(X_train_smote, y_train_smote)

# Evaluate on test set
y_pred_ensemble = ensemble_weighted.predict(X_test)
y_pred_proba_ensemble = ensemble_weighted.predict_proba(X_test)[:, 1]

ensemble_results = {
    "Accuracy": accuracy_score(y_test, y_pred_ensemble),
    "Precision": precision_score(y_test, y_pred_ensemble),
    "Recall": recall_score(y_test, y_pred_ensemble),
    "F1-Score": f1_score(y_test, y_pred_ensemble),
    "AUC-PR": roc_auc_score(y_test, y_pred_proba_ensemble)
}

print("\nWeighted Ensemble (Voting Classifier) Test Set Results:")
for metric, value in ensemble_results.items():
    print(f"{metric}: {value:.4f}")


Weighted Ensemble (Voting Classifier) Test Set Results:
Accuracy: 0.9488
Precision: 0.8846
Recall: 0.1661
F1-Score: 0.2796
AUC-PR: 0.9367


In [None]:
# Define ensemble with only XGBoost and Random Forest
ensemble_subset = VotingClassifier(
    estimators=[
        ('rf', models["Random Forest"]),
        ('xgb', models["XGBoost"])
    ],
    voting='soft',
    weights=[0.3, 0.7]  # Favor XGBoost slightly more
)

# Train and evaluate
ensemble_subset.fit(X_train_smote, y_train_smote)
y_pred_ensemble = ensemble_subset.predict(X_test)
y_pred_proba_ensemble = ensemble_subset.predict_proba(X_test)[:, 1]

ensemble_results = {
    "Accuracy": accuracy_score(y_test, y_pred_ensemble),
    "Precision": precision_score(y_test, y_pred_ensemble),
    "Recall": recall_score(y_test, y_pred_ensemble),
    "F1-Score": f1_score(y_test, y_pred_ensemble),
    "AUC-PR": roc_auc_score(y_test, y_pred_proba_ensemble)
}

print("\nSubset Ensemble (XGBoost + RF) Test Set Results:")
for metric, value in ensemble_results.items():
    print(f"{metric}: {value:.4f}")


Subset Ensemble (XGBoost + RF) Test Set Results:
Accuracy: 0.9516
Precision: 0.8841
Recall: 0.2202
F1-Score: 0.3526
AUC-PR: 0.9522


#SHAP

In [None]:
!pip install shap lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=1b92d1414859e495faf8ff38394807dab270c486bea0484d81a4906112f98d2b
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
import lime
import lime.lime_tabular
import numpy as np

# Create LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X.columns.tolist(),
    class_names=np.unique(y_train).astype(str),
    mode='classification'
)

# Loop through your models
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    print(f"\nExplaining prediction with LIME for {name}...")
    i = 0  # Index of the test instance to explain
    exp = explainer.explain_instance(
        data_row=X_test[i],
        predict_fn=model.predict_proba,
        num_features=5
    )

    # Text explanation
    print(f"\nTextual explanation for model: {name}")
    for feature, weight in exp.as_list():
        direction = "increased" if weight > 0 else "decreased"
        print(f" - {feature} {direction} the prediction confidence by {abs(weight):.4f}")

    # Optional: show HTML visualization (in Jupyter Notebook)
    # exp.show_in_notebook()

    print("\n" + "-" * 60 + "\n")



Training Logistic Regression...

Explaining prediction with LIME for Logistic Regression...

Textual explanation for model: Logistic Regression
 - PolicyType_Sport - Collision <= -0.15 decreased the prediction confidence by 0.0699
 - AddressChange-Claim_2 to 3 years <= -0.14 decreased the prediction confidence by 0.0433
 - AgeOfVehicle_5 years <= -0.32 decreased the prediction confidence by 0.0252
 - Fault_Third Party <= -0.61 increased the prediction confidence by 0.0210
 - MaritalStatus_Single <= -0.67 decreased the prediction confidence by 0.0158

------------------------------------------------------------


Training Random Forest...

Explaining prediction with LIME for Random Forest...

Textual explanation for model: Random Forest
 - Fault_Third Party <= -0.61 increased the prediction confidence by 0.0728
 - AddressChange-Claim_2 to 3 years <= -0.14 decreased the prediction confidence by 0.0474
 - -0.70 < BasePolicy_Liability <= 1.44 decreased the prediction confidence by 0.0350
