In [1]:
import pandas as pd

# Read the model-ready insurance table from disk; the trailing expression
# shows its (n_rows, n_columns) shape as the cell output.
final_df = pd.read_csv(filepath_or_buffer="insurance_model_ready.csv")
final_df.shape

(10000, 38)

In [2]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the CLAIM_STATUS labels; every remaining column is a feature.
le = LabelEncoder()
X = final_df.drop(columns=["CLAIM_STATUS"])
y = le.fit_transform(final_df["CLAIM_STATUS"])

# Display both shapes to confirm features and target have the same row count.
(X.shape, y.shape)

((10000, 37), (10000,))

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(X_train.shape, X_test.shape)

((8000, 37), (2000, 37))

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Object-dtype columns are treated as categorical; everything else as numeric.
num_cols = X_train.select_dtypes(exclude="object").columns
cat_cols = X_train.select_dtypes(include="object").columns

# Standardise the numeric columns and one-hot encode the categorical ones.
# Categories not seen during fit are ignored at transform time.
num_step = ("num", StandardScaler(), num_cols)
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer(transformers=[num_step, cat_step])

# Logistic regression with balanced class weights behind the preprocessor.
balanced_lr = LogisticRegression(max_iter=1000, class_weight="balanced")
model = Pipeline(steps=[("preprocess", preprocessor), ("clf", balanced_lr)])


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Train the scaled logistic-regression pipeline assembled in the previous cell.
# It is deliberately not rebuilt here: a rebuild would look up `preprocessor`
# at run time, so re-running this cell after `preprocessor` is rebound further
# down the notebook would silently train a different model.
model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Accuracy alone is misleading here because roughly 95% of the rows belong to
# one class; the per-class report is the number to read.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.946
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1899
           1       0.00      0.00      0.00       101

    accuracy                           0.95      2000
   macro avg       0.47      0.50      0.49      2000
weighted avg       0.90      0.95      0.92      2000



In [6]:
# Fraction of rows per CLAIM_STATUS value across the whole dataset.
final_df.loc[:, "CLAIM_STATUS"].value_counts(normalize=True)

CLAIM_STATUS
A    0.9497
D    0.0503
Name: proportion, dtype: float64

In [7]:
import numpy as np

# Print the distinct label values and their counts for each split.
for caption, labels in (
    ("y_train distribution:", y_train),
    ("y_test distribution:", y_test),
):
    print(caption, np.unique(labels, return_counts=True))

y_train distribution: (array([0, 1]), array([7598,  402]))
y_test distribution: (array([0, 1]), array([1899,  101]))


In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder on the training labels only, then apply the same mapping
# to the test labels. NOTE: this rebinds `le`, replacing any encoder previously
# stored under that name.
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

print("Classes:", le.classes_)

Classes: [0 1]


In [9]:
# The target is the raw CLAIM_STATUS column; the features are everything else.
target_col = "CLAIM_STATUS"
y = final_df[target_col]
X = final_df.drop(columns=[target_col])

In [10]:
# Features / target from the raw frame (repeats the assignment made in the
# preceding cell).
X, y = final_df.drop(columns="CLAIM_STATUS"), final_df["CLAIM_STATUS"]

In [11]:
from sklearn.model_selection import train_test_split

# Re-split the current X and y: 80/20, stratified on y, fixed seed.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns (sparse encoder output, unseen
# categories ignored) and forward the numeric columns unchanged -- this
# preprocessor does no scaling. NOTE: rebinds the name `preprocessor`.
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
preprocessor = ColumnTransformer(
    transformers=[("cat", onehot, cat_cols), ("num", "passthrough", num_cols)]
)

# Learn the encoding from the training split only, then reuse it on the test split.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest trained directly on the pre-encoded matrices with the encoded
# labels; class weights are set to "balanced".
rf_model = RandomForestClassifier(
    class_weight="balanced", n_estimators=200, random_state=42
).fit(X_train_enc, y_train_enc)

y_pred_rf = rf_model.predict(X_test_enc)

rf_report = classification_report(y_test_enc, y_pred_rf, zero_division=0)
print(rf_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1899
           1       0.00      0.00      0.00       101

    accuracy                           0.95      2000
   macro avg       0.47      0.50      0.49      2000
weighted avg       0.90      0.95      0.92      2000



In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Decision tree behind the current `preprocessor`, trained on the raw frames.
tree_clf = DecisionTreeClassifier(random_state=42)
dt_model = Pipeline(steps=[("preprocess", preprocessor), ("clf", tree_clf)])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)

In [15]:
# Absolute number of training rows per class.
y_train.value_counts(normalize=False)

CLAIM_STATUS
A    7598
D     402
Name: count, dtype: int64

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Random forest with hand-set class weights: class "D" is weighted 5x class "A".
weighted_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    class_weight={"A": 1, "D": 5},  # D = fraud (minority class)
    random_state=42,
    n_jobs=-1,
)
rf_model = Pipeline(steps=[("preprocess", preprocessor), ("clf", weighted_forest)])

# NOTE: rebinds `rf_model` and `y_pred_rf` to the pipeline-based forest.
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 5-nearest-neighbours classifier behind the current `preprocessor`.
# NOTE(review): if `preprocessor` is the passthrough variant defined earlier in
# this notebook, neighbour distances are computed on unscaled numeric columns
# -- consider a scaling preprocessor here.
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_model = Pipeline(steps=[("preprocess", preprocessor), ("clf", knn_clf)])

knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Header line followed by the per-class report.
knn_report = classification_report(y_test, y_pred_knn, zero_division=0)
print("KNN Results:", knn_report, sep="\n")

KNN Results:
              precision    recall  f1-score   support

           A       0.95      1.00      0.97      1899
           D       0.00      0.00      0.00       101

    accuracy                           0.95      2000
   macro avg       0.47      0.50      0.49      2000
weighted avg       0.90      0.95      0.92      2000



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Balanced-weight logistic regression behind the current `preprocessor`.
# NOTE(review): if `preprocessor` is the passthrough variant defined earlier in
# this notebook, the solver sees unscaled numeric columns -- confirm that is
# intended, since the first logistic-regression pipeline standardised them.
lr_clf = LogisticRegression(max_iter=1000, class_weight="balanced")
lr_model = Pipeline(steps=[("preprocess", preprocessor), ("clf", lr_clf)])

lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Header line followed by the per-class report.
lr_report = classification_report(y_test, y_pred_lr, zero_division=0)
print("Logistic Regression Results:", lr_report, sep="\n")



Logistic Regression Results:
              precision    recall  f1-score   support

           A       0.94      0.18      0.31      1899
           D       0.05      0.79      0.09       101

    accuracy                           0.21      2000
   macro avg       0.50      0.49      0.20      2000
weighted avg       0.90      0.21      0.30      2000



In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column selection mirrors the notebook's first preprocessing cell: object
# columns are categorical and every other dtype is numeric. Selecting numerics
# with include=["int64", "float64"] would leave any bool / int32 / float32
# column in neither list, and ColumnTransformer drops unlisted columns by
# default (remainder="drop"), so such features would silently vanish.
num_features = X_train.select_dtypes(exclude=["object"]).columns
cat_features = X_train.select_dtypes(include=["object"]).columns

# Standardise numerics and one-hot encode categoricals with a dense encoder
# output (sparse_output=False); unseen categories are ignored at transform time.
preprocessor_nb = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features)
    ]
)

In [20]:
from sklearn.svm import SVC

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# Gaussian naive Bayes behind the dense `preprocessor_nb` transformer.
nb_model = Pipeline(steps=[
    ("preprocess", preprocessor_nb),
    ("clf", GaussianNB())
])

nb_model.fit(X_train, y_train)

# predict() returns labels in the same label space the pipeline was fitted on
# (the values of y_train), so no decoding step is needed. Mapping the
# predictions through {0: "A", 1: "D"} would turn every "A"/"D" string into
# NaN, because none of them matches an integer key.
y_pred_nb = nb_model.predict(X_test)

In [22]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then apply the
# same mapping to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [23]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# RBF-kernel SVM with balanced class weights, trained directly on the
# pre-encoded feature matrices and the integer-encoded labels.
# NOTE(review): if X_train_enc comes from a preprocessor that passes numeric
# columns through unscaled, the RBF kernel sees unscaled features -- confirm.
svm_model = SVC(kernel="rbf", class_weight="balanced").fit(X_train_enc, y_train_enc)
y_pred_svm = svm_model.predict(X_test_enc)

# Header line followed by the per-class report.
svm_report = classification_report(y_test_enc, y_pred_svm, zero_division=0)
print("SVM Results:", svm_report, sep="\n")

SVM Results:
              precision    recall  f1-score   support

           0       0.95      0.53      0.68      1899
           1       0.05      0.45      0.09       101

    accuracy                           0.52      2000
   macro avg       0.50      0.49      0.38      2000
weighted avg       0.90      0.52      0.65      2000



In [24]:
# Pipeline models carry their own preprocessing step, so they are given the
# raw X_test frame. Predictions are produced in the order the models are listed.
(
    y_pred_dt_enc,
    y_pred_rf_enc,
    y_pred_knn_enc,
    y_pred_lr_enc,
    y_pred_nb_enc,
) = (
    fitted.predict(X_test)
    for fitted in (dt_model, rf_model, knn_model, lr_model, nb_model)
)

# The SVM was fitted on the pre-encoded matrix, so it is given X_test_enc.
y_pred_svm = svm_model.predict(X_test_enc)

In [25]:
# Decode the SVM's integer predictions back to the original class labels.
# Predicting afresh, rather than decoding y_pred_svm in place, keeps this cell
# idempotent: a second run of an in-place decode would pass already-decoded
# labels to inverse_transform, which rejects values it has not seen.
y_pred_svm = label_encoder.inverse_transform(svm_model.predict(X_test_enc))

In [26]:
# Test-set predictions keyed by display name; insertion order is the order in
# which the models are reported.
models = dict(
    [
        ("Decision Tree", y_pred_dt_enc),
        ("Random Forest", y_pred_rf_enc),
        ("KNN", y_pred_knn_enc),
        ("Logistic Regression", y_pred_lr_enc),
        ("Naive Bayes", y_pred_nb_enc),
        ("SVM", y_pred_svm),
    ]
)

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metrics that score the "D" class as the positive label, in print order.
positive_class_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
)

# For each model: overall accuracy first, then the "D"-class metrics.
# zero_division=0 reports 0 instead of warning when "D" is never predicted.
for name, y_pred in models.items():
    print(f"\n{name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    for label, scorer in positive_class_metrics:
        print(label, scorer(y_test, y_pred, pos_label="D", zero_division=0))


Decision Tree Performance:
Accuracy: 0.9375
Precision: 0.038461538461538464
Recall: 0.009900990099009901
F1-score: 0.015748031496062992

Random Forest Performance:
Accuracy: 0.9495
Precision: 0.0
Recall: 0.0
F1-score: 0.0

KNN Performance:
Accuracy: 0.9495
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Logistic Regression Performance:
Accuracy: 0.2145
Precision: 0.049079754601226995
Recall: 0.7920792079207921
F1-score: 0.09243212016175621

Naive Bayes Performance:
Accuracy: 0.944
Precision: 0.0
Recall: 0.0
F1-score: 0.0

SVM Performance:
Accuracy: 0.5225
Precision: 0.04766949152542373
Recall: 0.44554455445544555
F1-score: 0.0861244019138756
