# In [2]:
# ----------------------------------------
# 1. Import Libraries and Load Dataset
# ----------------------------------------

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_curve, roc_auc_score, precision_recall_curve)
from sklearn.feature_selection import RFECV
import warnings
warnings.filterwarnings("ignore")

# Load data
# The path is relative to the current working directory; pd.read_csv raises
# FileNotFoundError when the CSV is not present there.
df = pd.read_csv("insurance_claims.csv")

# ----------------------------------------
# 2. Data Cleaning
# ----------------------------------------

# Drop empty column
df.drop(columns=['_c39'], inplace=True)

# Fix data types
# errors='coerce' turns unparseable date strings into NaT instead of raising.
df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'], errors='coerce')
df['incident_date'] = pd.to_datetime(df['incident_date'], errors='coerce')
# Encode the target as 1 for 'Y' and 0 for 'N'; any other label maps to NaN.
df['fraud_reported'] = df['fraud_reported'].map({'Y': 1, 'N': 0})

# Replace "?" with NaN
# np.nan (rather than pd.NA) is used on purpose: it is the default
# `missing_values` marker of sklearn's SimpleImputer, whereas pd.NA stored in
# object columns is not reliably recognised by sklearn and can raise
# "boolean value of NA is ambiguous" during imputation.
df.replace('?', np.nan, inplace=True)

# Drop redundant features
df.drop(columns=['policy_number', 'insured_zip', 'incident_location'], inplace=True)

# ----------------------------------------
# 3. Train-Validation Split
# ----------------------------------------

# Separate the target column from the predictor columns.
y = df['fraud_reported']
X = df.drop('fraud_reported', axis=1)

# Hold out 30% of the rows for validation. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split
# reproducible.
split_options = {'test_size': 0.3, 'stratify': y, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# ----------------------------------------
# 4. EDA on Training Data
# ----------------------------------------

# Univariate analysis: one histogram per numeric training feature.
numerical_cols = X_train.select_dtypes(include='number').columns
numeric_train = X_train[numerical_cols]
numeric_train.hist(bins=20, figsize=(15, 10))
plt.suptitle("Univariate Distribution of Numerical Features")
plt.tight_layout()
plt.show()

# Correlation heatmap: pairwise correlations between the numeric features.
corr_matrix = numeric_train.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()

# Class balance: count of each target label in the training partition.
sns.countplot(x=y_train)
plt.title("Class Distribution in Training Set")
plt.show()

# Bivariate analysis: distribution of every numeric feature per target class.
for col, values in numeric_train.items():
    sns.boxplot(x=y_train, y=values)
    plt.title(f"{col} vs Fraud")
    plt.show()

# ----------------------------------------
# 5. Feature Engineering
# ----------------------------------------

# Work on explicit copies so the column assignments below never write into
# slices of the frame that train_test_split was given.
X_train = X_train.copy()
X_val = X_val.copy()

# Expand datetime columns into numeric parts and drop the raw timestamps.
# pd.get_dummies leaves datetime columns untouched, and StandardScaler and the
# classifiers fitted later cannot consume datetime64 values.
date_cols = X_train.select_dtypes(include='datetime').columns
for col in date_cols:
    for frame in (X_train, X_val):
        frame[f"{col}_year"] = frame[col].dt.year
        frame[f"{col}_month"] = frame[col].dt.month
        frame[f"{col}_dayofweek"] = frame[col].dt.dayofweek
X_train = X_train.drop(columns=date_cols)
X_val = X_val.drop(columns=date_cols)

# Fill numeric gaps (e.g. date parts derived from NaT) with medians learned
# on the training partition only, so no validation statistics leak in.
num_cols = X_train.select_dtypes(include='number').columns
train_medians = X_train[num_cols].median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# Impute missing categorical with most frequent
cat_cols = X_train.select_dtypes(include='object').columns
imp = SimpleImputer(strategy='most_frequent')
if not cat_cols.empty:
    # Normalise every missing marker (pd.NA, None, NaN) to np.nan first:
    # np.nan is the marker SimpleImputer looks for by default.
    X_train[cat_cols] = X_train[cat_cols].where(X_train[cat_cols].notna(), np.nan)
    X_val[cat_cols] = X_val[cat_cols].where(X_val[cat_cols].notna(), np.nan)
    # Fit on training data only; reuse the learned modes for validation.
    X_train[cat_cols] = imp.fit_transform(X_train[cat_cols])
    X_val[cat_cols] = imp.transform(X_val[cat_cols])

# Combine rare values
# Levels covering less than 1% of the training rows are merged into "Rare";
# the same level list (learned on training data) is applied to validation.
for col in cat_cols:
    freq = X_train[col].value_counts(normalize=True)
    rare = freq[freq < 0.01].index
    X_train[col] = X_train[col].mask(X_train[col].isin(rare), "Rare")
    X_val[col] = X_val[col].mask(X_val[col].isin(rare), "Rare")

# One-hot encoding
X_train = pd.get_dummies(X_train, drop_first=True)
# Encode validation with ALL levels and then keep exactly the training
# columns. Using drop_first on the validation frame independently would drop
# the validation set's own first level, which can differ from the reference
# level dropped in training and would silently mis-encode those rows.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)

# Resampling to handle class imbalance
# The minority class (label 1) is sampled with replacement until it matches
# the size of the majority class (label 0).
# NOTE(review): the duplicated rows can fall on both sides of a CV split when
# this balanced frame is cross-validated later (RFECV / GridSearchCV), so
# those cross-validated scores are likely optimistic.
df_train = pd.concat([X_train, y_train], axis=1)
majority = df_train[df_train["fraud_reported"] == 0]
minority = df_train[df_train["fraud_reported"] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
train_balanced = pd.concat([majority, minority_upsampled])

X_train = train_balanced.drop("fraud_reported", axis=1)
y_train = train_balanced["fraud_reported"]

# Feature scaling
# The scaler is fitted on the (balanced) training features only and then
# applied unchanged to the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# ----------------------------------------
# 6. Model Building
# ----------------------------------------

# Feature selection using RFECV
# Recursive feature elimination removes one feature per step and scores each
# candidate subset with 5-fold cross-validated accuracy.
lr = LogisticRegression(max_iter=1000)
rfecv = RFECV(estimator=lr, cv=5, step=1, scoring='accuracy')
rfecv.fit(X_train_scaled, y_train)

# Select features
# support_ is the boolean mask of retained columns; apply the same mask to
# the column labels and to both scaled matrices.
support_mask = rfecv.support_
selected_features = X_train.columns[support_mask]
X_train_sel, X_val_sel = X_train_scaled[:, support_mask], X_val_scaled[:, support_mask]

# Logistic Regression
# Fit on the selected, scaled training features and evaluate on validation
# using the model's default decision rule.
lr.fit(X_train_sel, y_train)
y_pred_lr = lr.predict(X_val_sel)

print("Logistic Regression Report:")
print(classification_report(y_val, y_pred_lr))

# Optimal cutoff using ROC
y_proba_lr = lr.predict_proba(X_val_sel)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_proba_lr)

# Youden's J statistic (TPR - FPR) picks the ROC point farthest above the
# chance diagonal; roc_curve's i-th point corresponds to predicting the
# positive class when score >= thresholds[i].
best_idx = np.argmax(tpr - fpr)
optimal_cutoff = thresholds[best_idx]
y_pred_lr_opt = (y_proba_lr >= optimal_cutoff).astype(int)

plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.show()

print("AUC Score:", roc_auc_score(y_val, y_proba_lr))

# NOTE: the cutoff is chosen on the same validation data it is evaluated on,
# so the report below is an optimistic estimate.
print("Optimal cutoff (Youden's J):", optimal_cutoff)
print("Logistic Regression Report at optimal cutoff:")
print(classification_report(y_val, y_pred_lr_opt))

# Random Forest (Baseline)
# Default hyperparameters with a fixed seed; trained on the unscaled
# training features and scored on the validation features.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
print("Random Forest Report:")
rf_report = classification_report(y_val, y_pred_rf)
print(rf_report)

# Hyperparameter tuning
# Exhaustive search over a 2 x 2 x 2 grid, scored by F1 with 3-fold CV.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=3,
)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_

# Final prediction
# best_estimator_ is the forest refit with the best-scoring parameter set.
y_pred_best_rf = best_rf.predict(X_val)
tuned_report = classification_report(y_val, y_pred_best_rf)
print("Tuned Random Forest Report:")
print(tuned_report)


# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: 'insurance_claims.csv'