In [22]:
import pandas as pd
import faker
import random
from datetime import datetime, timedelta

# Initialize Faker
fake = faker.Faker()

# Define the number of rows
num_rows = 500

# Generate synthetic data
data = {
    "Claim_ID": [fake.uuid4() for _ in range(num_rows)],
    "Claim_Date": [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_rows)],
    "Customer_ID": [fake.uuid4() for _ in range(num_rows)],
    "Claim_Amount": [round(random.uniform(100, 10000), 2) for _ in range(num_rows)],
    "Claim_Type": [random.choice(["Medical", "Auto", "Home", "Life"]) for _ in range(num_rows)],
    "Suspicious_Flags": [random.choice([0, 1]) for _ in range(num_rows)],
    "Fraud_Label": [random.choice([0, 1]) for _ in range(num_rows)],
    # Adding 'annual_income' column
    "annual_income": [round(random.uniform(30000, 150000), 2) for _ in range(num_rows)]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("fraudulent_claims_dataset.csv", index=False)

print("Dataset generated and saved as 'fraudulent_claims_dataset.csv'.")

Dataset generated and saved as 'fraudulent_claims_dataset.csv'.


In [23]:
df = pd.read_csv("fraudulent_claims_dataset.csv")
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income
0,a18ee3ff-5da8-4c09-a717-5151ee79d3c1,2025-04-06,2d4ffb3d-a65d-447b-95ab-cbd22f728dcf,3059.19,Life,0,0,131636.53
1,5edeaf76-199a-498b-a1b5-e650fa8d2484,2025-03-23,82e73154-c81a-4430-a35b-8bc534ce49f1,1893.61,Auto,0,0,63289.98
2,77736e6e-ef5d-4512-bbd5-35434069ee68,2024-11-16,8a82314f-acf4-4032-a615-20f14c8839d0,5575.98,Auto,1,0,103404.7
3,a7a2b0fb-be7d-4943-9c54-aa11183e7c6c,2024-10-04,73796ecd-eb04-4026-ad39-9a46efb696be,6669.31,Auto,1,1,128522.15
4,73d251bb-5d47-4f40-a0dd-c878886da9a8,2024-10-01,71b69884-69bd-45f7-9e84-e8e0f4c2da35,8058.19,Medical,0,1,97803.19


In [24]:
# Feature: Claim-to-Income Ratio (Assume Annual_Income column exists)
df["Claim_to_Income_Ratio"] = df["Claim_Amount"] / df["annual_income"]
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio
0,a18ee3ff-5da8-4c09-a717-5151ee79d3c1,2025-04-06,2d4ffb3d-a65d-447b-95ab-cbd22f728dcf,3059.19,Life,0,0,131636.53,0.02324
1,5edeaf76-199a-498b-a1b5-e650fa8d2484,2025-03-23,82e73154-c81a-4430-a35b-8bc534ce49f1,1893.61,Auto,0,0,63289.98,0.02992
2,77736e6e-ef5d-4512-bbd5-35434069ee68,2024-11-16,8a82314f-acf4-4032-a615-20f14c8839d0,5575.98,Auto,1,0,103404.7,0.053924
3,a7a2b0fb-be7d-4943-9c54-aa11183e7c6c,2024-10-04,73796ecd-eb04-4026-ad39-9a46efb696be,6669.31,Auto,1,1,128522.15,0.051892
4,73d251bb-5d47-4f40-a0dd-c878886da9a8,2024-10-01,71b69884-69bd-45f7-9e84-e8e0f4c2da35,8058.19,Medical,0,1,97803.19,0.082392


In [25]:
today = pd.to_datetime(datetime.now().date())

# Convert 'Claim_Date' to datetime objects
df['Claim_Date'] = pd.to_datetime(df['Claim_Date'])

# Calculate the difference between today and 'Claim_Date' in days
df['Days_Since_Issuance'] = (today - df['Claim_Date']).dt.days

# Select numerical features for anomaly detection
features = ["Claim_Amount", "Claim_to_Income_Ratio", "Days_Since_Issuance"]
df_selected = df[features]

In [26]:
# 1️⃣ **Elliptic Envelope** (Assumes Gaussian distribution)
from sklearn.covariance import EllipticEnvelope
elliptic = EllipticEnvelope(contamination=0.05)  # 5% contamination rate
df["Elliptic_Outlier"] = elliptic.fit_predict(df_selected)


In [27]:

# 2️⃣ **Isolation Forest** (Randomly isolates anomalies)
from sklearn.ensemble import IsolationForest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df["IsoForest_Outlier"] = iso_forest.fit_predict(df_selected)

In [28]:
# 3️⃣ **Local Outlier Factor (LOF)** (Detects local anomalies)
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
df["LOF_Outlier"] = lof.fit_predict(df_selected)

In [29]:
# Convert predictions (-1 = outlier, 1 = normal) to binary (1 = suspicious)
df["Elliptic_Outlier"] = df["Elliptic_Outlier"].apply(lambda x: 1 if x == -1 else 0)
df["IsoForest_Outlier"] = df["IsoForest_Outlier"].apply(lambda x: 1 if x == -1 else 0)
df["LOF_Outlier"] = df["LOF_Outlier"].apply(lambda x: 1 if x == -1 else 0)

In [30]:
# 4️⃣ **Final Suspicious Tag**
df["Anomaly_Flag"] = df[["Elliptic_Outlier", "IsoForest_Outlier", "LOF_Outlier"]].max(axis=1)

In [31]:

# Filter suspicious claims
suspicious_claims = df[df["Anomaly_Flag"] == 1]

# Save suspicious claims
suspicious_claims.to_csv("suspicious_claims_detected.csv", index=False)

print(f"{len(suspicious_claims)} suspicious claims identified and saved.")


54 suspicious claims identified and saved.


In [32]:
# Set Fraud_Label to 1 for suspicious claims and 0 for others
df["Fraud_Label"] = df["Anomaly_Flag"].apply(lambda x: 1 if x == 1 else 0)

# Save updated dataset
df.to_csv("updated_insurance_claims.csv", index=False)

print("Fraud_Label column updated successfully.")

Fraud_Label column updated successfully.


In [33]:
df.head()

Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Claim_Type,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag
0,a18ee3ff-5da8-4c09-a717-5151ee79d3c1,2025-04-06,2d4ffb3d-a65d-447b-95ab-cbd22f728dcf,3059.19,Life,0,0,131636.53,0.02324,144,0,0,0,0
1,5edeaf76-199a-498b-a1b5-e650fa8d2484,2025-03-23,82e73154-c81a-4430-a35b-8bc534ce49f1,1893.61,Auto,0,0,63289.98,0.02992,158,0,0,0,0
2,77736e6e-ef5d-4512-bbd5-35434069ee68,2024-11-16,8a82314f-acf4-4032-a615-20f14c8839d0,5575.98,Auto,1,0,103404.7,0.053924,285,0,0,0,0
3,a7a2b0fb-be7d-4943-9c54-aa11183e7c6c,2024-10-04,73796ecd-eb04-4026-ad39-9a46efb696be,6669.31,Auto,1,0,128522.15,0.051892,328,0,0,0,0
4,73d251bb-5d47-4f40-a0dd-c878886da9a8,2024-10-01,71b69884-69bd-45f7-9e84-e8e0f4c2da35,8058.19,Medical,0,0,97803.19,0.082392,331,0,0,0,0


In [34]:
df["Claim_Type"].unique()

array(['Life', 'Auto', 'Medical', 'Home'], dtype=object)

In [35]:
df = pd.get_dummies(df, columns=["Claim_Type"], prefix="Claim")
df.head()


Unnamed: 0,Claim_ID,Claim_Date,Customer_ID,Claim_Amount,Suspicious_Flags,Fraud_Label,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag,Claim_Auto,Claim_Home,Claim_Life,Claim_Medical
0,a18ee3ff-5da8-4c09-a717-5151ee79d3c1,2025-04-06,2d4ffb3d-a65d-447b-95ab-cbd22f728dcf,3059.19,0,0,131636.53,0.02324,144,0,0,0,0,False,False,True,False
1,5edeaf76-199a-498b-a1b5-e650fa8d2484,2025-03-23,82e73154-c81a-4430-a35b-8bc534ce49f1,1893.61,0,0,63289.98,0.02992,158,0,0,0,0,True,False,False,False
2,77736e6e-ef5d-4512-bbd5-35434069ee68,2024-11-16,8a82314f-acf4-4032-a615-20f14c8839d0,5575.98,1,0,103404.7,0.053924,285,0,0,0,0,True,False,False,False
3,a7a2b0fb-be7d-4943-9c54-aa11183e7c6c,2024-10-04,73796ecd-eb04-4026-ad39-9a46efb696be,6669.31,1,0,128522.15,0.051892,328,0,0,0,0,True,False,False,False
4,73d251bb-5d47-4f40-a0dd-c878886da9a8,2024-10-01,71b69884-69bd-45f7-9e84-e8e0f4c2da35,8058.19,0,0,97803.19,0.082392,331,0,0,0,0,False,False,False,True


In [36]:
# Add 'Policy_Issuance_Date' column if it doesn't exist
if 'Policy_Issuance_Date' not in df.columns:
    df['Policy_Issuance_Date'] = [fake.date_between(start_date='-5y', end_date='-1y') for _ in range(len(df))] # Generate random issuance dates between 5 years and 1 year ago

# Convert 'Policy_Issuance_Date' to datetime objects
df["Policy_Issuance_Date"] = pd.to_datetime(df["Policy_Issuance_Date"], errors='coerce')
df["Policy_Issuance_Date"] = pd.to_datetime(df["Policy_Issuance_Date"], errors='coerce')


In [37]:
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"], errors='coerce')
print(df.columns)


Index(['Claim_ID', 'Claim_Date', 'Customer_ID', 'Claim_Amount',
       'Suspicious_Flags', 'Fraud_Label', 'annual_income',
       'Claim_to_Income_Ratio', 'Days_Since_Issuance', 'Elliptic_Outlier',
       'IsoForest_Outlier', 'LOF_Outlier', 'Anomaly_Flag', 'Claim_Auto',
       'Claim_Home', 'Claim_Life', 'Claim_Medical', 'Policy_Issuance_Date'],
      dtype='object')


In [38]:
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces

In [39]:
print(df.dtypes)  # Identify problematic columns


Claim_ID                         object
Claim_Date               datetime64[ns]
Customer_ID                      object
Claim_Amount                    float64
Suspicious_Flags                  int64
Fraud_Label                       int64
annual_income                   float64
Claim_to_Income_Ratio           float64
Days_Since_Issuance               int64
Elliptic_Outlier                  int64
IsoForest_Outlier                 int64
LOF_Outlier                       int64
Anomaly_Flag                      int64
Claim_Auto                         bool
Claim_Home                         bool
Claim_Life                         bool
Claim_Medical                      bool
Policy_Issuance_Date     datetime64[ns]
dtype: object


In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

# Convert Claim_Date to datetime
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"], errors='coerce')

# Drop datetime columns and non-relevant IDs before training
columns_to_drop = ['Fraud_Label', 'Claim_ID', 'Policy_Issuance_Date', 'Claim_Date', 'Customer_ID']  # Added Customer_ID to the list of columns to drop

# Ensure Policyholder_ID exists before dropping
if 'Policyholder_ID' in df.columns:
    columns_to_drop.append('Policyholder_ID')

X = df.drop(columns=columns_to_drop)

# Target (Fraud Label)
y = df['Fraud_Label']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale only numeric features
# Select only numeric columns for scaling
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols]) # Scale only numeric features
X_test_scaled = scaler.transform(X_test[numeric_cols]) # Scale only numeric features

# 1️⃣ **Train Random Forest**
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)  # Use the unscaled data for Random Forest
rf_probs = rf_model.predict_proba(X_test)[:, 1]  # Get fraud probabilities

# 2️⃣ **Train Neural Network**
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=500, random_state=42)
nn_model.fit(X_train_scaled, y_train)  # Use scaled data for Neural Network
nn_probs = nn_model.predict_proba(X_test_scaled)[:, 1]  # Get fraud probabilities

# 3️⃣ **Ensemble: Compute Final Fraud Score**
df_test = X_test.copy()
df_test["Fraud_Score"] = (rf_probs + nn_probs) / 2  # Average both scores
df_test["Actual_Label"] = y_test

# Evaluate performance with error handling
try:
    auc_score = roc_auc_score(y_test, df_test["Fraud_Score"])
    print(f"AUC-ROC Score (Ensemble): {auc_score:.4f}")
except ValueError:
    print("Error computing ROC-AUC Score. Check data.")

# Display top suspicious claims
df_test.sort_values("Fraud_Score", ascending=False).head(5)

AUC-ROC Score (Ensemble): 1.0000


Unnamed: 0,Claim_Amount,Suspicious_Flags,annual_income,Claim_to_Income_Ratio,Days_Since_Issuance,Elliptic_Outlier,IsoForest_Outlier,LOF_Outlier,Anomaly_Flag,Claim_Auto,Claim_Home,Claim_Life,Claim_Medical,Fraud_Score,Actual_Label
86,9041.66,1,31104.11,0.29069,10,1,1,0,1,True,False,False,False,0.999993,1
440,7592.83,0,35550.24,0.21358,222,1,1,0,1,False,False,False,True,0.999991,1
76,9856.96,1,45689.09,0.21574,281,1,1,0,1,False,False,True,False,0.997492,1
316,9563.79,0,31877.96,0.300013,18,1,1,0,1,False,True,False,False,0.994988,1
323,202.83,0,121949.34,0.001663,353,0,1,1,1,True,False,False,False,0.992496,1


In [41]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Convert fraud scores into binary predictions (Threshold = 0.5)
df_test["Predicted_Label"] = (df_test["Fraud_Score"] >= 0.5).astype(int)

# Compute Evaluation Metrics
accuracy = accuracy_score(df_test["Actual_Label"], df_test["Predicted_Label"])
f1 = f1_score(df_test["Actual_Label"], df_test["Predicted_Label"])
recall = recall_score(df_test["Actual_Label"], df_test["Predicted_Label"])

# Print Results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 1.0000
F1 Score: 1.0000
Recall: 1.0000


In [42]:
import pickle

# Save the best model using pickle - Replace 'model' with the desired model
# Options: rf_model, nn_model, or create an ensemble
with open("fraudulent_claims_model.pkl", "wb") as file:
    # Example: Saving the Random Forest model
    pickle.dump(rf_model, file)
    # or
    # pickle.dump(nn_model, file) # To save the Neural Network