In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import uuid

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Define the number of rows
num_rows = 10000  # Generating 10,000 rows to exceed the 9,000 requirement

# Function to generate random dates within a range
def random_date(start_date, end_date):
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_days)

# Generate customer IDs (unique for some customers, repeated for others to simulate multiple claims)
num_customers = 4000
customer_ids = [f"CUST_{str(uuid.uuid4())[:8]}" for _ in range(num_customers)]

# Define claim types
claim_types = ["Auto Collision", "Property Damage", "Medical", "Liability", "Natural Disaster", 
               "Theft", "Fire", "Personal Injury", "Business Interruption", "Travel"]

# Generate the synthetic data
data = {
    "claim_id": [f"CLM_{str(uuid.uuid4())[:10]}" for _ in range(num_rows)],
    "cust_id": [random.choice(customer_ids) for _ in range(num_rows)],
    "policy_date": [],
    "claim_date": [],
    "claim_amount": [],
    "income": [],
    "claim_type": [random.choice(claim_types) for _ in range(num_rows)],
    "suspicious_flag": [],
    "fraud_label": []
}

# Date ranges
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 5, 15)  # Current date in the provided context is May 21, 2025

# Generate date pairs and other dependent features
for i in range(num_rows):
    # Policy date is always before claim date
    policy_date = random_date(start_date, end_date - timedelta(days=30))
    
    # Most claims happen within 1-3 years of policy start, with some outliers
    if random.random() < 0.9:  # 90% of claims
        max_claim_delay = timedelta(days=365 * 3)  # Up to 3 years
        claim_date = random_date(policy_date, min(policy_date + max_claim_delay, end_date))
    else:  # 10% of claims happen very soon after policy (potentially suspicious)
        claim_date = random_date(policy_date, policy_date + timedelta(days=60))
    
    data["policy_date"].append(policy_date.strftime("%Y-%m-%d"))
    data["claim_date"].append(claim_date.strftime("%Y-%m-%d"))
    
    # Generate claim amounts - different distributions based on claim type
    claim_type = data["claim_type"][i]
    if claim_type in ["Auto Collision", "Property Damage"]:
        claim_amount = np.random.gamma(shape=2.0, scale=2000, size=1)[0]
    elif claim_type in ["Medical", "Personal Injury"]:
        claim_amount = np.random.gamma(shape=3.0, scale=5000, size=1)[0]
    elif claim_type in ["Natural Disaster", "Fire"]:
        claim_amount = np.random.gamma(shape=4.0, scale=10000, size=1)[0]
    else:
        claim_amount = np.random.gamma(shape=1.5, scale=1500, size=1)[0]
    
    # Round to 2 decimal places
    data["claim_amount"].append(round(claim_amount, 2))
    
    # Income - log-normal distribution to simulate realistic income distribution
    income = np.random.lognormal(mean=11.0, sigma=0.6, size=1)[0]  # centered around ~$60,000
    data["income"].append(round(income, 2))

# Create suspicious flags based on patterns and thresholds
suspicious_flags = []
fraud_labels = []

for i in range(num_rows):
    # Calculate time difference between policy and claim in days
    policy_date = datetime.strptime(data["policy_date"][i], "%Y-%m-%d")
    claim_date = datetime.strptime(data["claim_date"][i], "%Y-%m-%d")
    days_diff = (claim_date - policy_date).days
    
    # Initial suspicion score
    suspicion_score = 0
    
    # Factors that increase suspicion:
    
    # 1. Very quick claim after policy initiation
    if days_diff < 30:
        suspicion_score += 3
    elif days_diff < 90:
        suspicion_score += 1
    
    # 2. High claim amount relative to income
    income_ratio = data["claim_amount"][i] / (data["income"][i] / 12)  # Monthly income ratio
    if income_ratio > 1.0:
        suspicion_score += 2
    elif income_ratio > 0.5:
        suspicion_score += 1
    
    # 3. Customer with multiple claims
    cust_id = data["cust_id"][i]
    if data["cust_id"].count(cust_id) > 3:
        suspicion_score += 2
    elif data["cust_id"].count(cust_id) > 1:
        suspicion_score += 1
    
    # 4. Round claim amounts (potentially made up)
    if data["claim_amount"][i] % 100 == 0:
        suspicion_score += 1
    
    # 5. Claim type specific patterns
    if data["claim_type"][i] == "Theft" and data["claim_amount"][i] > 10000:
        suspicion_score += 2
    elif data["claim_type"][i] == "Fire" and days_diff < 180:
        suspicion_score += 2
    
    # Assign suspicious flag
    if suspicion_score >= 4:
        suspicious_flags.append(1)
    else:
        suspicious_flags.append(0)
    
    # Assign fraud label
    # Not all suspicious claims are fraud, and some non-suspicious claims are fraud
    if suspicious_flags[i] == 1:
        # 60% of highly suspicious claims are actually fraud
        fraud_labels.append(1 if random.random() < 0.6 else 0)
    else:
        # 5% of non-suspicious claims are still fraud (harder to detect)
        fraud_labels.append(1 if random.random() < 0.05 else 0)

data["suspicious_flag"] = suspicious_flags
data["fraud_label"] = fraud_labels

# Create DataFrame
df = pd.DataFrame(data)

# Calculate some statistics
total_claims = len(df)
fraud_claims = df["fraud_label"].sum()
fraud_percentage = fraud_claims / total_claims * 100
suspicious_claims = df["suspicious_flag"].sum()
suspicious_percentage = suspicious_claims / total_claims * 100

print(f"Dataset created with {total_claims} claims")
print(f"Fraud claims: {fraud_claims} ({fraud_percentage:.2f}%)")
print(f"Suspicious claims: {suspicious_claims} ({suspicious_percentage:.2f}%)")

# Display the first few rows
print("\nSample data:")
print(df.head())

# Save to CSV
df.to_csv("insurance_claims_fraud_dataset.csv", index=False)
print("\nDataset saved to 'insurance_claims_fraud_dataset.csv'")

# Additional dataset information
print("\nDataset Summary:")
print(df.describe())

# Distribution of claim types
print("\nClaim Type Distribution:")
print(df["claim_type"].value_counts())

# Correlation between suspicious flags and fraud
print("\nCorrelation between suspicious flag and fraud:")
print(df[["suspicious_flag", "fraud_label"]].corr())

Dataset created with 10000 claims
Fraud claims: 2525 (25.25%)
Suspicious claims: 3659 (36.59%)

Sample data:
         claim_id        cust_id policy_date  claim_date  claim_amount  \
0  CLM_82344a57-5  CUST_024586e6  2024-08-07  2024-10-29       2684.43   
1  CLM_430c9691-b  CUST_2d8b4014  2022-02-03  2024-03-04       6159.45   
2  CLM_2f7b6081-f  CUST_b0b93bad  2024-01-27  2024-04-10       9299.43   
3  CLM_e284e732-b  CUST_825e5547  2021-10-18  2023-09-28      26630.52   
4  CLM_88cd42bc-7  CUST_de61c4a3  2022-04-13  2024-07-13      15408.13   

     income        claim_type  suspicious_flag  fraud_label  
0  55107.52             Theft                1            0  
1  72499.96   Personal Injury                0            0  
2  94888.79   Property Damage                1            1  
3  43691.03  Natural Disaster                0            0  
4  18996.91           Medical                0            0  

Dataset saved to 'insurance_claims_fraud_dataset.csv'

Dataset Summary:
 