In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Number of rows
n_rows = 9000

# Generate Policy IDs
policy_ids = [f'POL{str(i).zfill(6)}' for i in range(1, n_rows + 1)]

# Generate Customer Ages (18-80)
customer_ages = np.random.randint(18, 81, size=n_rows)

# Generate Gender (M/F)
genders = np.random.choice(['M', 'F'], size=n_rows)

# Generate Policy Types
policy_types = np.random.choice(['Auto', 'Home', 'Health', 'Life', 'Travel'], 
                               size=n_rows, 
                               p=[0.4, 0.25, 0.2, 0.1, 0.05])

# Generate Annual Income (20,000 - 200,000)
# Make it right-skewed to be more realistic
annual_incomes = np.random.lognormal(mean=11, sigma=0.5, size=n_rows).astype(int)
annual_incomes = np.clip(annual_incomes, 20000, 200000)

# Generate Claim History (0-5)
# Most people have 0 or 1 claims, fewer have multiple claims
claim_histories = np.random.choice([0, 1, 2, 3, 4, 5], 
                                  size=n_rows, 
                                  p=[0.6, 0.25, 0.08, 0.04, 0.02, 0.01])

# Generate Fraudulent Claim (0/1)
# Make fraud rare (about 2% of claims)
fraudulent_claims = np.zeros(n_rows)
claim_indices = np.where(claim_histories > 0)[0]
fraud_count = int(len(claim_indices) * 0.02)
fraud_indices = np.random.choice(claim_indices, size=fraud_count, replace=False)
fraudulent_claims[fraud_indices] = 1

# Generate Premium Amount (500-5,000)
# Make it depend on age, policy type, and claim history
premium_amounts = np.zeros(n_rows)
for i in range(n_rows):
    base_premium = 0
    
    # Policy type factor
    if policy_types[i] == 'Auto':
        base_premium = 1200
    elif policy_types[i] == 'Home':
        base_premium = 1000
    elif policy_types[i] == 'Health':
        base_premium = 2500
    elif policy_types[i] == 'Life':
        base_premium = 800
    else:  # Travel
        base_premium = 400
    
    # Age factor (U-shaped curve - higher for young and old)
    age_factor = 1.0
    if customer_ages[i] < 25:
        age_factor = 1.5
    elif customer_ages[i] > 65:
        age_factor = 1.3
    
    # Claim history factor
    claim_factor = 1.0 + (claim_histories[i] * 0.15)
    
    # Calculate premium with some randomness
    premium_amounts[i] = base_premium * age_factor * claim_factor * np.random.uniform(0.85, 1.15)

premium_amounts = np.round(premium_amounts, 2)

# Generate Claim Amount (0-50,000)
# Only people with claims have claim amounts
claim_amounts = np.zeros(n_rows)
for i in range(n_rows):
    if claim_histories[i] > 0:
        # Base on policy type
        if policy_types[i] == 'Auto':
            mean_claim = 5000
            std_claim = 3000
        elif policy_types[i] == 'Home':
            mean_claim = 8000
            std_claim = 5000
        elif policy_types[i] == 'Health':
            mean_claim = 12000
            std_claim = 8000
        elif policy_types[i] == 'Life':
            mean_claim = 25000
            std_claim = 15000
        else:  # Travel
            mean_claim = 2000
            std_claim = 1500
        
        # Generate claim amount with lognormal distribution for right skew
        claim_amount = np.random.lognormal(mean=np.log(mean_claim), sigma=0.7)
        
        # Fraudulent claims tend to be higher
        if fraudulent_claims[i] == 1:
            claim_amount *= np.random.uniform(1.2, 2.0)
        
        claim_amounts[i] = claim_amount

# Round claim amounts
claim_amounts = np.round(claim_amounts, 2)

# Generate Risk Score (categorical: 'Low', 'Medium', 'High')
# Based on age, claim history, and income
risk_scores_numeric = np.zeros(n_rows)
for i in range(n_rows):
    # Age risk (higher for young drivers and very old)
    age_risk = 0
    if customer_ages[i] < 25:
        age_risk = 3
    elif customer_ages[i] < 40:
        age_risk = 1
    elif customer_ages[i] < 65:
        age_risk = 0
    else:
        age_risk = 2
    
    # Claim history risk
    claim_risk = claim_histories[i] * 1.5
    
    # Income risk (higher for lower income)
    income_risk = 0
    if annual_incomes[i] < 40000:
        income_risk = 2
    elif annual_incomes[i] < 80000:
        income_risk = 1
    
    # Combine risks with some randomness
    risk_scores_numeric[i] = age_risk + claim_risk + income_risk + np.random.uniform(-1, 1)

# Convert numeric risk scores to categories
risk_score_categories = pd.qcut(risk_scores_numeric, q=[0, 0.4, 0.75, 1.0], labels=['Low', 'Medium', 'High'])

# Create the DataFrame
insurance_df = pd.DataFrame({
    'Policy_ID': policy_ids,
    'Customer_Age': customer_ages,
    'Gender': genders,
    'Policy_Type': policy_types,
    'Annual_Income': annual_incomes,
    'Claim_History': claim_histories,
    'Fraudulent_Claim': fraudulent_claims.astype(int),
    'Premium_Amount': premium_amounts,
    'Claim_Amount': claim_amounts,
    'Risk_Score': risk_score_categories
})

# Print dataset statistics
print("Synthetic Insurance Dataset Statistics:")
print(f"Total rows: {len(insurance_df)}")
print("\nSample rows:")
print(insurance_df.head())

print("\nColumn statistics:")
print(insurance_df.describe(include='all').T)

print("\nValue counts for categorical columns:")
print("Policy Types:")
print(insurance_df['Policy_Type'].value_counts())
print("\nRisk Score:")
print(insurance_df['Risk_Score'].value_counts())
print("\nFraudulent Claims:")
print(insurance_df['Fraudulent_Claim'].value_counts())

# Save to CSV
insurance_df.to_csv('synthetic_insurance_data.csv', index=False)
print("\nDataset saved to 'synthetic_insurance_data.csv'")

# Return the first 10 rows to display
insurance_df.head(10)

Synthetic Insurance Dataset Statistics:
Total rows: 9000

Sample rows:
   Policy_ID  Customer_Age Gender Policy_Type  Annual_Income  Claim_History  \
0  POL000001            56      M        Life         111642              0   
1  POL000002            69      M        Auto          42853              0   
2  POL000003            46      F        Life         135705              1   
3  POL000004            32      M        Auto          43703              0   
4  POL000005            60      F        Home          71251              1   

   Fraudulent_Claim  Premium_Amount  Claim_Amount Risk_Score  
0                 0          828.63          0.00        Low  
1                 0         1767.80          0.00     Medium  
2                 0          981.47      63156.40        Low  
3                 0         1097.23          0.00        Low  
4                 0         1312.54       4500.43     Medium  

Column statistics:
                   count unique        top  freq        

Unnamed: 0,Policy_ID,Customer_Age,Gender,Policy_Type,Annual_Income,Claim_History,Fraudulent_Claim,Premium_Amount,Claim_Amount,Risk_Score
0,POL000001,56,M,Life,111642,0,0,828.63,0.0,Low
1,POL000002,69,M,Auto,42853,0,0,1767.8,0.0,Medium
2,POL000003,46,F,Life,135705,1,0,981.47,63156.4,Low
3,POL000004,32,M,Auto,43703,0,0,1097.23,0.0,Low
4,POL000005,60,F,Home,71251,1,0,1312.54,4500.43,Medium
5,POL000006,25,F,Auto,54318,1,0,1300.13,19727.51,Medium
6,POL000007,78,M,Health,59163,0,0,3188.2,0.0,Medium
7,POL000008,38,F,Health,83796,0,0,2498.59,0.0,Low
8,POL000009,56,M,Health,29166,0,0,2868.04,0.0,Low
9,POL000010,75,F,Home,39522,1,0,1301.65,12032.1,High
