In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Setup
fake = Faker()
# Seed BOTH random sources. The generation code draws from NumPy
# (np.random.randint) and from the stdlib `random` module (random.choice,
# random.choices, random.randrange); seeding only NumPy leaves most columns
# different on every run.
np.random.seed(42)
random.seed(42)

# Total number of synthetic records to generate
# (60,000 overall → 15k for each of the four products).
n = 60_000

# Per-product settings. Each entry is a tuple of
#   (product name, policy type, candidate policy terms, premium rate)
# The premium rate is multiplied by the sum assured to get the annual premium.
products = [
    (
        "LIC Jeevan Umang",
        "Money Back",
        [15, 20, 25, 30],
        0.08,
    ),
    (
        "LIC Jeevan Lakshya",
        "Endowment",
        [13, 16, 20, 25],
        0.07,
    ),
    (
        "Tech Term",
        "Term Plan",
        [10, 20, 30, 40],
        0.04,
    ),
    (
        "New Endowment Plan",
        "Endowment",
        [12, 15, 20, 25],
        0.065,
    ),
]

# Categorical value pools sampled when building each synthetic record.

# Demographics
genders = ["Male", "Female"]
marital_statuses = ["Single", "Married", "Widowed"]
education_levels = ["High School", "Graduate", "Post-Graduate", "Diploma"]
occupations = [
    "Farmer",
    "Teacher",
    "Engineer",
    "Self-employed",
    "Clerk",
    "Business",
]

# Geography
location_types = ["Urban", "Semi-urban", "Rural"]
regions = ["North", "South", "East", "West", "Central"]

# Financial / policy attributes (income labels are the keys used by map_income)
income_groups = ["<3L", "3–6L", "6–10L", "10L+"]
payment_modes = ["Monthly", "Quarterly", "Half-yearly", "Annually"]
channels = ["Agent", "Online", "Direct branch", "Corporate"]

# Income Mapping Function
def map_income(group):
    """Draw a random integer income inside the band named by *group*.

    Args:
        group: One of the income-group labels '<3L', '3–6L', '6–10L', '10L+'.

    Returns:
        An integer sampled uniformly from the band's half-open range
        ``[low, high)`` via ``np.random.randint``.

    Raises:
        KeyError: If *group* is not a known income-group label.
    """
    # Store only the bounds and draw once. Building a dict of already-drawn
    # values would call np.random.randint for every band on each invocation,
    # wasting three draws and advancing the seeded stream four steps per call.
    income_bounds = {
        '<3L': (100000, 300000),
        '3–6L': (300000, 600000),
        '6–10L': (600000, 1000000),
        '10L+': (1000000, 2500000),
    }
    low, high = income_bounds[group]
    return np.random.randint(low, high)

# Premium Calculation Function
def calculate_premium(sa, mode, rate):
    """Return the premium due per installment, rounded to 2 decimal places.

    Args:
        sa: Sum assured.
        mode: Payment frequency; one of 'Monthly', 'Quarterly',
            'Half-yearly', 'Annually'.
        rate: Annual premium expressed as a fraction of the sum assured.

    Returns:
        The annual premium (``rate * sa``) divided by the number of
        installments per year for *mode*, rounded to two decimals.

    Raises:
        KeyError: If *mode* is not a recognised payment mode.
    """
    yearly_premium = rate * sa
    installments_per_year = {
        'Monthly': 12,
        'Quarterly': 4,
        'Half-yearly': 2,
        'Annually': 1,
    }
    installments = installments_per_year[mode]
    if installments == 1:
        # Annual payers owe the full yearly amount as-is (no division).
        per_payment = yearly_premium
    else:
        per_payment = yearly_premium / installments
    return round(per_payment, 2)

# Main Data Generation
# Builds one list per synthetic policy record. The order of values in each
# row must match the order of the `columns` list used to build the DataFrame.
data = []

# Split the requested total evenly across however many products are
# configured, instead of hard-coding the product count.
records_per_product = n // len(products) if products else 0

for product_name, policy_type, terms, rate in products:
    for _ in range(records_per_product):
        # --- Policyholder demographics ---
        age = np.random.randint(18, 61)  # upper bound exclusive → ages 18..60
        gender = random.choice(genders)
        occ = random.choice(occupations)
        edu = random.choice(education_levels)
        ms = random.choice(marital_statuses)
        deps = np.random.randint(0, 6)  # 0..5 dependents
        loc = random.choices(location_types, [0.4, 0.3, 0.3])[0]
        region = random.choice(regions)

        # --- Income ---
        income_group = random.choices(income_groups, [0.2, 0.35, 0.3, 0.15])[0]
        declared_income = map_income(income_group)

        # --- Policy timeline ---
        date_purchased = np.random.randint(1, 11)  # 1..10 (years ago)
        policy_term = random.choice(terms)
        # Only "Tech Term" pays premiums for the full policy term; every other
        # product stops paying 5 years before the policy term ends.
        premium_term = policy_term if product_name == "Tech Term" else policy_term - 5
        time_to_maturity = policy_term - date_purchased

        # --- Cover and premium ---
        sum_assured = random.randrange(200000, 2000001, 50000)  # 2L..20L in 50k steps
        mode = random.choice(payment_modes)
        premium = calculate_premium(sum_assured, mode, rate)

        # --- Status / servicing ---
        # NOTE(review): status is drawn independently of time_to_maturity, so a
        # row can be 'Matured' with years still remaining — confirm intended.
        status = random.choices(['Active', 'Lapsed', 'Matured'], [0.7, 0.2, 0.1])[0]
        claim = random.choices(['Yes', 'No'], [0.2, 0.8])[0]
        channel = random.choices(channels, [0.5, 0.2, 0.2, 0.1])[0]
        # np.random.randint's upper bound is exclusive; use 500000 so that
        # 499999 is a possible pincode (the full 4xxxxx range).
        pincode = np.random.randint(400000, 500000)

        data.append([
            product_name, policy_type, date_purchased, age, gender, occ, edu,
            ms, deps, loc, region, income_group, claim, status, channel,
            pincode, policy_term, premium_term, time_to_maturity, sum_assured,
            mode, premium, declared_income
        ])

# Column Names
# Headers for the output table, listed in the same order in which each
# record's values are appended to `data`.
columns = [
    # Product identification
    "Product",
    "Policy Type",
    "Date of Purchased (Years Ago)",
    # Policyholder profile
    "Age",
    "Gender",
    "Occupation",
    "Education Level",
    "Marital Status",
    "Number of Dependents",
    "Location Type",
    "Region",
    "Annual Income Group",
    # Servicing
    "Claim History (Yes/No)",
    "Policy Status",
    "Channel of Purchase",
    "Pincode",
    # Policy financials
    "Policy Term",
    "Premium Term",
    "Time to Maturity",
    "Sum Assured",
    "Mode of Premium Payment",
    "Premium Per Payment",
    "Declared Income",
]

# Create and Save DataFrame
# Keep the filename in one place so the confirmation message always names the
# file that was actually written (previously the message said
# 'LIC_Synthetic_Data.csv' while the file was 'Improved_LIC_Synthetic_Data.csv').
output_path = "Improved_LIC_Synthetic_Data.csv"
df = pd.DataFrame(data, columns=columns)
df.to_csv(output_path, index=False)  # index=False: omit the row-number column
print(f"✅ Data generated and saved as '{output_path}'")

✅ Data generated and saved as 'Improved_LIC_Synthetic_Data.csv'
