# Bajaj Allianz Life Insurance Data Generator
This notebook simulates realistic life insurance customer data based on a predefined product mix strategy.

## Objective
- Generate synthetic customer data for life insurance modeling.
- Incorporate business logic for product and policy assignment.
- Create a dataset suitable for Exploratory Data Analysis and Recommendation Modeling.

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Seed for reproducibility
random.seed(42)
np.random.seed(42)

In [3]:
# Define sample sizes and value pools
num_records = 100000

age_groups = ['18–25', '26–35', '36–45', '46–60']
genders = ['Male', 'Female']
marital_status = ['Single', 'Married', 'Widowed']
occupations = ['Engineer', 'Farmer', 'Clerk', 'Self-employed', 'Teacher', 'Business Owner']
education_levels = ['High School', 'Graduate', 'Post-Graduate', 'Diploma']
locations = ['Rural', 'Semi-urban', 'Urban']
regions = ['North', 'South', 'East', 'West', 'Central']
incomes = ['<3L', '3–6L', '6–10L', '10L+']
premium_modes = ['Monthly', 'Quarterly', 'Half-yearly', 'Annually']
products = ['Smart Protect Goal', 'Save Assure', 'Goal Assure', 'Child Advantage Plan', 'Income Assure', 'Future Gain']
policy_types = ['Term Plan', 'Endowment', 'Money Back', 'Child Plan', 'ULIP']
policy_statuses = ['Active', 'Lapsed', 'Matured']
claim_history = ['Yes', 'No']
children_flag = ['Yes', 'No']

In [4]:
# Helper functions
def generate_income(income_category):
    return {
        '<3L': np.random.randint(100001, 300000),
        '3–6L': np.random.randint(300001, 600000),
        '6–10L': np.random.randint(600001, 1000000),
        '10L+': np.random.randint(1000001, 2500000)
    }[income_category]

def generate_sum_assured(product):
    return {
        'Smart Protect Goal': np.random.randint(500000, 1500000),
        'Save Assure': np.random.randint(200000, 1000000),
        'Goal Assure': np.random.randint(400000, 1200000),
        'Child Advantage Plan': np.random.randint(300000, 1000000),
        'Income Assure': np.random.randint(250000, 800000),
        'Future Gain': np.random.randint(500000, 1200000)
    }[product]

def choose_policy_type(product):
    return {
        'Smart Protect Goal': 'Term Plan',
        'Save Assure': 'Endowment',
        'Goal Assure': 'ULIP',
        'Child Advantage Plan': 'Child Plan',
        'Income Assure': 'Money Back',
        'Future Gain': 'ULIP'
    }[product]

## Data Generation Logic

In [5]:
data = []

for _ in range(num_records):
    age = random.choice(age_groups)
    gender = random.choice(genders)
    marital = random.choice(marital_status)
    occupation = random.choice(occupations)
    edu = random.choice(education_levels)
    loc = random.choice(locations)
    reg = random.choice(regions)
    income_cat = random.choice(incomes)
    income_val = generate_income(income_cat)
    product = random.choice(products)
    policy_type = choose_policy_type(product)
    mode = random.choice(premium_modes)
    benefit_term = random.randint(10, 30)
    premium_term = random.randint(5, benefit_term)

    if premium_term < 10:
        premium_term_cat = 'short'
    elif premium_term <= 20:
        premium_term_cat = 'medium'
    else:
        premium_term_cat = 'long'

    premium_amt = round(np.random.uniform(500, 150000), 2)
    sum_assured = generate_sum_assured(product)
    purchase_years_ago = random.randint(1, 10)
    maturity_left = max(0, benefit_term - purchase_years_ago)
    dependents = random.randint(0, 5)
    claim = random.choice(claim_history)
    policy_status = random.choice(policy_statuses)
    children = random.choice(children_flag)

    data.append([
        age, gender, marital, occupation, edu, loc, reg,
        income_cat, income_val, mode, product, policy_type, policy_status,
        benefit_term, premium_term, premium_term_cat, premium_amt, sum_assured,
        purchase_years_ago, maturity_left, dependents, claim, children
    ])

## Create and Export the Dataset

In [6]:
columns = [
    "Age_Group", "Gender", "Marital_Status", "Occupation", "Education_Level", "Location", "Region",
    "Income", "Declared_Income", "Mode_of_Premium_Payment", "Product", "Policy_Type", "Policy_Status",
    "Benefit_Term", "Premium_Term", "Premium_Term_Category", "Premium_per_Payment", "Sum_Assured",
    "Date_of_Purchased_(years_ago)", "Time_to_Maturity", "Number_of_Dependents", "Claim_History", "Children"
]

df = pd.DataFrame(data, columns=columns)
df.to_csv("Bajaj_Life_Insurance_Updated.csv", index=False)
df.head()

Unnamed: 0,Age_Group,Gender,Marital_Status,Occupation,Education_Level,Location,Region,Income,Declared_Income,Mode_of_Premium_Payment,...,Benefit_Term,Premium_Term,Premium_Term_Category,Premium_per_Payment,Sum_Assured,Date_of_Purchased_(years_ago),Time_to_Maturity,Number_of_Dependents,Claim_History,Children
0,18–25,Male,Widowed,Clerk,Graduate,Rural,South,<3L,221959,Monthly,...,28,18,medium,109933.09,587498,1,27,0,Yes,Yes
1,18–25,Male,Widowed,Business Owner,Diploma,Rural,West,6–10L,791336,Quarterly,...,23,15,medium,97807.83,564820,5,18,1,Yes,Yes
2,18–25,Female,Single,Clerk,Post-Graduate,Urban,East,<3L,284780,Annually,...,27,8,short,60279.22,565725,7,20,0,No,No
3,26–35,Male,Single,Business Owner,Graduate,Semi-urban,North,3–6L,384655,Annually,...,18,12,medium,77378.05,1055839,6,12,1,No,Yes
4,36–45,Male,Widowed,Business Owner,Graduate,Urban,South,3–6L,441700,Annually,...,18,15,medium,84711.59,368148,9,9,1,No,Yes


In [None]:
def assign_policy_type(age_group):
    if age_group == "18-25":
        return random.choice(["Term Plan", "ULIP"])
    elif age_group == "26-35":
        return random.choice(["ULIP", "Child Plan"])
    elif age_group == "36-45":
        return random.choice(["ULIP", "Endowment Plan"])
    else:
        return random.choice(["Endowment Plan", "Retirement Plan"])
