Code for generating insurance data, XYZ Insurance Company

Revision 3 - Final Code

In [7]:
from faker import Faker
import random
import pandas as pd
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker()

# -----------------------------
# 1. CONFIGURATION & PARAMETERS
# -----------------------------
NUM_CUSTOMERS = 10000
POLICY_TYPES = ['Health', 'Motor', 'Life']
CLAIM_STATUSES = ['Approved', 'Rejected', 'Pending']
CHANNELS = ['Phone', 'Email', 'In-Person', 'Chatbot', 'Social Media']

# Define major counties and additional counties in Kenya
major_counties = ['Nairobi', 'Mombasa', 'Kisumu', 'Eldoret', 'Nakuru']
other_counties = [
    'Kiambu', 'Machakos', 'Kericho', 'Nyeri', 'Meru',
    'Bungoma', 'Kakamega', 'Kisii', 'Trans Nzoia', 'Isiolo',
    'Turkana', 'Narok', 'Uasin Gishu', 'Vihiga', 'Murang\'a'
]

def choose_county():
    """Return a county in Kenya, with a 70% chance for a major county and 30% for a random one."""
    if random.random() < 0.7:
        return random.choice(major_counties)
    else:
        return random.choice(other_counties)

# Reason dictionaries for more detailed communication logs
REASONS_DICT = {
    'Complaint': [
        "Billing error in latest statement",
        "Unsatisfactory claim process",
        "Long wait time on phone support",
        "Incorrect personal details on file",
        "Policy cancellation issue"
    ],
    'Feedback': [
        "Appreciation for quick claim settlement",
        "Request for better mobile app experience",
        "Suggestion for new policy feature",
        "Overall positive feedback on service",
        "Concern about premium increase"
    ],
    'Call Log': [
        "General inquiry about policy coverage",
        "Follow-up call on a pending claim",
        "Inquiry about adding a family member to the policy",
        "Question regarding renewal process",
        "Verification of updated contact details"
    ]
}

# Simple sentiment mapping for demonstration
SENTIMENT_OPTIONS = {
    'Complaint': ['Negative', 'Negative', 'Neutral'],
    'Feedback': ['Positive', 'Neutral', 'Negative'],
    'Call Log': ['Neutral', 'Neutral', 'Positive', 'Negative']
}

# Possible outcomes for any communication
OUTCOME_OPTIONS = ['Resolved', 'Escalated', 'Pending Info', 'Acknowledged']

# -----------------------------
# 2. HELPER FUNCTIONS
# -----------------------------
def generate_message_and_sentiment():
    """
    Return a random reason category, message text, and sentiment.
    E.g., "Complaint", "Billing error in latest statement", "Negative"
    """
    reason_category = random.choice(list(REASONS_DICT.keys()))
    message_text = random.choice(REASONS_DICT[reason_category])
    sentiment = random.choice(SENTIMENT_OPTIONS[reason_category])
    return reason_category, message_text, sentiment

# -----------------------------
# 3. GENERATE CUSTOMERS
# -----------------------------
customer_data = []
for i in range(NUM_CUSTOMERS):
    customer_id = i + 1
    first_name = fake.first_name()
    last_name = fake.last_name()
    age = random.randint(18, 80)
    gender = random.choice(['M', 'F'])
    region = choose_county()  # Use the Kenyan county generator
    vip_status = random.choice([True, False])

    customer_data.append([
        customer_id,
        first_name,
        last_name,
        age,
        gender,
        region,
        vip_status
    ])

df_customers = pd.DataFrame(customer_data, columns=[
    'customer_id',
    'first_name',
    'last_name',
    'age',
    'gender',
    'region',
    'vip_status'
])

# -----------------------------
# 4. GENERATE POLICIES
# -----------------------------
policy_data = []
policy_id_counter = 1

for c in range(NUM_CUSTOMERS):
    # Each customer can have 1 to 3 policies
    for _ in range(random.randint(1, 3)):
        policy_type = random.choice(POLICY_TYPES)
        premium = random.randint(500, 5000)
        sum_insured = premium * 100  # Simple multiplier for demonstration
        start_date = fake.date_between(start_date='-5y', end_date='today')
        end_date = start_date + timedelta(days=random.randint(365, 365*5))  # 1 to 5 years

        policy_data.append([
            policy_id_counter,
            c + 1,         # customer_id
            policy_type,
            premium,
            sum_insured,
            start_date,
            end_date
        ])
        policy_id_counter += 1

df_policies = pd.DataFrame(policy_data, columns=[
    'policy_id',
    'customer_id',
    'policy_type',
    'premium',
    'sum_insured',
    'start_date',
    'end_date'
])

# -----------------------------
# 5. GENERATE CLAIMS
# -----------------------------
claim_data = []
claim_id_counter = 1

for _, row in df_policies.iterrows():
    # Probability of having 0 to 3 claims per policy
    num_claims = random.randint(0, 3)
    for _ in range(num_claims):
        policy_id = row['policy_id']
        customer_id = row['customer_id']  # We'll store this for direct mapping
        claim_date = fake.date_between(start_date=row['start_date'], end_date=row['end_date'])
        # Ensure claim_amount doesn't exceed sum_insured
        max_claim_amount = max(100, row['sum_insured'] // 2)
        claim_amount = random.randint(100, max_claim_amount)
        claim_status = random.choice(CLAIM_STATUSES)

        claim_data.append([
            claim_id_counter,
            policy_id,
            customer_id,
            claim_date,
            claim_amount,
            claim_status
        ])
        claim_id_counter += 1

df_claims = pd.DataFrame(claim_data, columns=[
    'claim_id',
    'policy_id',
    'customer_id',   # NEW: direct link to the customer
    'claim_date',
    'claim_amount',
    'claim_status'
])

# -----------------------------
# 6. GENERATE COMMUNICATIONS
#    (WITH OPTIONAL LINK TO A CLAIM)
# -----------------------------
comm_data = []
comm_id_counter = 1

# Precompute a dictionary of customer_id -> list of their claim_ids
customer_claim_map = {}
for _, claim_row in df_claims.iterrows():
    cust_id = claim_row['customer_id']
    if cust_id not in customer_claim_map:
        customer_claim_map[cust_id] = []
    customer_claim_map[cust_id].append(claim_row['claim_id'])

for customer_id in range(1, NUM_CUSTOMERS + 1):
    # Each customer can have 1 to 5 random communications
    for _ in range(random.randint(1, 5)):
        timestamp = fake.date_time_between(start_date='-2y', end_date='now')
        channel = random.choice(CHANNELS)
        reason_category, message_text, sentiment = generate_message_and_sentiment()
        outcome = random.choice(OUTCOME_OPTIONS)

        # Optionally link a communication to one of the customer's claims if reason is "Complaint" or "Call Log"
        related_claim_id = None
        user_claims = customer_claim_map.get(customer_id, [])
        if user_claims and reason_category in ["Complaint", "Call Log"]:
            # 70% chance to reference an actual claim
            if random.random() < 0.7:
                related_claim_id = random.choice(user_claims)

        comm_data.append([
            comm_id_counter,
            customer_id,
            timestamp,
            channel,
            reason_category,
            message_text,
            sentiment,
            outcome,
            related_claim_id  # new column linking to a claim (if applicable)
        ])
        comm_id_counter += 1

df_communications = pd.DataFrame(comm_data, columns=[
    'communication_id',
    'customer_id',
    'timestamp',
    'channel',
    'reason_category',
    'message_text',
    'sentiment',
    'outcome',
    'related_claim_id'
])

# -----------------------------
# 7. SAVE TO CSV
# -----------------------------
df_customers.to_csv('Customers Data.csv', index=False)
df_policies.to_csv('Policies Data.csv', index=False)
df_claims.to_csv('Claims Data.csv', index=False)
df_communications.to_csv('Communications Data.csv', index=False)

# Print sample outputs for verification
print("SAMPLE CUSTOMERS:\n", df_customers.head(5))
print("SAMPLE POLICIES:\n", df_policies.head(5))
print("SAMPLE CLAIMS:\n", df_claims.head(5))
print("SAMPLE COMMUNICATIONS:\n", df_communications.head(5))


SAMPLE CUSTOMERS:
    customer_id first_name last_name  age gender       region  vip_status
0            1     Brandi      Snow   69      F  Trans Nzoia       False
1            2       Jeff   Edwards   40      M         Meru       False
2            3     Sandra    Watson   53      F      Mombasa       False
3            4     Robert      Beck   80      M  Trans Nzoia        True
4            5    Kenneth       Kim   76      F      Nairobi        True
SAMPLE POLICIES:
    policy_id  customer_id policy_type  premium  sum_insured  start_date  \
0          1            1      Health     1946       194600  2021-07-31   
1          2            2      Health     4620       462000  2024-07-21   
2          3            3      Health     3515       351500  2022-09-27   
3          4            3       Motor     3990       399000  2024-04-02   
4          5            4      Health     4839       483900  2023-12-31   

     end_date  
0  2023-04-21  
1  2025-10-03  
2  2024-02-06  
3  2028-07