In [1]:
import pandas as pd
import numpy as np
import random

# ---------------------------------------------------------------------------
# Synthetic insurance-customer dataset
#
# Every column below is drawn from NumPy's legacy *global* generator, which is
# stateful: the draws must stay in exactly this order for the seeded output to
# be reproducible. The stdlib `random` generator is seeded as well, although
# nothing in the code shown here draws from it.
# ---------------------------------------------------------------------------
np.random.seed(42)
random.seed(42)

# How many customer rows to synthesize
n_customers = 1000

# Customer_ID: "CUST_" followed by a 1-based, zero-padded 6-digit counter
customer_ids = [f"CUST_{number:06d}" for number in range(1, n_customers + 1)]

# Age: Normal(mean=45, sd=15), limited to [18, 80], then truncated to whole years
ages = np.random.normal(45, 15, n_customers).clip(18, 80).astype(int)

# Annual_Income: rises linearly with age (30000 at age 18, +800 per extra year)
# plus Gaussian noise (sd=15000). Note the relationship is monotonic in age;
# there is no middle-age peak in this formula.
income_noise = np.random.normal(0, 15000, n_customers)
base_income = 30000 + (ages - 18) * 800 + income_noise
# A random 10% of customers (sampled without replacement, so indices are
# unique) receive an extra 50k-150k on top to create a high-earner tail.
high_earner_indices = np.random.choice(n_customers, size=int(n_customers * 0.1), replace=False)
high_earner_bonus = np.random.uniform(50000, 150000, len(high_earner_indices))
base_income[high_earner_indices] += high_earner_bonus
annual_income = base_income.clip(25000, 300000).astype(int)

# Policy_Count: 1 + 3 * (min-max normalized income) + Gaussian noise (sd=0.5),
# limited to [1, 8] and truncated. The deterministic part tops out at 4, so the
# upper limit of 8 is a safety bound rather than a value the formula targets.
income_floor = annual_income.min()
income_span = annual_income.max() - income_floor
income_normalized = (annual_income - income_floor) / income_span
policy_noise = np.random.normal(0, 0.5, n_customers)
policy_count_prob = 1 + income_normalized * 3 + policy_noise
policy_count = policy_count_prob.clip(1, 8).astype(int)

# Total_Premium_Paid: per-policy premium of 500 plus 1000 per 100k of income,
# times the number of policies, times a uniform +/-20% jitter; 2 decimals.
base_premium_per_policy = 500 + (annual_income / 100000) * 1000
premium_jitter = np.random.uniform(0.8, 1.2, n_customers)
total_premium = (policy_count * base_premium_per_policy * premium_jitter).round(2)

# Claim_Frequency: weighted sum of normalized age (x1.5) and normalized policy
# count (x1.0) plus exponential noise (scale=0.8), limited to [0, 5], truncated.
age_factor = (ages - 18) / 62  # 62 == 80 - 18, maps the age range onto [0, 1]
policy_factor = (policy_count - 1) / 7  # 7 == 8 - 1, maps the policy range onto [0, 1]
claim_noise = np.random.exponential(0.8, n_customers)
claim_base = age_factor * 1.5 + policy_factor * 1.0 + claim_noise
claim_frequency = claim_base.clip(0, 5).astype(int)

# Policy_Upgrades: 2 * normalized income plus exponential noise (scale=0.3),
# limited to [0, 3], truncated. Income is the only driver here; the dataset has
# no tenure variable.
upgrade_noise = np.random.exponential(0.3, n_customers)
upgrade_prob = income_normalized * 2 + upgrade_noise
policy_upgrades = upgrade_prob.clip(0, 3).astype(int)

# Assemble the columns, in output order, into the final frame
column_data = [
    ('Customer_ID', customer_ids),
    ('Age', ages),
    ('Annual_Income', annual_income),
    ('Policy_Count', policy_count),
    ('Total_Premium_Paid', total_premium),
    ('Claim_Frequency', claim_frequency),
    ('Policy_Upgrades', policy_upgrades),
]
dataset = pd.DataFrame(dict(column_data))

# Summarize the generated dataset: shape, schema, a 10-row preview, and
# descriptive statistics for the numeric columns.
print("Dataset Shape:", dataset.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
dataset.info()
print("\nFirst 10 rows:")
print(dataset.head(10))
print("\nDataset Statistics:")
print(dataset.describe())

# Pairwise correlations (pandas' default method) across numeric columns only;
# the dtype filter drops non-numeric columns such as the string Customer_ID.
print("\nCorrelation Matrix:")
correlation_matrix = dataset.select_dtypes(include=[np.number]).corr()
print(correlation_matrix.round(3))

# Persist the dataset without the row index. The path is relative, so the file
# lands in the current working directory. A single variable feeds both the
# write and the confirmation message so the two cannot drift apart.
output_csv_path = 'customer_segmentation_dataset.csv'
dataset.to_csv(output_csv_path, index=False)
print(f"\nDataset saved as '{output_csv_path}'")

# Preview up to three example rows for a few hand-picked customer segments.
banner = "=" * 50
print("\n" + banner)
print("SAMPLE CUSTOMER PROFILES")
print(banner)

# Row filters for each segment
is_high_income = dataset['Annual_Income'] > 100000   # income strictly above 100k
is_frequent_claimer = dataset['Claim_Frequency'] >= 3  # three or more claims
is_young = dataset['Age'] <= 30                       # aged 30 or under

# First three matching rows per segment, in dataset order
high_value = dataset.loc[is_high_income].head(3)
frequent_claimers = dataset.loc[is_frequent_claimer].head(3)
young_customers = dataset.loc[is_young].head(3)

# Print each segment under its heading, omitting the row index
for heading, profile in (
    ("\nHigh-Value Customers:", high_value),
    ("\nFrequent Claimers:", frequent_claimers),
    ("\nYoung Customers:", young_customers),
):
    print(heading)
    print(profile.to_string(index=False))

# Section banner for clustering-analysis starter code.
# NOTE(review): only the banner is printed here; no starter code follows in the
# portion of the cell shown - confirm whether it was meant to be included.
print("\n" + banner)
print("CLUSTERING ANALYSIS STARTER CODE")
print(banner)


Dataset Shape: (1000, 7)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer_ID         1000 non-null   object 
 1   Age                 1000 non-null   int64  
 2   Annual_Income       1000 non-null   int64  
 3   Policy_Count        1000 non-null   int64  
 4   Total_Premium_Paid  1000 non-null   float64
 5   Claim_Frequency     1000 non-null   int64  
 6   Policy_Upgrades     1000 non-null   int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 54.8+ KB
None

First 10 rows:
   Customer_ID  Age  Annual_Income  Policy_Count  Total_Premium_Paid  \
0  CUST_000001   52          78190             2             2459.48   
1  CUST_000002   42          63069             1              907.63   
2  CUST_000003   54          59694             2             2214.42   
3  CUST_000004   67          59495        