In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned dataset written out by Task 1 into the working frame `df`.
clean_csv_path = 'data/insurance_data_clean.csv'
df = pd.read_csv(clean_csv_path)

# Sanity check: report the (rows, columns) of what was loaded.
print("Dataset Shape:", df.shape)

In [None]:
# Claim indicator: 1 when the row's TotalClaims is positive, else 0.
# The mean of this column over any group is that group's claim frequency.
has_positive_claim = df['TotalClaims'] > 0
df['HasClaim'] = has_positive_claim.astype(int)

# Claim severity: the claim amount on rows that have a claim, NaN elsewhere,
# so averaging this column averages over claiming rows only.
# It is masked with the same "> 0" rule as HasClaim so the two metrics always
# agree: a zero or negative total is "no claim" and therefore has no severity.
df['ClaimSeverity'] = df['TotalClaims'].where(has_positive_claim)

# Margin: premium minus claims for each row
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

print("✅ Risk metrics created!")

In [None]:
# H₀: No risk differences across provinces

# Claim Frequency by Province (mean of the 0/1 HasClaim indicator per province)
freq_by_prov = df.groupby('Province')['HasClaim'].mean()
print("Claim Frequency by Province:")
print(freq_by_prov.sort_values(ascending=False).head())

# Claim Severity by Province (mean of the non-missing ClaimSeverity values)
severity_by_prov = df.groupby('Province')['ClaimSeverity'].mean()
print("\nClaim Severity by Province:")
print(severity_by_prov.sort_values(ascending=False).head())

# Welch's t-test on claim severity for one fixed pair of provinces.
# NOTE: the pair is hard-coded, not derived from the rankings printed above,
# and the test only compares these two provinces, not all provinces at once.
prov1, prov2 = 'Gauteng', 'Western Cape'
severity1 = df.loc[df['Province'] == prov1, 'ClaimSeverity'].dropna()
severity2 = df.loc[df['Province'] == prov2, 'ClaimSeverity'].dropna()

# Welch's test needs at least two observations per group; with fewer (including
# a province label that is absent from the data) the result is undefined, so
# record NaN explicitly instead of calling the test on degenerate input.
if min(len(severity1), len(severity2)) >= 2:
    t_stat, p_val = stats.ttest_ind(severity1, severity2, equal_var=False)
else:
    t_stat, p_val = np.nan, np.nan

print(f"\nT-test: {prov1} vs {prov2}")
print(f"p-value = {p_val:.4f}")
if np.isnan(p_val):
    # NaN compares False against 0.05, which would otherwise be reported as a
    # genuine "fail to reject" even though no valid test was performed.
    print(f"⚠️ Test inconclusive: not enough claims (n={len(severity1)} vs n={len(severity2)})")
elif p_val < 0.05:
    print("✅ Reject H₀: Significant risk difference between provinces")
else:
    print("❌ Fail to reject H₀")

In [None]:
# H₀: No risk difference between women and men

# Keep only rows whose Gender is exactly 'Male' or 'Female'
# (drops missing values and any other label).
df_gender = df[df['Gender'].isin(['Male', 'Female'])]

# Claim Frequency by Gender (mean of the 0/1 HasClaim indicator per gender)
freq_by_gender = df_gender.groupby('Gender')['HasClaim'].mean()
print("Claim Frequency by Gender:")
print(freq_by_gender)

# Welch's t-test on Claim Severity (rows without a claim have NaN severity and are dropped)
male_sev = df_gender.loc[df_gender['Gender'] == 'Male', 'ClaimSeverity'].dropna()
female_sev = df_gender.loc[df_gender['Gender'] == 'Female', 'ClaimSeverity'].dropna()

# Welch's test needs at least two observations per group; with fewer the result
# is undefined, so record NaN explicitly instead of testing degenerate input.
if min(len(male_sev), len(female_sev)) >= 2:
    t_stat, p_val = stats.ttest_ind(male_sev, female_sev, equal_var=False)
else:
    t_stat, p_val = np.nan, np.nan

print("\nT-test: Male vs Female Claim Severity")
print(f"p-value = {p_val:.4f}")
if np.isnan(p_val):
    # NaN compares False against 0.05, which would otherwise be reported as a
    # genuine "fail to reject" even though no valid test was performed.
    print(f"⚠️ Test inconclusive: not enough claims (n={len(male_sev)} vs n={len(female_sev)})")
elif p_val < 0.05:
    print("✅ Reject H₀: Significant gender risk difference")
else:
    print("❌ Fail to reject H₀")

In [None]:
# Focus on the 10 postal codes with the most rows (to avoid noise from small groups)
top_postal = df['PostalCode'].value_counts().head(10).index
df_postal = df[df['PostalCode'].isin(top_postal)]

# H₀: No risk differences between postal codes
freq_by_postal = df_postal.groupby('PostalCode')['HasClaim'].mean()
print("Claim Frequency by Top 10 Postal Codes:")
print(freq_by_postal.sort_values(ascending=False))

# H₀: No margin difference between postal codes
margin_by_postal = df_postal.groupby('PostalCode')['Margin'].mean()
print("\nAvg Margin by Top 10 Postal Codes:")
print(margin_by_postal.sort_values(ascending=False))

# Welch's t-test: postal code with the HIGHEST average margin vs the one with
# the LOWEST average margin among the top 10.
# NOTE(review): the two groups are picked because they are the extremes, so this
# p-value is optimistic; a test across all ten groups would be a stricter check.
pc1, pc2 = margin_by_postal.idxmax(), margin_by_postal.idxmin()
# Drop missing margins: ttest_ind propagates NaN by default, so a single missing
# value would turn the whole p-value into NaN.
margin1 = df_postal.loc[df_postal['PostalCode'] == pc1, 'Margin'].dropna()
margin2 = df_postal.loc[df_postal['PostalCode'] == pc2, 'Margin'].dropna()

# Welch's test needs at least two observations per group; with fewer the result
# is undefined, so record NaN explicitly instead of testing degenerate input.
if min(len(margin1), len(margin2)) >= 2:
    t_stat, p_val = stats.ttest_ind(margin1, margin2, equal_var=False)
else:
    t_stat, p_val = np.nan, np.nan

print(f"\nT-test: Postal {pc1} vs {pc2} Margin")
print(f"p-value = {p_val:.4f}")
if np.isnan(p_val):
    # NaN compares False against 0.05, which would otherwise be reported as a
    # genuine "fail to reject" even though no valid test was performed.
    print(f"⚠️ Test inconclusive: not enough data (n={len(margin1)} vs n={len(margin2)})")
elif p_val < 0.05:
    print("✅ Reject H₀: Significant margin difference")
else:
    print("❌ Fail to reject H₀")

In [None]:
# Summarize findings
#
# Each recommendation must be gated on the p-value of ITS OWN hypothesis test.
# The test cells all store their result in the same name (`p_val`), so by the
# time this cell runs only the last test's value survives. The three p-values
# are therefore recomputed here under distinct names, which also makes this
# cell independent of the order in which the test cells were executed.
alpha = 0.05  # significance level used for every test in this notebook


def _welch_p_value(sample_a, sample_b):
    """Return the two-sided Welch t-test p-value for two pandas Series.

    Missing values are dropped first. Returns NaN when either sample has fewer
    than two observations, because the test is undefined in that case.
    """
    sample_a, sample_b = sample_a.dropna(), sample_b.dropna()
    if min(len(sample_a), len(sample_b)) < 2:
        return float('nan')
    return stats.ttest_ind(sample_a, sample_b, equal_var=False).pvalue


# Province: claim severity, Gauteng vs Western Cape
p_val_province = _welch_p_value(
    df.loc[df['Province'] == 'Gauteng', 'ClaimSeverity'],
    df.loc[df['Province'] == 'Western Cape', 'ClaimSeverity'],
)

# Gender: claim severity, Male vs Female
p_val_gender = _welch_p_value(
    df.loc[df['Gender'] == 'Male', 'ClaimSeverity'],
    df.loc[df['Gender'] == 'Female', 'ClaimSeverity'],
)

# Postal code: margin, highest vs lowest average-margin code among the
# 10 most frequent postal codes
top_postal_codes = df['PostalCode'].value_counts().head(10).index
postal_margin_means = (
    df[df['PostalCode'].isin(top_postal_codes)]
    .groupby('PostalCode')['Margin']
    .mean()
)
p_val_postal = _welch_p_value(
    df.loc[df['PostalCode'] == postal_margin_means.idxmax(), 'Margin'],
    df.loc[df['PostalCode'] == postal_margin_means.idxmin(), 'Margin'],
)

print("=== BUSINESS RECOMMENDATIONS ===")

# A NaN p-value (inconclusive test) compares False here, so no recommendation
# is issued on the basis of a test that could not be run.
if p_val_province < alpha:
    print("- Adjust premiums by province (e.g., +10% in Gauteng)")
if p_val_gender < alpha:
    print("- Consider gender-based pricing (if legally permitted in SA)")
if p_val_postal < alpha:
    print("- Target high-margin postal codes for marketing campaigns")

# Unconditional recommendations (not tied to any test result)
print("- Investigate high-risk postal codes for fraud patterns")
print("- Bundle safety features (e.g., tracking devices) for high-severity vehicles")