# In [1]:
# Task 3: Statistical Validation of Risk Drivers

# Step 1: Import Libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# For nicer plots
sns.set(style="whitegrid")

# Step 2: Load Data
# The path is relative to the working directory of the running kernel, so it
# only resolves when the notebook is started from the expected folder.
data_path = '../data/insurance_data.csv'  # Adjust path if needed
if not Path(data_path).is_file():
    # Fail fast and show the absolute location that was tried: the bare
    # relative path in pandas' own error gives no hint about where the
    # kernel was actually looking.
    raise FileNotFoundError(
        f"Data file not found: '{data_path}' "
        f"(resolved to '{Path(data_path).resolve()}'). "
        "Adjust data_path or start the notebook from the expected directory."
    )
df = pd.read_csv(data_path)

# Quick look
# NOTE: display() is not imported in this file; it is expected to be supplied
# by the Jupyter/IPython session that runs this cell.
print("Data Preview:")
display(df.head())
print("\nData Info:")
df.info()

# Step 3: Define KPIs
# ClaimFrequency: 1 for rows whose TotalClaims is positive, 0 otherwise.
df['ClaimFrequency'] = np.where(df['TotalClaims'].gt(0), 1, 0)
# ClaimSeverity: TotalClaims divided by the claim indicator, with the zero
# indicators blanked to NaN first so rows without a claim yield NaN.
df['ClaimSeverity'] = df['TotalClaims'].div(
    df['ClaimFrequency'].mask(df['ClaimFrequency'] == 0)
)
# Margin: premium collected minus claims paid.
df['Margin'] = df['TotalPremium'].sub(df['TotalClaims'])

# Step 4: Hypothesis Testing Functions

def ttest_numeric(group1, group2, label, alpha=0.05):
    """Run Welch's two-sample t-test on two groups and print the decision.

    Args:
        group1: Numeric observations of the first group; NaNs are ignored.
        group2: Numeric observations of the second group; NaNs are ignored.
        label: Text describing the comparison, used in the printed report.
        alpha: Significance level for the reject / fail-to-reject decision.

    Returns:
        A ``(t_stat, p_value)`` tuple of floats. Both are NaN when the test
        cannot be computed (a group has fewer than two non-missing
        observations) and may be NaN when the statistic is undefined.
    """
    # Drop NaNs up front (equivalent to nan_policy='omit') so the sample-size
    # guard below sees the number of observations that really enter the test.
    sample1 = np.asarray(group1, dtype=float)
    sample1 = sample1[~np.isnan(sample1)]
    sample2 = np.asarray(group2, dtype=float)
    sample2 = sample2[~np.isnan(sample2)]

    print(f"\nT-test for {label}:")
    if min(sample1.size, sample2.size) < 2:
        # A variance estimate needs at least two points per group; without
        # this guard the p-value is NaN and would be misreported below.
        print(
            "Skipped → each group needs at least 2 non-missing observations "
            f"(got {sample1.size} and {sample2.size})"
        )
        return float("nan"), float("nan")

    # equal_var=False selects Welch's test (no equal-variance assumption).
    t_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)
    t_stat, p_value = float(t_stat), float(p_value)
    print(f"T-statistic = {t_stat:.4f}, P-value = {p_value:.4f}")
    if np.isnan(p_value):
        # NaN compares False against alpha, so it must be handled explicitly
        # rather than falling through to "Fail to reject".
        print(f"Test inconclusive → statistic is undefined for {label}")
    elif p_value < alpha:
        print(f"Reject Null Hypothesis → {label} differ significantly")
    else:
        print(f"Fail to reject Null Hypothesis → No significant difference in {label}")
    return t_stat, p_value

def chi2_categorical(feature, target, data=None):
    """Run a chi-squared independence test between two columns and print the decision.

    Args:
        feature: Name of the column whose values form the table rows.
        target: Name of the column whose values form the table columns.
        data: DataFrame holding both columns. When omitted, the module-level
            ``df`` is used, which preserves the original behavior.

    Returns:
        A ``(chi2, p)`` tuple of floats. Both are NaN when the contingency
        table has fewer than two rows or two columns, because independence
        cannot be tested with a single level.
    """
    frame = df if data is None else data
    contingency = pd.crosstab(frame[feature], frame[target])

    print(f"\nChi-squared test for {feature} vs {target}:")
    if contingency.shape[0] < 2 or contingency.shape[1] < 2:
        # An empty table makes chi2_contingency raise, and a one-level table
        # only yields a trivial p-value; report the situation explicitly.
        print(
            f"Skipped → need at least 2 levels of both {feature} and {target} "
            f"(table shape is {contingency.shape})"
        )
        return float("nan"), float("nan")

    chi2, p, _dof, _expected = stats.chi2_contingency(contingency)
    chi2, p = float(chi2), float(p)
    print(f"Chi2 = {chi2:.4f}, P-value = {p:.4f}")
    if p < 0.05:
        print(f"Reject Null Hypothesis → {feature} impacts {target}")
    else:
        print(f"Fail to reject Null Hypothesis → No significant impact of {feature} on {target}")
    return chi2, p

# Step 5: Test Hypotheses

def _compare_two_largest_groups(column, metrics, label_prefix=""):
    """T-test each metric between the two most frequent values of ``column``.

    Args:
        column: Name of the grouping column in the module-level ``df``.
        metrics: Names of the numeric columns to compare between the groups.
        label_prefix: Text placed before the group values in the report label.
    """
    if column not in df.columns:
        print(f"\nSkipping {column} tests → column not found in the data")
        return
    # value_counts() leaves out missing values and orders by frequency, so the
    # comparison uses the two best-populated real groups instead of whichever
    # two values (possibly NaN) happen to appear first in the file.
    top_two = df[column].value_counts().index[:2]
    if len(top_two) < 2:
        print(f"\nSkipping {column} tests → fewer than two groups available")
        return
    first, second = top_two
    for metric in metrics:
        ttest_numeric(
            df.loc[df[column] == first, metric],
            df.loc[df[column] == second, metric],
            f"{metric}: {label_prefix}{first} vs {second}",
        )


# 5.1: Province vs Claim Frequency & Severity
_compare_two_largest_groups('Province', ['ClaimFrequency', 'ClaimSeverity'])

# 5.2: Zip Code vs Claim Frequency, Severity, Margin
_compare_two_largest_groups(
    'ZipCode',
    ['ClaimFrequency', 'ClaimSeverity', 'Margin'],
    label_prefix="Zip ",
)

# 5.3: Gender vs Claim Frequency & Severity
if 'Gender' in df.columns:
    chi2_categorical('Gender', 'ClaimFrequency')
    # The severity comparison is tied to these two literal labels; verify they
    # exist so a differently coded column does not silently give empty groups.
    if {'Male', 'Female'} <= set(df['Gender'].dropna().unique()):
        group_male = df[df['Gender'] == 'Male']['ClaimSeverity']
        group_female = df[df['Gender'] == 'Female']['ClaimSeverity']
        ttest_numeric(group_male, group_female, "ClaimSeverity: Male vs Female")
    else:
        print("\nSkipping ClaimSeverity: Male vs Female → labels 'Male' and 'Female' are not both present in Gender")

# Step 6: Visualizations (Optional but recommended)
# savefig() does not create missing folders, so make sure the target exists
# before the first figure is written.
plots_dir = Path('../plots')
plots_dir.mkdir(parents=True, exist_ok=True)

# One entry per figure: (x column, y column, title, output file name).
boxplot_specs = [
    ('Province', 'ClaimSeverity', "Claim Severity by Province", 'claim_severity_province.png'),
    ('ZipCode', 'Margin', "Margin by Zip Code", 'margin_zipcode.png'),
    ('Gender', 'ClaimSeverity', "Claim Severity by Gender", 'claim_severity_gender.png'),
]

for x_col, y_col, title, file_name in boxplot_specs:
    if x_col not in df.columns:
        # The Gender hypothesis test is guarded by a column check; apply the
        # same guard here so an absent column skips the plot instead of failing.
        print(f"Skipping plot '{title}' → column {x_col} not found in the data")
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=x_col, y=y_col, data=df)
    plt.title(title)
    # Save before show(): the figure is written while it is still current.
    plt.savefig(plots_dir / file_name)
    plt.show()

# Step 7: Summary / Business Interpretation
# A non-significant result does not prove the null hypothesis, so the wording
# is "fail to reject" (as printed by the test functions) rather than "accept".
print("\nSummary Recommendations:")
print("- Reject hypotheses where p < 0.05 → these features impact risk or margin.")
print("- Fail to reject hypotheses where p >= 0.05 → no significant impact detected.")
print("Business recommendation: Adjust premiums, coverage, or segmentation based on significant findings.")


# Recorded cell output:
# FileNotFoundError: [Errno 2] No such file or directory: '../data/insurance_data.csv'