In [13]:
# Task 3: Risk Analysis & Hypothesis Testing
# ==========================================

# Step 0: Import Libraries
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import ttest_ind, chi2_contingency

# Optional: display all columns
pd.set_option('display.max_columns', None)

# Step 1: Load Data
data_path = '../data/insurance_data.csv'  # Adjust if needed
df = pd.read_csv(data_path)

print("Columns in dataset:", df.columns.tolist())
df.head()

# Step 2: Define Key Metrics
df['ClaimFrequency'] = (df['TotalClaims'] > 0).astype(int)
df['ClaimSeverity'] = df['TotalClaims'] / df['ClaimFrequency'].replace(0, np.nan)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Step 3: Dynamic Column Mapping
col_map = {'province': None, 'zip': None, 'gender': None}

for col in df.columns:
    lower = col.lower()
    if 'prov' in lower:
        col_map['province'] = col
    elif 'zip' in lower:
        col_map['zip'] = col
    elif 'gender' in lower or 'sex' in lower:
        col_map['gender'] = col

print("Column mapping detected:", col_map)

# Step 4a: Hypothesis Test — Risk Differences Across Provinces
prov_col = col_map['province']
if prov_col:
    provinces = df[prov_col].unique()
    print("\nHypothesis Test: Risk Differences Across Provinces")
    for a, b in combinations(provinces, 2):
        grp_a = df[df[prov_col] == a]['ClaimFrequency']
        grp_b = df[df[prov_col] == b]['ClaimFrequency']
        stat, p = ttest_ind(grp_a, grp_b, equal_var=False, nan_policy='omit')
        print(f"{a} vs {b} — Claim Frequency p-value: {p:.4f}")
else:
    print("\nNo province column found.")

# Step 4b: Hypothesis Test — Risk Differences Across Zip Codes
zip_col = col_map['zip']
if zip_col:
    zip_codes = df[zip_col].unique()[:5]  # Limit to first 5 zip codes
    print("\nHypothesis Test: Risk Differences Across Zip Codes")
    for a, b in combinations(zip_codes, 2):
        grp_a = df[df[zip_col] == a]['ClaimFrequency']
        grp_b = df[df[zip_col] == b]['ClaimFrequency']
        stat, p = ttest_ind(grp_a, grp_b, equal_var=False, nan_policy='omit')
        print(f"{a} vs {b} — Claim Frequency p-value: {p:.4f}")
else:
    print("\nNo zip column found.")

# Step 4c: Hypothesis Test — Margin Differences Across Zip Codes
if zip_col:
    print("\nHypothesis Test: Margin Differences Across Zip Codes")
    for a, b in combinations(zip_codes, 2):
        grp_a = df[df[zip_col] == a]['Margin']
        grp_b = df[df[zip_col] == b]['Margin']
        stat, p = ttest_ind(grp_a, grp_b, equal_var=False, nan_policy='omit')
        print(f"{a} vs {b} — Margin p-value: {p:.4f}")

# Step 4d: Hypothesis Test — Risk Differences by Gender
gender_col = col_map['gender']
if gender_col:
    genders = df[gender_col].unique()
    print("\nHypothesis Test: Risk Differences by Gender")
    if len(genders) >= 2:
        grp_1 = df[df[gender_col] == genders[0]]['ClaimFrequency']
        grp_2 = df[df[gender_col] == genders[1]]['ClaimFrequency']
        stat, p = ttest_ind(grp_1, grp_2, equal_var=False, nan_policy='omit')
        print(f"{genders[0]} vs {genders[1]} — Claim Frequency p-value: {p:.4f}")
else:
    print("\nNo gender column found.")

# Step 5: Summary & Interpretation
print("\n--- Interpretation ---")
print("If p-value < 0.05 → reject null hypothesis (significant difference)")
print("If p-value >= 0.05 → fail to reject null hypothesis (no significant difference)")


Columns in dataset: ['Age', 'Gender', 'Province', 'VehicleType', 'TotalPremium', 'TotalClaims', 'LossRatio']
Column mapping detected: {'province': 'Province', 'zip': None, 'gender': 'Gender'}

Hypothesis Test: Risk Differences Across Provinces
ON vs BC — Claim Frequency p-value: nan
ON vs MB — Claim Frequency p-value: nan
ON vs QC — Claim Frequency p-value: nan
ON vs AB — Claim Frequency p-value: nan
BC vs MB — Claim Frequency p-value: nan
BC vs QC — Claim Frequency p-value: nan
BC vs AB — Claim Frequency p-value: nan
MB vs QC — Claim Frequency p-value: nan
MB vs AB — Claim Frequency p-value: nan
QC vs AB — Claim Frequency p-value: nan

No zip column found.

Hypothesis Test: Risk Differences by Gender
Male vs Female — Claim Frequency p-value: nan

--- Interpretation ---
If p-value < 0.05 → reject null hypothesis (significant difference)
If p-value >= 0.05 → fail to reject null hypothesis (no significant difference)


  res = hypotest_fun_out(*samples, **kwds)
