# In [None]:
# task3_hypothesis.py

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, ttest_ind, f_oneway

# -----------------------------
# Step 1: Load dataset
# -----------------------------
# The path is relative to the current working directory; adjust if needed.
df = pd.read_csv("../data/insurance.csv")

# -----------------------------
# Step 2: Compute metrics
# -----------------------------
# Vectorized column operations are used instead of per-row `apply` calls:
# the results are the same, but no Python-level function runs for each row.
# Rows whose TotalClaims is NaN compare False here, so they count as
# "no claim", exactly as a plain `x > 0` check would treat them.
has_claim = df['TotalClaims'] > 0

# ClaimFrequency: 1 when the row has a positive claim amount, otherwise 0.
df['ClaimFrequency'] = has_claim.astype(int)

# ClaimSeverity: the claim amount where it is positive, NaN elsewhere, so
# rows without a claim are excluded once `.dropna()` is applied.
df['ClaimSeverity'] = df['TotalClaims'].where(has_claim)

# Margin: premium minus claims for each row.
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# -----------------------------
# Step 3: Define function to interpret p-values
# -----------------------------
def interpret_p(p_value, alpha=0.05):
    """Translate a p-value into a hypothesis-test decision string.

    Args:
        p_value: p-value produced by a statistical test.
        alpha: significance level; H0 is rejected when ``p_value < alpha``.

    Returns:
        "Reject H0 ..." when ``p_value < alpha``, "Fail to reject H0 ..."
        otherwise. A NaN p-value yields an "Inconclusive ..." message
        instead of being reported as "no significant difference".
    """
    # NaN compares False against any number, so without this guard a test
    # that could not be computed would silently read as "Fail to reject H0".
    if np.isnan(p_value):
        return "Inconclusive → p-value is NaN (test could not be computed)"
    if p_value < alpha:
        return "Reject H0 → significant difference exists"
    return "Fail to reject H0 → no significant difference"

# -----------------------------
# Step 4: Province Analysis
# -----------------------------
print("\n--- Province Analysis ---")
# Claim Frequency (Chi-Square): test of independence between Province and
# the binary ClaimFrequency indicator.
table = pd.crosstab(df['Province'], df['ClaimFrequency'])
chi2, p, dof, ex = chi2_contingency(table)
print("Claim Frequency by Province: p-value =", round(p,4), "|", interpret_p(p))

# Group once and reuse the grouping for both ANOVA tests below.
by_province = df.groupby('Province')

# Claim Severity (ANOVA)
# A province with no claims has no severity observations. An empty sample
# makes f_oneway return NaN, so empty samples are left out of the test.
groups = [
    sample
    for sample in (group['ClaimSeverity'].dropna() for name, group in by_province)
    if len(sample) > 0
]
# f_oneway requires at least two samples; report NaN when fewer remain.
f_stat, p_val = f_oneway(*groups) if len(groups) >= 2 else (np.nan, np.nan)
print("Claim Severity by Province: p-value =", round(p_val,4), "|", interpret_p(p_val))

# Margin (ANOVA), with the same empty-sample handling as above.
groups_margin = [
    sample
    for sample in (group['Margin'].dropna() for name, group in by_province)
    if len(sample) > 0
]
f_stat, p_val = f_oneway(*groups_margin) if len(groups_margin) >= 2 else (np.nan, np.nan)
print("Margin by Province: p-value =", round(p_val,4), "|", interpret_p(p_val))

# -----------------------------
# Step 5: Gender Analysis
# -----------------------------
print("\n--- Gender Analysis ---")
male = df[df['Gender']=='Male']
female = df[df['Gender']=='Female']

# Claim Frequency (Chi-Square)
# The t-tests below compare only the 'Male' and 'Female' rows, so the
# contingency table is restricted to those same rows. Any other Gender value
# would otherwise add extra rows to the table and the chi-square test would
# no longer be a Male-vs-Female comparison.
male_female = df[df['Gender'].isin(['Male', 'Female'])]
table = pd.crosstab(male_female['Gender'], male_female['ClaimFrequency'])
chi2, p, dof, ex = chi2_contingency(table)
print("Claim Frequency by Gender: p-value =", round(p,4), "|", interpret_p(p))

# Claim Severity (T-Test): two-sample t-test on claim amounts of rows that
# actually have a claim (NaN severities are dropped).
t_stat, p_val = ttest_ind(male['ClaimSeverity'].dropna(), female['ClaimSeverity'].dropna())
print("Claim Severity by Gender: p-value =", round(p_val,4), "|", interpret_p(p_val))

# Margin (T-Test): two-sample t-test on premium-minus-claims.
t_stat, p_val = ttest_ind(male['Margin'].dropna(), female['Margin'].dropna())
print("Margin by Gender: p-value =", round(p_val,4), "|", interpret_p(p_val))

# -----------------------------
# Step 6: ZipCode Analysis
# -----------------------------
print("\n--- ZipCode Analysis ---")
# Select top 2 ZipCodes by count for comparison
top_zip = df['ZipCode'].value_counts().index[:2]
group_A = df[df['ZipCode']==top_zip[0]]
group_B = df[df['ZipCode']==top_zip[1]]

# Claim Frequency (Chi-Square)
# The table must have one row per selected zip code. Crossing
# `ZipCode.isin([...])` against ClaimFrequency would instead compare
# "either top zip" with "every other zip", which is not the comparison
# reported in the message below.
zip_pair = pd.concat([group_A, group_B])
table = pd.crosstab(zip_pair['ZipCode'], zip_pair['ClaimFrequency'])
chi2, p, dof, ex = chi2_contingency(table)
print(f"Claim Frequency between Zip {top_zip[0]} and {top_zip[1]}: p-value =", round(p,4), "|", interpret_p(p))

# Claim Severity (T-Test): compares claim amounts of rows with a claim
# (NaN severities are dropped) between the two zip codes.
t_stat, p_val = ttest_ind(group_A['ClaimSeverity'].dropna(), group_B['ClaimSeverity'].dropna())
print(f"Claim Severity between Zip {top_zip[0]} and {top_zip[1]}: p-value =", round(p_val,4), "|", interpret_p(p_val))

# Margin (T-Test): compares premium-minus-claims between the two zip codes.
t_stat, p_val = ttest_ind(group_A['Margin'].dropna(), group_B['Margin'].dropna())
print(f"Margin between Zip {top_zip[0]} and {top_zip[1]}: p-value =", round(p_val,4), "|", interpret_p(p_val))

print("\n--- Task 3 Hypothesis Testing Completed ---")
