In [None]:
# Jupyter Notebook Cell 1: Setup
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, ttest_ind

# Read the pipe-delimited source file into one DataFrame.
DATA_PATH = "../data/MachineLearningRating_v3.txt"
df = pd.read_csv(DATA_PATH, sep='|', low_memory=False)

# Derived columns:
#   HasClaim      - True where TotalClaims is strictly positive
#   ClaimSeverity - TotalClaims on rows with a claim, NaN on all other rows
#   Margin        - TotalPremium minus TotalClaims
claim_amount = df['TotalClaims']
df['HasClaim'] = claim_amount.gt(0)
df['ClaimSeverity'] = claim_amount.where(df['HasClaim'])
df['Margin'] = df['TotalPremium'].sub(claim_amount)


In [None]:
# Cell 2: Claim Frequency by Province (Chi-Squared)
# Contingency table: one row per province, one column per observed HasClaim value.
ct_prov = pd.crosstab(index=df['Province'], columns=df['HasClaim'])
prov_test = chi2_contingency(ct_prov)
chi2, p_prov = prov_test[0], prov_test[1]
print(f"Chi-Squared Test (Province): p = {p_prov:.4f}")

# Divide each row by its total so the True column holds the share of rows
# with a claim for that province.
prov_totals = ct_prov.sum(axis=1)
prov_share = ct_prov.div(prov_totals, axis=0).reset_index()

sns.barplot(data=prov_share, x='Province', y=True)
plt.title("Claim Frequency by Province")
plt.ylabel("Proportion with Claim")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Cell 3: Claim Frequency by Postal Code (Chi-Squared)
# Same test as for Province, with one contingency-table row per postal code.
# NOTE(review): if many postal codes have few rows, expected cell counts may be
# small - confirm the chi-squared approximation is adequate for this table.
ct_postal = pd.crosstab(index=df['PostalCode'], columns=df['HasClaim'])
postal_test = chi2_contingency(ct_postal)
chi2, p_postal = postal_test[0], postal_test[1]
print(f"Chi-Squared Test (Postal Code): p = {p_postal:.4f}")


In [None]:
# Cell 4: Margin Difference Between Two Postal Codes (T-Test)
# Compares Margin between the first two distinct non-null postal codes (in
# order of first appearance in the data) using Welch's t-test (equal_var=False).
#
# p_margin is assigned unconditionally: it stays NaN when the comparison cannot
# be run, so a value left over from an earlier execution of this cell can never
# be picked up by later cells.
p_margin = float('nan')

postal_codes = df['PostalCode'].dropna().unique()
if len(postal_codes) >= 2:
    pc1, pc2 = postal_codes[0], postal_codes[1]
    # Select rows and column in one .loc call instead of chained indexing.
    margin1 = df.loc[df['PostalCode'] == pc1, 'Margin'].dropna()
    margin2 = df.loc[df['PostalCode'] == pc2, 'Margin'].dropna()
    t_stat, p_margin = ttest_ind(margin1, margin2, equal_var=False)
    print(f"T-Test Margin {pc1} vs {pc2}: p = {p_margin:.4f}")

    sns.boxplot(data=df[df['PostalCode'].isin([pc1, pc2])], x='PostalCode', y='Margin')
    plt.title(f"Margin Comparison: {pc1} vs {pc2}")
    plt.show()
else:
    print("Not enough postal codes for margin comparison.")


In [None]:
# Cell 5: Claim Frequency by Gender
# Contingency table: one row per gender, one column per observed HasClaim value.
ct_gender = pd.crosstab(index=df['Gender'], columns=df['HasClaim'])
gender_test = chi2_contingency(ct_gender)
chi2, p_gender = gender_test[0], gender_test[1]
print(f"Chi-Squared Test (Gender): p = {p_gender:.4f}")

# Divide each row by its total so the True column holds the share of rows
# with a claim for that gender.
gender_totals = ct_gender.sum(axis=1)
gender_share = ct_gender.div(gender_totals, axis=0).reset_index()

sns.barplot(data=gender_share, x='Gender', y=True)
plt.title("Claim Frequency by Gender")
plt.ylabel("Proportion with Claim")
plt.show()


In [None]:
# Cell 6: Summary & Interpretation
# Decision rule: reject H₀ when p < ALPHA. A p-value that is missing or NaN
# means the test produced no usable result; that is reported as inconclusive
# instead of being presented as "Fail to reject H₀".
ALPHA = 0.05


def _verdict(p_value):
    """Return the summary text for one test.

    Args:
        p_value: The test's p-value, or None when no p-value is available.

    Returns:
        "Reject H₀ (p = ...)" or "Fail to reject H₀ (p = ...)" for a valid
        p-value, and an "Inconclusive" message for None or NaN.
    """
    if p_value is None or pd.isna(p_value):
        return "Inconclusive (no valid p-value)"
    decision = 'Reject H₀' if p_value < ALPHA else 'Fail to reject H₀'
    return f"{decision} (p = {p_value:.4f})"


print("🔍 Statistical Results:")
print(f"Province - Claim Frequency: {_verdict(p_prov)}")
print(f"PostalCode - Claim Frequency: {_verdict(p_postal)}")
# p_margin may be absent (e.g. the margin test was skipped), so look it up
# defensively instead of referencing the name directly.
print(f"PostalCode - Margin: {_verdict(locals().get('p_margin'))}")
print(f"Gender - Claim Frequency: {_verdict(p_gender)}")
