In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- FIX 1: Add Project Root to Path to enable 'src' imports ---
import sys
import os
# This navigates one directory up from 'notebooks' to the project root
project_root = os.path.abspath('..') 
if project_root not in sys.path:
    sys.path.append(project_root)
print(f"Project root added to path: {project_root}")
# --- END FIX 1 ---

# Import our custom statistical testing functions from the src folder
from src.statistical_tests import calculate_metrics, test_claim_frequency, test_numerical_difference

# Set standard alpha (significance level)
ALPHA = 0.05 

# --- Data Loading ---
# NOTE: Ensure your data file is named 'insurance_claims_raw.csv' and is in data/raw/
try:
    df_raw = pd.read_csv('data/raw/insurance_claims_raw.csv')
    print("Data loaded successfully.")
    
    # --- Data Processing: df_metrics is defined here ---
    df_metrics = calculate_metrics(df_raw)
    print("Metrics (HasClaim, Margin) calculated.")
    
except FileNotFoundError:
    print("FATAL ERROR: Data file not found. Ensure 'data/raw/insurance_claims_raw.csv' exists.")
    df_metrics = None # Explicitly set to None if loading fails

if df_metrics is not None:
    print("\nDataFrame Head:")
    df_metrics.head()

Project root added to path: c:\Users\Kifiya_Administrator\Desktop\New folder (3)\Insurance-Analytics-Week\Insurance-Analytics-Week
FATAL ERROR: Data file not found. Ensure 'data/raw/insurance_claims_raw.csv' exists.


In [2]:
# IMPORTANT: Replace these with two valid province names found in your data
PROVINCE_A = 'Gauteng'
PROVINCE_B = 'Western Cape'

print(f"## Testing H₀: No risk difference (Claim Frequency) between {PROVINCE_A} and {PROVINCE_B}")
print("-" * 60)

if df_metrics is not None:
    p_province_risk = test_claim_frequency(
        df_metrics, 
        grouping_col='Province', 
        group_a=PROVINCE_A, 
        group_b=PROVINCE_B, 
        alpha=ALPHA
    )

    # --- Business Interpretation ---
    print("\n--- Business Interpretation ---")
    if p_province_risk < ALPHA:
        print(f"✅ Conclusion: Reject H₀. Province is a significant risk factor (p-value={p_province_risk:.5f}).")
        print("Recommendation: Adjust premiums regionally, targeting lower rates for the lower-risk province to attract new clients.")
    else:
        print(f"❌ Conclusion: Fail to Reject H₀. Regional risk segmentation is not statistically justified (p-value={p_province_risk:.5f}).")
else:
    print("Skipping test: df_metrics is not defined (Data loading failed).")

## Testing H₀: No risk difference (Claim Frequency) between Gauteng and Western Cape
------------------------------------------------------------
Skipping test: df_metrics is not defined (Data loading failed).


In [3]:
GENDER_A = 'Female'
GENDER_B = 'Male'

print(f"\n## Testing H₀: No risk difference (Claim Frequency) between {GENDER_A} and {GENDER_B}")
print("-" * 60)

if df_metrics is not None:
    p_gender_risk = test_claim_frequency(
        df_metrics, 
        grouping_col='Gender', 
        group_a=GENDER_A, 
        group_b=GENDER_B, 
        alpha=ALPHA
    )

    # --- Business Interpretation ---
    print("\n--- Business Interpretation ---")
    if p_gender_risk < ALPHA:
        print(f"✅ Conclusion: Reject H₀. Gender is a statistically significant risk factor (p-value={p_gender_risk:.5f}).")
        print("Recommendation: Use this insight for targeted marketing campaigns, highlighting plan features attractive to the lower-risk gender segment.")
    else:
        print(f"❌ Conclusion: Fail to Reject H₀. Gender risk difference is not statistically significant (p-value={p_gender_risk:.5f}).")
else:
    print("Skipping test: df_metrics is not defined (Data loading failed).")


## Testing H₀: No risk difference (Claim Frequency) between Female and Male
------------------------------------------------------------
Skipping test: df_metrics is not defined (Data loading failed).


In [4]:
# IMPORTANT: Replace these with two specific zip codes identified in your EDA
ZIP_A = '1000' # Placeholder: High volume Zip Code 1
ZIP_B = '5000' # Placeholder: High volume Zip Code 2

print(f"\n## Testing H₀: No significant margin difference (Mean Profit) between Zip Codes {ZIP_A} and {ZIP_B}")
print("-" * 60)

if df_metrics is not None:
    p_margin_zip = test_numerical_difference(
        df_metrics, 
        grouping_col='PostalCode', 
        metric_col='Margin', 
        group_a=ZIP_A, 
        group_b=ZIP_B, 
        alpha=ALPHA,
        filter_claims=False # Margin is calculated across all policies
    )

    # --- Business Interpretation ---
    print("\n--- Business Interpretation ---")
    if p_margin_zip < ALPHA:
        print(f"✅ Conclusion: Reject H₀. Profitability varies significantly by location (p-value={p_margin_zip:.5f}).")
        print("Recommendation: Investigate cost/claims drivers in the low-margin zip code and adjust premium loading factors.")
    else:
        print(f"❌ Conclusion: Fail to Reject H₀. Margin difference is not statistically significant (p-value={p_margin_zip:.5f}).")
else:
    print("Skipping test: df_metrics is not defined (Data loading failed).")


## Testing H₀: No significant margin difference (Mean Profit) between Zip Codes 1000 and 5000
------------------------------------------------------------
Skipping test: df_metrics is not defined (Data loading failed).
