In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Add the parent directory to sys.path so the `src` package can be imported.
sys.path.append(os.path.abspath(".."))
from src.data_load import load_data

# Load the cleaned dataset
df = load_data("../data/cleaned_machineLearningRating.csv")

# STEP 1: Define Key Metrics (KPIs)
# A single mask drives both the frequency and the severity KPI, so the two can
# never disagree about which rows count as "a claim occurred".
has_claim = df["TotalClaims"] > 0

# 1. ClaimOccurred: 1 if TotalClaims > 0, else 0
df["ClaimOccurred"] = has_claim.astype(int)

# 2. ClaimSeverity: TotalClaims where a claim occurred, NaN everywhere else.
#    Blanking only the rows with TotalClaims == 0 would let a negative
#    TotalClaims keep a severity value while ClaimOccurred is 0.
df["ClaimSeverity"] = df["TotalClaims"].where(has_claim)

# 3. Margin: TotalPremium - TotalClaims
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]

# Sanity check of the KPI distributions (non-null counts, extremes) before testing
print(df[["ClaimOccurred", "ClaimSeverity", "Margin"]].describe())


       ClaimOccurred  ClaimSeverity         Margin
count  998443.000000    2773.000000  998443.000000
mean        0.002777   23169.911350      -2.922631
std         0.052627   38561.494622    2360.956489
min         0.000000     139.043860 -392848.566930
25%         0.000000    1680.728070       0.000000
50%         0.000000    6140.350877       2.160614
75%         0.000000   30459.649123      21.929825
max         1.000000  393092.105263    2253.507281



| Metric              | What It Measures                                  | How It’s Calculated                                                                              | Typical Use Cases                                                             |
| ------------------- | ------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------- |
| **Claim Frequency** | **How often** claims happen                       | `ClaimOccurred = 1 if TotalClaims > 0, else 0`         <br> Then use `mean()` to get the claim rate | - Risk of **occurrence**<br>- Fraud detection<br>- Pricing low-risk customers |
| **Claim Severity**  | **How large** the claims are **when they happen** | `ClaimSeverity = TotalClaims`, but only where `ClaimOccurred == 1`                               | - Risk of **magnitude**<br>- Reserve planning<br>- Underwriting large risks   |
| **Margin**         | Profitability of the insurance policy             | `Margin = TotalPremium - TotalClaims`                                                            | - Profitability analysis<br>- Pricing adjustments<br>- Risk management        |
| **Loss Ratio**     | Proportion of claims to premiums                  | `Loss Ratio = TotalClaims / TotalPremium`                                                         | - Overall profitability<br>- Pricing adequacy<br>- Risk assessment            |

In [3]:
# Hypothesis 1
# H0: there are no risk differences across provinces.
# We test statistically whether claim frequency varies between provinces.
#
# Why a chi-squared test of independence?
#   - the grouping variable is categorical (Province)
#   - the outcome is binary (ClaimOccurred: 0 or 1, i.e. "risk: yes or no")
import pandas as pd
from scipy.stats import chi2_contingency

# STEP 1: contingency table - one row per province, one column per outcome (0 / 1)
ct = pd.crosstab(index=df["Province"], columns=df["ClaimOccurred"])
print("\nContingency Table (Province vs ClaimOccurred):", ct, sep="\n")

# STEP 2: chi-squared test on the observed counts
chi2, p_value, dof, expected = chi2_contingency(ct)
print(
    "\nChi-squared test results:",
    f"Chi2 statistic: {chi2:.4f}",
    f"Degrees of freedom: {dof}",
    f"P-value: {p_value:.6f}",
    sep="\n",
)

# This string literal is the last expression of the cell, so the notebook
# echoes it as the cell's output.
"""remember The null hypothesis (H₀) says:

There is NO difference in risk across provinces."""


Contingency Table (Province vs ClaimOccurred):
ClaimOccurred       0     1
Province                   
Eastern Cape    30282    50
Free State       8088    11
Gauteng        392012  1322
KwaZulu-Natal  169052   483
Limpopo         24769    67
Mpumalanga      51970   126
North West     142938   349
Northern Cape    6372     8
Western Cape   170187   357

Chi-squared test results:
Chi2 statistic: 111.2178
Degrees of freedom: 8
P-value: 0.000000


'remember The null hypothesis (H₀) says:\n\nThere is NO difference in risk across provinces.'

# Since the p-value is far below 0.05 (it prints as 0.000000 only because of rounding), we reject the null hypothesis.
There is a statistically significant difference in claim frequency across provinces.
This means that geographic region (Province) does affect insurance risk,
and should be considered in pricing, risk segmentation, or underwriting strategy.

Hypothesis 2: Risk Differences Between Zip Codes. What we're testing:
H₀ (Null Hypothesis):
There are no risk differences between Zip Codes. Let's test and decide.

Let's select the top 5 most common zip codes in the dataset, as it is not practical to run a chi-squared test across thousands of zip codes.

In [None]:
# STEP 1: the five postal codes with the most records
top_zip_codes = df["PostalCode"].value_counts().index[:5].tolist()
print("Top 5 Zip Codes:", top_zip_codes)

# STEP 2: keep only the rows that belong to those postal codes
in_top_zip = df["PostalCode"].isin(top_zip_codes)
df_zip = df.loc[in_top_zip]

# STEP 3: contingency table - postal code (rows) vs ClaimOccurred (columns)
zip_ct = pd.crosstab(index=df_zip["PostalCode"], columns=df_zip["ClaimOccurred"])
print("\nContingency Table (ZipCode vs ClaimOccurred):", zip_ct, sep="\n")

# STEP 4: chi-squared test of independence on the table
chi2, p_value, dof, expected = chi2_contingency(zip_ct)
print(
    "\nChi-squared test results for ZipCode:",
    f"Chi2 statistic: {chi2:.4f}",
    f"Degrees of freedom: {dof}",
    f"P-value: {p_value:.6f}",
    sep="\n",
)


Top 5 Zip Codes: [2000, 122, 7784, 299, 7405]

Contingency Table (ZipCode vs ClaimOccurred):
ClaimOccurred       0    1
PostalCode                
122             48961  210
299             25479   67
2000           132656  486
7405            18489   29
7784            28532   50

Chi-squared test results for ZipCode:
Chi2 statistic: 60.3943
Degrees of freedom: 4
P-value: 0.000000


In [7]:
# We reject the null hypothesis because the p-value is below 0.05.
# There is a statistically significant difference in claim frequency between the five most common Zip Codes.
# Add a claim rate column


In [8]:
# Hypothesis 3: margin differences between Zip Codes
# What we're testing -
#   H0 (null hypothesis): there are no margin differences between Zip Codes.
# Margin was already derived in the KPI step; the same formula is re-applied here.
df["Margin"] = df["TotalPremium"].sub(df["TotalClaims"])

# This string literal is the last expression of the cell, so the notebook
# echoes it as the cell's output.
"""We’ll do a one-way ANOVA test (Analysis of Variance):

Because we're comparing means of a continuous variable (Margin) across multiple groups (Zip Codes)"""

"We’ll do a one-way ANOVA test (Analysis of Variance):\n\nBecause we're comparing means of a continuous variable (Margin) across multiple groups (Zip Codes)"

In [9]:
from scipy.stats import f_oneway

# STEP 1: restrict to the five most frequent postal codes (same selection as the zip-code chi-squared test)
top_zip_codes = df["PostalCode"].value_counts().index[:5].tolist()
df_zip = df.loc[df["PostalCode"].isin(top_zip_codes)]

# STEP 2: one Series of non-null Margin values per postal code, in top_zip_codes order
grouped_margins = [
    df_zip.loc[df_zip["PostalCode"] == zip_code, "Margin"].dropna()
    for zip_code in top_zip_codes
]

# STEP 3: one-way ANOVA across the per-zip-code samples
f_stat, p_value = f_oneway(*grouped_margins)

print(
    "\nANOVA test results (Margin by Zip Code):",
    f"F-statistic: {f_stat:.4f}",
    f"P-value: {p_value:.6f}",
    sep="\n",
)



ANOVA test results (Margin by Zip Code):
F-statistic: 2.4087
P-value: 0.047051


 Since the p-value (0.047) is less than 0.05, we reject the null hypothesis, although only by a narrow margin.
 That means there is a statistically significant difference in margins between the tested Zip Codes.
 This means that geographic region (Zip Code) does affect profitability, and should be considered in pricing or risk segmentation.

In [12]:
# Hypothesis 4: gender-based risk differences?
# What we're testing -
#   H0 (null hypothesis): there is no significant risk difference between women and men.
#
# KPI: ClaimOccurred again - a binary outcome (1 = claim filed, 0 = no claim).
# Two groups are compared (male vs female), so a Z-test for proportions is used.
from statsmodels.stats.proportion import proportions_ztest

# STEP 1: keep only rows labelled "male" or "female" (any other Gender value is excluded)
df_gender = df.loc[df["Gender"].isin(["male", "female"])]

# STEP 2: number of claims and number of records per gender
claims_by_gender = df_gender.groupby("Gender")["ClaimOccurred"]
claim_counts = claims_by_gender.sum()
total_counts = claims_by_gender.count()
print("\nClaim Counts by Gender:", claim_counts, sep="\n")
print("\nTotal Records by Gender:", total_counts, sep="\n")

# STEP 3: Z-test for proportions, with "male" passed first and "female" second
gender_order = ["male", "female"]
count = np.array([claim_counts[gender] for gender in gender_order])
nobs = np.array([total_counts[gender] for gender in gender_order])
stat, p_value = proportions_ztest(count, nobs)
print(
    "\nZ-Test for Claim Frequency by Gender:",
    f"Z-statistic: {stat:.4f}",
    f"P-value: {p_value:.6f}",
    sep="\n",
)


Claim Counts by Gender:
Gender
female     130
male      2643
Name: ClaimOccurred, dtype: int64

Total Records by Gender:
Gender
female     65731
male      932712
Name: ClaimOccurred, dtype: int64

Z-Test for Claim Frequency by Gender:
Z-statistic: 4.0301
P-value: 0.000056


In [None]:
# Since the p-value is 0.000056, which is less than 0.05, we reject the null hypothesis.
# There is a statistically significant difference in claim frequency between men and women.


# Experimental Hypothesis Testing Workflow


KPI: `TotalClaims > 0`, i.e. the risk of filing a claim (a binary outcome).

We are choosing a quantitative metric (KPI) that will reflect the impact of the feature we will be testing.

Question: do people with tracking devices file fewer claims than those without?

Our goal here is to test whether having a TrackingDevice reduces the risk of filing a claim.

| Step                   | Example                     |
| ---------------------- | --------------------------- |
| Pick a feature         | TrackingDevice              |
| Divide into two groups | "Yes" vs. "No"              |
| Choose a KPI           | ClaimOccurred (0/1)         |
| Run a test (Z-test)    | Compare the two claim rates |
| Check p-value          | Is it below 0.05?           |
| Business insight       | “Does it reduce claims?”    |


What are we really doing?
We are simulating a basic A/B test, like this:
“Does having a Tracking Device reduce the chance of an insurance claim?”
To answer that, we:
- Group the data into:
  - Group A: People without a tracking device
  - Group B: People with a tracking device
- Compare their claim rates (the share of each group that filed a claim)
- Run a statistical test to check:
  “Are the differences just random or really significant?”



In [None]:
# Next step: choose a feature and segment the data.
# Pick a feature (TrackingDevice, AlarmImmobiliser, NewVehicle or CoverType),
# then divide the data into:
#   Group A (control group) -> without the feature (e.g. no tracking device)
#   Group B (test group)    -> with the feature (e.g. has a tracking device)

# KPI: claim frequency (1 when TotalClaims > 0, else 0)
df["ClaimOccurred"] = df["TotalClaims"].gt(0).astype(int)

# Inspect which values the TrackingDevice column holds and how often each occurs
tracking_counts = df["TrackingDevice"].value_counts()
print(tracking_counts)


TrackingDevice
No     655086
Yes    343357
Name: count, dtype: int64


In [15]:
# Segment the two groups.
# Rows whose TrackingDevice value is neither "Yes" nor "No" are left out of the comparison.
has_known_tracking = df["TrackingDevice"].isin(["Yes", "No"])
df_tracking = df.loc[has_known_tracking]

# Control group (A): no tracking device
group_a = df_tracking.loc[df_tracking["TrackingDevice"] == "No"]

# Test group (B): has a tracking device
group_b = df_tracking.loc[df_tracking["TrackingDevice"] == "Yes"]

# Report the size of each group
print("Group A (No tracking):", group_a.shape[0])
print("Group B (Has tracking):", group_b.shape[0])


Group A (No tracking): 655086
Group B (Has tracking): 343357


In [16]:
# Next step: the statistical test. We compare the claim frequency of two groups,
# so we use a Z-test for proportions.
from statsmodels.stats.proportion import proportions_ztest

# Number of records with a claim in each group (A = no tracking, B = has tracking)
claim_counts = np.array([
    group_a["ClaimOccurred"].sum(),
    group_b["ClaimOccurred"].sum(),
])
# Number of records in each group
total_counts = np.array([
    len(group_a),
    len(group_b),
])

# Two-sided Z-test (the function's default alternative): it detects a
# difference in either direction, not specifically a reduction.
stat, p_value = proportions_ztest(claim_counts, total_counts)
print("\nZ-Test for Claim Frequency by Tracking Device:")
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {p_value:.6f}")

# Check if p-value is below 0.05
if p_value < 0.05:
    # A significant two-sided result says nothing about direction on its own,
    # so derive the direction from the observed claim rates rather than
    # always reporting a reduction.
    rate_without, rate_with = claim_counts / total_counts
    direction = "lower" if rate_with < rate_without else "higher"
    print(
        "Reject the null hypothesis: Claim frequency is significantly "
        f"{direction} with a tracking device."
    )
else:
    print("Fail to reject the null hypothesis: No significant difference in claim frequency with tracking devices.")


Z-Test for Claim Frequency by Tracking Device:
Z-statistic: 1.0254
P-value: 0.305164
Fail to reject the null hypothesis: No significant difference in claim frequency with tracking devices.


# So let's see the difference between the two types of tests we have done

| Item                          | First 4 Hypotheses                                                          | KPI-Based A/B Test (This one)                                        |
| ----------------------------- | --------------------------------------------------------------------------- | -------------------------------------------------------------------- |
| **Goal**                   | Check if **natural groups** (e.g. Provinces, Genders) affect risk or margin | Check if a **specific feature** (e.g. Tracking Device) affects a KPI |
|  **Group creation**         | Based on categories that already exist                                      | You **manually select** 2 groups: control vs test                    |
|  **Test type**              | Chi-squared, ANOVA, z-test                                                  | Often Z-test (binary outcome)                                        |
|  **What you measure (KPI)** | Frequency, Severity, Margin                                                 | You pick 1 KPI (e.g. ClaimOccurred)                                  |
|  **Thinking style**         | “Is there a pattern in existing data?”                                      | “If I apply a feature, does it change the outcome?”                  |
|  **Purpose**                | Discover **natural differences**                                            | Simulate a controlled test of a **feature’s effect**                 |


Regular hypothesis testing looks for natural group differences.
KPI-based A/B-style testing lets you test a specific feature’s effect in a more controlled way.

In [18]:
# Additional analysis: use NewVehicle as the segmentation feature.
# Rebuild the claim-frequency KPI (1 when TotalClaims > 0, else 0).
# NOTE(review): this cell does not inspect NewVehicle's unique values; add
# df["NewVehicle"].value_counts(dropna=False) if that check is wanted.
df["ClaimOccurred"] = df["TotalClaims"].gt(0).astype(int)


In [19]:
# Keep only rows whose NewVehicle value is True or False, then split on it.
df_nv = df.loc[df["NewVehicle"].isin([True, False])]
group_a = df_nv.loc[df_nv["NewVehicle"].eq(False)]  # old vehicles
group_b = df_nv.loc[df_nv["NewVehicle"].eq(True)]   # new vehicles

# Group sizes
print("Group A (Old vehicles):", group_a.shape[0])
print("Group B (New vehicles):", group_b.shape[0])

# Z-test for proportions; proportions_ztest was imported by an earlier cell.
# Group A is passed first, so a negative statistic means group A (old
# vehicles) has the lower claim rate and group B (new vehicles) the higher.
claim_counts = [grp["ClaimOccurred"].sum() for grp in (group_a, group_b)]
group_sizes = [len(grp) for grp in (group_a, group_b)]

stat, p_value = proportions_ztest(claim_counts, group_sizes)

print(
    "\nZ-Test for NewVehicle Impact on Claim Frequency:",
    f"Z-statistic: {stat:.4f}",
    f"P-value: {p_value:.6f}",
    sep="\n",
)

verdict = (
    " Reject the null hypothesis: NewVehicle status affects claim risk."
    if p_value < 0.05
    else " Fail to reject the null hypothesis: No significant difference between new and old vehicles."
)
print(verdict)


Group A (Old vehicles): 152402
Group B (New vehicles): 846041

Z-Test for NewVehicle Impact on Claim Frequency:
Z-statistic: -4.6145
P-value: 0.000004
 Reject the null hypothesis: NewVehicle status affects claim risk.


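# So here we can give the following business insights

The Z-statistic is negative with old vehicles passed as the first group, which means old vehicles show the lower claim frequency and new vehicles the higher one in this data.
The intuition that new cars may be safer, better maintained, or have better security features,
or that they may be treated more cautiously by drivers and therefore lead to fewer claims, is not supported by this test.
Insurers should therefore not offer discounts for new vehicles or treat them as a lower-risk segment on the basis of this result; the higher claim frequency of new vehicles should be investigated further (for example by controlling for cover type or vehicle value).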