In [1]:
%load_ext autoreload
%autoreload

In [2]:
import warnings
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from analysis.risk_hypthesis_tester import RiskHypothesisTester
from utils.load_data import load_local_data

In [3]:
# Silence every Python warning so the cell outputs below stay readable
warnings.filterwarnings(action="ignore")

# Load environment variables (python-dotenv); PROCESSED_DATA is read from
# the environment further down when the dataset is loaded
load_dotenv()

True

In [4]:
def find_project_root(start, markers=(".git", "README.md")):
    """Return the nearest directory at or above ``start`` that holds a marker.

    Args:
        start: Directory (``pathlib.Path``) where the upward search begins.
        markers: File or directory names that identify the project root.

    Returns:
        The first directory, walking from ``start`` towards the filesystem
        root, that contains any of ``markers``; ``None`` if no directory does.
    """
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return None


# Automatically go to the project root (the directory holding .git or
# README.md) so that relative paths resolve from there.
project_root = find_project_root(Path.cwd())
if project_root is None:
    # No marker anywhere above the current directory: stay put instead of
    # silently jumping to the filesystem root.
    project_root = Path.cwd()
    print("Warning: no .git or README.md found; keeping the current directory")

os.chdir(project_root)
print("Project root set to:", project_root)


Project root set to: /home/teshager/Documents/10Academy/repositories/projects/insurance-risk-modeling


## 🧱 1. Setup & Data Loading

In [5]:
# Load the cleaned insurance data from the directory named by the
# PROCESSED_DATA environment variable.
processed_data_dir = os.getenv("PROCESSED_DATA")
if not processed_data_dir:
    # os.getenv returns None for an unset variable; fail with a clear message
    # instead of the opaque TypeError os.path.join would raise.
    raise RuntimeError(
        "PROCESSED_DATA is not set; define it in the environment or in the .env file"
    )

file_path = os.path.join(processed_data_dir, 'cleaned_insurance_data.csv')

# CustomValueEstimate is dropped right after reading and is not used in this analysis
df = pd.read_csv(file_path).drop(columns='CustomValueEstimate')
df.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Bank,AccountType,MaritalStatus,...,CalculatedPremiumPerTerm,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,TotalPremium,TotalClaims,Gender_Inferred
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,First National Bank,Current account,Not specified,...,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,21.929825,0.0,Male
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,First National Bank,Current account,Not specified,...,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,21.929825,0.0,Male
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,First National Bank,Current account,Not specified,...,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,0.0,0.0,Male
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,First National Bank,Current account,Not specified,...,584.6468,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,512.84807,0.0,Male
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,First National Bank,Current account,Not specified,...,584.6468,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,0.0,0.0,Male


In [7]:
# List the column names in sorted order so a field is easy to find by eye
sorted_columns = df.columns.sort_values()
sorted_columns

Index(['AccountType', 'AlarmImmobiliser', 'Bank', 'CalculatedPremiumPerTerm',
       'CapitalOutstanding', 'Citizenship', 'Converted', 'CoverCategory',
       'CoverGroup', 'CoverType', 'CrossBorder', 'Cylinders', 'ExcessSelected',
       'Gender_Inferred', 'IsVATRegistered', 'LegalType', 'MainCrestaZone',
       'MaritalStatus', 'Model', 'NewVehicle', 'NumberOfDoors', 'PolicyID',
       'PostalCode', 'Product', 'Province', 'Rebuilt', 'RegistrationYear',
       'Section', 'SubCrestaZone', 'SumInsured', 'TermFrequency', 'Title',
       'TotalClaims', 'TotalPremium', 'TrackingDevice', 'TransactionMonth',
       'UnderwrittenCoverID', 'VehicleIntroDate', 'VehicleType', 'WrittenOff',
       'bodytype', 'cubiccapacity', 'kilowatts', 'make', 'mmcode'],
      dtype='object')

In [20]:
# Expose the inferred gender under the shorter name used as the grouping feature
df = df.rename(columns={'Gender_Inferred': "Gender"})

# Grouping feature and the metric compared between its groups
group_col = 'Gender'
metric_col = 'TotalClaims'

df.columns


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Bank',
       'AccountType', 'MaritalStatus', 'Province', 'PostalCode',
       'MainCrestaZone', 'SubCrestaZone', 'mmcode', 'VehicleType',
       'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity',
       'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate',
       'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding',
       'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'TotalPremium', 'TotalClaims', 'Gender'],
      dtype='object')

In [21]:

# Initialize the hypothesis tester with the loaded frame. This cell comes
# after the rename cell, so the frame passed in has the 'Gender' column that
# the t-test call below groups by.
# NOTE(review): whether RiskHypothesisTester copies `df` or keeps a reference
# is not visible here — confirm before mutating `df` after this point.
tester = RiskHypothesisTester(df)

In [22]:
# Check how many rows fall in each Gender group before comparing them
gender_counts = df['Gender'].value_counts()
gender_counts

Gender
Male      933812
Female     65732
Name: count, dtype: int64

In [23]:
# Compare the metric (TotalClaims) between the groups of the grouping
# feature (Gender) using the tester's t-test
result = tester.run_ttest(
    feature=group_col,
    metric=metric_col,
)

[2025-06-13 13:45:11,969] INFO: Running t-test for TotalClaims by Gender...
INFO:analysis.risk_hypthesis_tester:Running t-test for TotalClaims by Gender...
[2025-06-13 13:45:12,985] INFO: T-test result (p=0.02442): REJECT null
INFO:analysis.risk_hypthesis_tester:T-test result (p=0.02442): REJECT null


In [24]:
# View the t-test result (feature, metric, group_a, group_b, t_stat, p_value)
# as the cell's own value instead of wrapping it in print()
result

{'feature': 'Gender', 'metric': 'TotalClaims', 'group_a': 'Male', 'group_b': 'Female', 't_stat': 2.250445015518036, 'p_value': 0.024423519322613906}
