In [2]:
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

In [3]:
df = pd.read_csv("/kaggle/input/insurance-cleaned-data/cleaned_data.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70573 entries, 0 to 70572
Data columns (total 54 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   UnderwrittenCoverID       70573 non-null  int64  
 1   PolicyID                  70573 non-null  int64  
 2   TransactionMonth          70573 non-null  object 
 3   IsVATRegistered           70573 non-null  bool   
 4   Citizenship               70573 non-null  object 
 5   LegalType                 70573 non-null  object 
 6   Title                     70573 non-null  object 
 7   Language                  70573 non-null  object 
 8   Bank                      70573 non-null  object 
 9   AccountType               70573 non-null  object 
 10  MaritalStatus             70573 non-null  object 
 11  Gender                    70573 non-null  object 
 12  Country                   70573 non-null  object 
 13  Province                  70573 non-null  object 
 14  Postal

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70573 entries, 0 to 70572
Data columns (total 54 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   UnderwrittenCoverID       70573 non-null  int64  
 1   PolicyID                  70573 non-null  int64  
 2   TransactionMonth          70573 non-null  object 
 3   IsVATRegistered           70573 non-null  bool   
 4   Citizenship               70573 non-null  object 
 5   LegalType                 70573 non-null  object 
 6   Title                     70573 non-null  object 
 7   Language                  70573 non-null  object 
 8   Bank                      70573 non-null  object 
 9   AccountType               70573 non-null  object 
 10  MaritalStatus             70573 non-null  object 
 11  Gender                    70573 non-null  object 
 12  Country                   70573 non-null  object 
 13  Province                  70573 non-null  object 
 14  Postal

Let's add coumns usefull for hypothesis

In [6]:
df['HasClaim'] = df["TotalClaims"] > 0
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]

Let's test key hypotheses about risk drivers one by one

**1.  H₀: There are no risk differences across provinces**

let's see by claim frequency

In [10]:
contingency_table = pd.crosstab( df["Province"] ,  df["HasClaim"])
print(contingency_table)
chi2, p, dof, expected = chi2_contingency(contingency_table)
print( "chi2", chi2)
print("P value" , p)
print("dof" , dof )
print("expected" , expected)
if p < 0.5:
    print("Reject the hypothesis")
    print("There are risk differences across provinces")
else:
    print("Could not reject hypothesis")

HasClaim       False  True 
Province                   
Eastern Cape    2410      8
Free State       292      2
Gauteng        24931    108
KwaZulu-Natal  16876     82
Limpopo         2284      8
Mpumalanga      3109     14
North West      9462     28
Northern Cape    202      0
Western Cape   10729     28
chi2 13.635798638247408
P value 0.09176615359073904
dof 8
expected [[2.40847505e+03 9.52494580e+00]
 [2.92841880e+02 1.15811996e+00]
 [2.49403668e+04 9.86332167e+01]
 [1.68911993e+04 6.68006745e+01]
 [2.28297139e+03 9.02860867e+00]
 [3.11069793e+03 1.23020702e+01]
 [9.45261715e+03 3.73828518e+01]
 [2.01204285e+02 7.95715075e-01]
 [1.07146262e+04 4.23737973e+01]]
Reject the hypothesis
There are risk differences across provinces


In [11]:
df.groupby('Province')['HasClaim'].mean()

Province
Eastern Cape     0.003309
Free State       0.006803
Gauteng          0.004313
KwaZulu-Natal    0.004835
Limpopo          0.003490
Mpumalanga       0.004483
North West       0.002950
Northern Cape    0.000000
Western Cape     0.002603
Name: HasClaim, dtype: float64

let's see by claim severity

In [15]:
Province = df["Province"].value_counts()
Province_claim = []
for key in Province.index:
    Province_claim.append(df[df["Province"] == key]["TotalClaims"])

In [19]:
f_stat , p_value_anvoa = f_oneway(*Province_claim)
print("Anova Results")
print(f"F Score,{f_stat:.4f}" )
print(f"P value,{p_value_anvoa:.4f}" )
print("Alpha Value 0.5")
if p_value_anvoa < 0.5:
    print("Reject the hypothesis")
    print("There are risk differences across provinces in terms of severity")
else:
    print("Could not reject hypothesis")
    print("There are no risk differences across provinces in terms of severity")

Anova Results
F Score,0.8057
P value,0.5974
Alpha Value 0.5
Could not reject hypothesis
There are no risk differences across provinces in terms of severity


**2. H₀: There are no risk differences between zip codes**

In [30]:
contingency_table = pd.crosstab( df["PostalCode"] ,  df["HasClaim"])
print(contingency_table)
chi2, p, dof, expected = chi2_contingency(contingency_table)
print( "chi2", chi2)
print(f"P value,{ p:.6f}")
print("dof" , dof )
#print("expected" , expected)
if p < 0.5:
    print("Reject the hypothesis")
    print("There are risk differences between zip codes")
else:
    print("Could not reject hypothesis")

HasClaim    False  True 
PostalCode              
1              52      0
8              16      0
29             22      0
44            131      1
46             11      0
...           ...    ...
8570           90      0
9323           54      1
9431          117      0
9499           76      1
9745           45      0

[338 rows x 2 columns]
chi2 461.42215938721336
P value,0.000007
dof 337
Reject the hypothesis
There are risk differences between zip codes


Let's see by claim severity

In [28]:
PostalCode = df["PostalCode"].value_counts()
PostalCode_claim = []
for key in Province.index:
    PostalCode_claim.append(df[df["PostalCode"] == key]["TotalClaims"])

In [31]:
f_stat , p_value_anvoa = f_oneway(*PostalCode_claim)
print("Anova Results")
print(f"F Score,{f_stat:.4f}" )
print(f"P value,{p_value_anvoa:.4f}" )
print("Alpha Value 0.5")
if p_value_anvoa < 0.5:
    print("Reject the hypothesis")
    print("There are risk differences between zip codes")
else:
    print("Could not reject hypothesis")
    print("There are no risk differences between zip codes")

Anova Results
F Score,nan
P value,nan
Alpha Value 0.5
Could not reject hypothesis
There are no risk differences between zip codes


  f_stat , p_value_anvoa = f_oneway(*PostalCode_claim)


**3.H₀: There are no significant margin (profit) difference between zip codes**

In [33]:
Profit_postalcode = []
for key in PostalCode.index:
    Profit_postalcode.append(df[df["PostalCode"] == key]["Margin"])

In [34]:
f_stat , p_value_anvoa = f_oneway(*Profit_postalcode)

In [35]:
print("Anova Results")
print(f"F Score,{f_stat:.4f}" )
print(f"P value,{p_value_anvoa:.4f}" )
print("Alpha Value 0.5")

Anova Results
F Score,0.8533
P value,0.9760
Alpha Value 0.5


In [36]:
if p_value_anvoa < 0.5:
    print("Reject null hypotesis")
    print("There are significant margin (profit) difference between zipcodes ")
else:
    print("Accept the null hypotesis")
    print("There are no significant margin (profit) difference between zipcodes ")

Accept the null hypotesis
There are no significant margin (profit) difference between zipcodes 


**4. H₀: There are not significant risk difference between Women and Men**

In [37]:
df.groupby("Gender" )[ "HasClaim"].value_counts()

Gender         HasClaim
Female         False          15
               True            1
Male           False        1626
               True           10
Not specified  False       68654
               True          267
Name: count, dtype: int64

In [39]:
contingency_table2 = pd.crosstab( df["Gender"] ,  df["HasClaim"])
print(contingency_table2)
chi2_2, p_2, dof_2, expected_2 = chi2_contingency(contingency_table2)
print( "chi2", chi2_2)
print("P value" , p_2)
print("dof" , dof_2 )
print("expected" , expected_2)
if p < 0.5:
    print("Reject the hypothesis")
    print("There are significant risk difference between Women and Men**")
else:
    print("Could not Reject the hypothesis")
    print("There are not significant risk difference between Women and Men**")

HasClaim       False  True 
Gender                     
Female            15      1
Male            1626     10
Not specified  68654    267
chi2 16.02833736354255
P value 0.00033074307853263893
dof 2
expected [[1.59369731e+01 6.30269366e-02]
 [1.62955550e+03 6.44450427e+00]
 [6.86495075e+04 2.71492469e+02]]
Reject the hypothesis
There are significant risk difference between Women and Men**
