# Feature Engineering

In [146]:
import pandas as pd 
import numpy as np 
import os
import sys

# Get the absolute path to the 'util' directory (assuming 'eda.ipynb' is one level down from 'your_project')
data_dir = os.path.abspath(os.path.join('data'))

# Add the 'util' directory to sys.path if it's not already there
if data_dir not in sys.path:
    sys.path.append(data_dir)
data_dir

# Get the absolute path to the 'util' directory (assuming 'eda.ipynb' is one level down from 'your_project')
util_dir = os.path.abspath(os.path.join('utils'))

# Add the 'util' directory to sys.path if it's not already there
if util_dir not in sys.path:
    sys.path.append(util_dir)

In [147]:
df = pd.read_csv(data_dir+'/supervised_leaned_eda.csv')
df.head()

Unnamed: 0,ZIP Code,Cov C Amount Weighted Avg,Avg Fire Risk Score,Number of Negligible Fire Risk Exposure,Number of Low Fire Risk Exposure,Number of Moderate Fire Risk Exposure,Number of High Fire Risk Exposure,Number of Very High Fire Risk Exposure,Earned Premium 2020,Earned Exposure 2020,Non-CAT Cov A Smoke - Incurred Losses,Non-CAT Cov A Smoke - Number of Claims,Non-CAT Cov C Fire - Incurred Losses,Non-CAT Cov C Fire - Number of Claims,Non-CAT Cov C Smoke - Incurred Losses,Non-CAT Cov C Smoke - Number of Claims,CAT Cov A Smoke - Incurred Losses,CAT Cov A Smoke - Number of Claims,Earned Premium 2021,Earned Exposure 2021
0,90001,174339.07,0.32,884,407,0,0,0,982193,1291,9320,1,40267,1,5070,1,86803,13,1076066,1345
1,90002,167880.4,0.33,1270,614,0,0,0,1400005,1884,1686,1,20720,1,542,1,27666,5,1523488,1939
2,90003,177789.87,0.31,1195,535,1,0,0,1424103,1731,0,0,128964,2,0,0,49203,6,1537173,1769
3,90004,635509.87,0.45,962,643,56,0,0,3992219,1661,0,0,0,0,0,0,5186,1,4428387,1675
4,90005,852256.91,0.44,224,127,16,0,0,1263229,368,0,0,0,0,0,0,0,0,1377640,379


In [148]:
df.columns

Index(['ZIP Code', 'Cov C Amount Weighted Avg', 'Avg Fire Risk Score',
       'Number of Negligible Fire Risk Exposure',
       'Number of Low Fire Risk Exposure',
       'Number of Moderate Fire Risk Exposure',
       'Number of High Fire Risk Exposure',
       'Number of Very High Fire Risk Exposure', 'Earned Premium 2020',
       'Earned Exposure 2020', 'Non-CAT Cov A Smoke - Incurred Losses',
       'Non-CAT Cov A Smoke - Number of Claims',
       'Non-CAT Cov C Fire - Incurred Losses',
       'Non-CAT Cov C Fire - Number of Claims',
       'Non-CAT Cov C Smoke - Incurred Losses',
       'Non-CAT Cov C Smoke - Number of Claims',
       'CAT Cov A Smoke - Incurred Losses',
       'CAT Cov A Smoke - Number of Claims', 'Earned Premium 2021',
       'Earned Exposure 2021'],
      dtype='object')

In [149]:
df["Total CAT Losses"] =  df["CAT Cov A Smoke - Incurred Losses"]
df["Total Non-CAT Losses"] = df["Non-CAT Cov A Smoke - Incurred Losses"] + df["Non-CAT Cov C Fire - Incurred Losses"]+df["Non-CAT Cov C Smoke - Incurred Losses"]
df.rename(columns = { "CAT Cov A Smoke - Number of Claims":"Total CAT Claims"}, inplace= True)
df["Total Non-CAT Claims"] = df["Non-CAT Cov A Smoke - Number of Claims"]+df["Non-CAT Cov C Fire - Number of Claims"]+df["Non-CAT Cov C Smoke - Number of Claims"]
df['Avg Premium'] = df['Earned Premium 2020'] / df['Earned Exposure 2020'].replace(0, np.nan)
df['Avg CAT Loss'] = df['Total CAT Losses'] / df['Earned Exposure 2020'].replace(0, np.nan)
df['Avg Non-CAT Loss'] = df['Total Non-CAT Losses'] / df['Earned Exposure 2020'].replace(0, np.nan)
df['Avg CAT Claims'] = df['Total Non-CAT Claims'] / df['Earned Exposure 2020'].replace(0, np.nan)
df['Avg Non-CAT Claims'] = df['Total CAT Claims'] / df['Earned Exposure 2020'].replace(0, np.nan)
df['Avg Premium 2021'] = df['Earned Premium 2021'] / df['Earned Exposure 2021'].replace(0, np.nan)
df['Loss Ratio'] = (df['Total CAT Losses'] + df['Total Non-CAT Losses'])/df['Earned Premium 2020'].replace(0, np.nan)
df['Claim Frequency'] = (df['Total Non-CAT Claims']+df['Total CAT Claims'])/df['Earned Exposure 2020'].replace(0, np.nan)
df['Average Claim Severity'] = (df['Total CAT Losses'] + df['Total Non-CAT Losses'])/(df['Total Non-CAT Claims']+df['Total CAT Claims']).replace(0, np.nan)
# Replace inf with NaN, then drop all NaN values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [150]:
df.shape


(1106, 32)

In [151]:
# check for their correlation and predictive power
# drop if any needed
# EDA for the new features
# Summary Section Explaining all

In [152]:
import cleaning
import importlib
importlib.reload(cleaning)
from cleaning import highly_correlated_features

features_to_drop = highly_correlated_features(df, "Avg Premium 2021")
features_to_drop

Earned Premium 2021 Earned Premium 2020 0.4179693431497019 0.4150874277778271
feature_to_drop: Earned Premium 2020, predictive score Earned Premium 2020: 0.0, predictive score Earned Premium 2021: 0.0
Earned Exposure 2021 Earned Exposure 2020 -0.09794124758561937 -0.0997385613890001
feature_to_drop: Earned Exposure 2020, predictive score Earned Exposure 2020: 0.0, predictive score Earned Exposure 2021: 0.0
Total CAT Losses CAT Cov A Smoke - Incurred Losses -0.0023631761243800287 -0.0023631761243800287
feature_to_drop: Total CAT Losses, predictive score CAT Cov A Smoke - Incurred Losses: 0.0, predictive score Total CAT Losses: 0.0
Total Non-CAT Losses Non-CAT Cov C Fire - Incurred Losses -0.00783632955344162 0.004170494959775798
feature_to_drop: Total Non-CAT Losses, predictive score Non-CAT Cov C Fire - Incurred Losses: 0.0, predictive score Total Non-CAT Losses: 0.0
Total Non-CAT Claims Non-CAT Cov A Smoke - Number of Claims -0.0866460693888748 -0.0768119932098768
feature_to_drop: Tot

['Earned Premium 2020',
 'Earned Exposure 2020',
 'Total CAT Losses',
 'Total Non-CAT Losses',
 'Total Non-CAT Claims',
 'Total Non-CAT Claims',
 'Loss Ratio']

In [153]:
df.drop(columns=features_to_drop, inplace=True)
df.shape

(1106, 26)

Observation:

We have a group of newly engineered features that has hight correlation with each other.

Impacts:

Highly correlated features will be dropped. The features that are dropped are Earned Premium 2020',
 'Earned Exposure 2020',
 'Total CAT Losses',
 'Total Non-CAT Losses',
 'Total Non-CAT Claims',
 'Total Non-CAT Claims',
 'Loss Ratio'. Now, the total features reduced from __32 to 26__ (6)

## Exploratory Data Analysis for the newly engineered features

In [154]:
df.columns

Index(['ZIP Code', 'Cov C Amount Weighted Avg', 'Avg Fire Risk Score',
       'Number of Negligible Fire Risk Exposure',
       'Number of Low Fire Risk Exposure',
       'Number of Moderate Fire Risk Exposure',
       'Number of High Fire Risk Exposure',
       'Number of Very High Fire Risk Exposure',
       'Non-CAT Cov A Smoke - Incurred Losses',
       'Non-CAT Cov A Smoke - Number of Claims',
       'Non-CAT Cov C Fire - Incurred Losses',
       'Non-CAT Cov C Fire - Number of Claims',
       'Non-CAT Cov C Smoke - Incurred Losses',
       'Non-CAT Cov C Smoke - Number of Claims',
       'CAT Cov A Smoke - Incurred Losses', 'Total CAT Claims',
       'Earned Premium 2021', 'Earned Exposure 2021', 'Avg Premium',
       'Avg CAT Loss', 'Avg Non-CAT Loss', 'Avg CAT Claims',
       'Avg Non-CAT Claims', 'Avg Premium 2021', 'Claim Frequency',
       'Average Claim Severity'],
      dtype='object')

In [155]:
import ppscore as pps
import plotly.express as px
import numpy as np
import importlib
import feature_engineering
importlib.reload(feature_engineering)
from feature_engineering import feature_stats_histogram


target = "Avg Premium 2021"
column =target
transformed_feature = []
for column in df.columns:
    stats_dict = feature_stats_histogram(df,column, target)
    df[f"transformed_feature_{column}"] = np.sqrt(df[column])
    stats_dict_sqrt = feature_stats_histogram(df,f"transformed_feature_{column}", target)
    if(stats_dict["Correlation"]>stats_dict_sqrt["Correlation"]):
        print("No Transformation Required")
    else:
        print(f"Transformation Required for feature:{column}")
        transformed_feature.append(column)

transformed_feature

column : ZIP Code
Predictive Power Score: 0.0000
Correlation with Target: -0.08165867901721742
Skewness of the feature: -0.20333050915253742


column : transformed_feature_ZIP Code
Predictive Power Score: 0.0000
Correlation with Target: -0.08214681631859194
Skewness of the feature: -0.21603519595050916
No Transformation Required


column : Cov C Amount Weighted Avg
Predictive Power Score: 0.2031
Correlation with Target: 0.8563992674065493
Skewness of the feature: 2.9631433640416596


column : transformed_feature_Cov C Amount Weighted Avg
Predictive Power Score: 0.2031
Correlation with Target: 0.8079971139167199
Skewness of the feature: 1.7317792484451253
No Transformation Required


column : Avg Fire Risk Score
Predictive Power Score: 0.0819
Correlation with Target: 0.4194330755898658
Skewness of the feature: 1.6257940235122337


column : transformed_feature_Avg Fire Risk Score
Predictive Power Score: 0.0666
Correlation with Target: 0.4587724251823534
Skewness of the feature: 1.1260779198950737
Transformation Required for feature:Avg Fire Risk Score


column : Number of Negligible Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: -0.2629868945556229
Skewness of the feature: 1.202544062095347


column : transformed_feature_Number of Negligible Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: -0.2957497972716296
Skewness of the feature: -0.028967619943002586
No Transformation Required


column : Number of Low Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: -0.05142704416795091
Skewness of the feature: 1.423168676274592


column : transformed_feature_Number of Low Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: -0.06380332981767481
Skewness of the feature: -0.0921975499484241
No Transformation Required


column : Number of Moderate Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: 0.30371328797554836
Skewness of the feature: 3.2767087086887363


column : transformed_feature_Number of Moderate Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: 0.3894136149648377
Skewness of the feature: 1.0571881796926592
Transformation Required for feature:Number of Moderate Fire Risk Exposure


column : Number of High Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: 0.2918454481848835
Skewness of the feature: 5.123746207922466


column : transformed_feature_Number of High Fire Risk Exposure
Predictive Power Score: 0.0000
Correlation with Target: 0.4137711869721171
Skewness of the feature: 1.8791545663927702
Transformation Required for feature:Number of High Fire Risk Exposure


column : Number of Very High Fire Risk Exposure
Predictive Power Score: 0.0009
Correlation with Target: 0.2850731509232502
Skewness of the feature: 7.056063765434458


column : transformed_feature_Number of Very High Fire Risk Exposure
Predictive Power Score: 0.0126
Correlation with Target: 0.396118572759838
Skewness of the feature: 2.453007618026422
Transformation Required for feature:Number of Very High Fire Risk Exposure


column : Non-CAT Cov A Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.050968384886013886
Skewness of the feature: 6.275332283948447


column : transformed_feature_Non-CAT Cov A Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.09685498452178658
Skewness of the feature: 2.606449771243891
No Transformation Required


column : Non-CAT Cov A Smoke - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.0768119932098768
Skewness of the feature: 7.958896393321983


column : transformed_feature_Non-CAT Cov A Smoke - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.12894526691026684
Skewness of the feature: 2.747559521057403
No Transformation Required


column : Non-CAT Cov C Fire - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: 0.004170494959775798
Skewness of the feature: 9.482165425870583


column : transformed_feature_Non-CAT Cov C Fire - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.021034117878732113
Skewness of the feature: 1.638292884248954
No Transformation Required


column : Non-CAT Cov C Fire - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.09374000495547573
Skewness of the feature: 3.733171261427767


column : transformed_feature_Non-CAT Cov C Fire - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.10428838821978523
Skewness of the feature: 0.6486930173977564
No Transformation Required


column : Non-CAT Cov C Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.013224105845388622
Skewness of the feature: 8.922316494096506


column : transformed_feature_Non-CAT Cov C Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.036896397765108016
Skewness of the feature: 3.459145946406484
No Transformation Required


column : Non-CAT Cov C Smoke - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.045695120714572796
Skewness of the feature: 8.124396243089945


column : transformed_feature_Non-CAT Cov C Smoke - Number of Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.06912301453118413
Skewness of the feature: 2.6534405686181346
No Transformation Required


column : CAT Cov A Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.0023631761243800287
Skewness of the feature: 13.89354345761267


column : transformed_feature_CAT Cov A Smoke - Incurred Losses
Predictive Power Score: 0.0000
Correlation with Target: -0.049639930667641816
Skewness of the feature: 4.455501150747807
No Transformation Required


column : Total CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.05088715799387567
Skewness of the feature: 8.674048089562708


column : transformed_feature_Total CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.11110144093726051
Skewness of the feature: 3.3893722263755643
No Transformation Required


column : Earned Premium 2021
Predictive Power Score: 0.0000
Correlation with Target: 0.4179693431497019
Skewness of the feature: 1.694308178883162


column : transformed_feature_Earned Premium 2021
Predictive Power Score: 0.0000
Correlation with Target: 0.30700400459186655
Skewness of the feature: 0.2243268775994817
No Transformation Required


column : Earned Exposure 2021
Predictive Power Score: 0.0000
Correlation with Target: -0.09794124758561937
Skewness of the feature: 0.8704759132151885


column : transformed_feature_Earned Exposure 2021
Predictive Power Score: 0.0000
Correlation with Target: -0.10220478763789474
Skewness of the feature: -0.1280727020617222
No Transformation Required


column : Avg Premium
Predictive Power Score: 0.8691
Correlation with Target: 0.9969536711626394
Skewness of the feature: 5.9496562349178985


column : transformed_feature_Avg Premium
Predictive Power Score: 0.8706
Correlation with Target: 0.9746239891041397
Skewness of the feature: 2.9880911252079505
No Transformation Required


column : Avg CAT Loss
Predictive Power Score: 0.0000
Correlation with Target: 0.09254176513326465
Skewness of the feature: 23.278956628875875


column : transformed_feature_Avg CAT Loss
Predictive Power Score: 0.0000
Correlation with Target: 0.05876508939921497
Skewness of the feature: 8.203115957734942
No Transformation Required


column : Avg Non-CAT Loss
Predictive Power Score: 0.0000
Correlation with Target: 0.033444028741435865
Skewness of the feature: 17.23061359671483


column : transformed_feature_Avg Non-CAT Loss
Predictive Power Score: 0.0000
Correlation with Target: 0.0359302674837235
Skewness of the feature: 8.137562870361005
Transformation Required for feature:Avg Non-CAT Loss


column : Avg CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: 0.01114017351629082
Skewness of the feature: 11.188650567928603


column : transformed_feature_Avg CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.02206372396719546
Skewness of the feature: 4.785952001895927
No Transformation Required


column : Avg Non-CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: 0.04347695746849961
Skewness of the feature: 7.947006098540332


column : transformed_feature_Avg Non-CAT Claims
Predictive Power Score: 0.0000
Correlation with Target: -0.034336571453393507
Skewness of the feature: 2.931296812559619
No Transformation Required


column : Avg Premium 2021
Predictive Power Score: 1.0000
Correlation with Target: 1.0
Skewness of the feature: 5.554956752513249


column : transformed_feature_Avg Premium 2021
Predictive Power Score: 0.9735
Correlation with Target: 0.9719501049701214
Skewness of the feature: 2.7593711957079408
No Transformation Required


column : Claim Frequency
Predictive Power Score: 0.0000
Correlation with Target: 0.03175874229734121
Skewness of the feature: 7.534408516786382


column : transformed_feature_Claim Frequency
Predictive Power Score: 0.0000
Correlation with Target: -0.011125957110369638
Skewness of the feature: 3.360993203537572
No Transformation Required


column : Average Claim Severity
Predictive Power Score: 0.0000
Correlation with Target: 0.1388632040008827
Skewness of the feature: 3.8212831125428677


column : transformed_feature_Average Claim Severity
Predictive Power Score: 0.0000
Correlation with Target: 0.1450797281357173
Skewness of the feature: 1.3982856122889324
Transformation Required for feature:Average Claim Severity


['Avg Fire Risk Score',
 'Number of Moderate Fire Risk Exposure',
 'Number of High Fire Risk Exposure',
 'Number of Very High Fire Risk Exposure',
 'Avg Non-CAT Loss',
 'Average Claim Severity']

In [156]:
for column in transformed_feature:
    df[column] = np.sqrt(df[column])

Observation:
Highly Skewed variables.

Impact:
Log transformation.

<!-- Work on the transformation code, we nned to remove the code that adds transformation column in the dataframe -->