In [2]:
from pathlib import Path
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Root directory of the project checkout; the data and asset paths used in
# this notebook are built from it.
PROJECT_ROOT = Path.home() / "Desktop" / "Insurance_Project"

# Put the project root on the import path so the `src` package can be
# imported. The membership check keeps the cell idempotent: re-running it
# no longer stacks duplicate entries onto sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [3]:
from pathlib import Path
import os
import importlib
from src.data import preprocessing as pr
import re
# Load the raw claims file through the project's loader.
file_path = PROJECT_ROOT / "data" / "raw" / "insurance_claims.csv"
data = pr.load_data(file_path)

# Clean-up specific to the prototype dataset: drop the `_c39` column and
# turn the '?' placeholder into a real missing value.
data = data.drop(columns='_c39')
data = data.replace('?', np.nan)

# Map yes/no tokens onto Python booleans. The replacement is applied to every
# column of the frame, not only to known flag columns.
for char, flag in (('Y', True), ('YES', True), ('N', False), ('NO', False)):
    data = data.replace(char, flag)


  data.replace(char, False, inplace=True)


In [4]:
# Total number of missing cells across the whole frame.
null_count = int(data.isna().to_numpy().sum())
print(f"Null count: {null_count}")

Null count: 972


In [5]:
# Missing-value count per column, largest first, shown as a one-column table.
missing_per_column = data.isnull().sum()
missing_values = missing_per_column.sort_values(ascending=False).to_frame()
missing_values

Unnamed: 0,0
property_damage,360
police_report_available,343
collision_type,178
authorities_contacted,91
months_as_customer,0
age,0
policy_number,0
policy_deductable,0
policy_csl,0
policy_state,0


In [6]:
# Name of the column to predict.
target = 'total_claim_amount'
# Split the frame into train / test / validation features and targets via the
# project helper.
# NOTE(review): this runs before the date features are engineered in the
# following cells, and the X_train printed further down still carries a
# policy_bind_date column - confirm the splits are meant to come from the
# un-engineered frame.
X_train, X_test, X_val, y_train, y_test, y_val = pr.split_data(data, target)# type: ignore

In [7]:
# Parse the policy bind date, then encode its month as a (sin, cos) pair on a
# 12-step cycle. No temporary `month` column is kept on the frame.
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], yearfirst=True)
bind_month = data['policy_bind_date'].dt.month
data['month_sin'] = np.sin(2 * np.pi * bind_month/12)
data['month_cos'] = np.cos(2 * np.pi * bind_month/12)

In [8]:

# Sort the columns of `data` into four groups - boolean, datetime, numerical,
# categorical - and print each group for inspection.

# Cast ZIP codes to object so they are picked up with the categorical columns
# rather than the numerical ones.
data['insured_zip'] = data['insured_zip'].astype(object)
date_cols = data.select_dtypes(include="datetime").columns# type: ignore

# A column counts as boolean only when it has at least one non-null value and
# all of its non-null values are genuine bools. A plain subset test against
# {True, False} is not enough: 0 and 1 compare (and hash) equal to False and
# True, so a 0/1 integer column would match, and an all-null column would
# match through the empty set.
bool_cols = []
for col in data.columns:
    unique_vals = set(data[col].dropna().unique())
    if unique_vals and all(isinstance(val, (bool, np.bool_)) for val in unique_vals):
        data[col] = data[col].astype('boolean')  # nullable bool
        bool_cols.append(col)

separator = "\n----------------------------------------------------\n"

for col in bool_cols:
    print(f"Bool Column --------- {col}")
print(separator)

for col in date_cols:
    print(f"Date Column --------- {col}")
print(separator)

# Numerical columns are looked up on the frame with the date columns masked out.
num_cols = data.loc[:,~data.columns.isin(date_cols)].select_dtypes(include="number").columns# type: ignore
for col in num_cols:
    print(f"Numerical Column --------- {col}")
print(separator)

# Whatever is still object-typed (and was not converted to boolean above) is
# treated as categorical.
cat_cols = data.select_dtypes(include="object").columns# type: ignore
cat_cols = [x for x in cat_cols if x not in bool_cols]
for col in cat_cols:
    print(f"Categorical Column --------- {col}")

# Remove policy_number from the numerical group. Index.drop returns a new
# Index instead of modifying in place, so the result must be assigned back
# for the removal to take effect.
num_cols = num_cols.drop("policy_number")

# Drop the raw bind date; month_sin / month_cos hold its cyclical encoding.
data = data.drop('policy_bind_date', axis=1)

Bool Column --------- property_damage
Bool Column --------- police_report_available
Bool Column --------- fraud_reported

----------------------------------------------------

Date Column --------- policy_bind_date

----------------------------------------------------

Numerical Column --------- months_as_customer
Numerical Column --------- age
Numerical Column --------- policy_number
Numerical Column --------- policy_deductable
Numerical Column --------- policy_annual_premium
Numerical Column --------- umbrella_limit
Numerical Column --------- capital-gains
Numerical Column --------- capital-loss
Numerical Column --------- incident_hour_of_the_day
Numerical Column --------- number_of_vehicles_involved
Numerical Column --------- bodily_injuries
Numerical Column --------- witnesses
Numerical Column --------- total_claim_amount
Numerical Column --------- injury_claim
Numerical Column --------- property_claim
Numerical Column --------- vehicle_claim
Numerical Column --------- auto_year
Nu

In [9]:
# Plain-text dump of the training features returned by the split.
print(X_train)

     months_as_customer  age  policy_number policy_bind_date policy_state  \
541                 239   41         743092       2013-11-11           OH   
440                 108   31         492224       2005-12-09           IN   
482                 116   30         996253       2001-11-29           IN   
422                   8   21         355085       2012-10-09           IN   
778                 161   38         192524       2004-01-02           IL   
..                  ...  ...            ...              ...          ...   
106                 464   61         632627       1990-10-07           OH   
270                 369   55         577810       2013-04-15           OH   
860                 230   42         175960       2004-11-16           IN   
435                 102   28         810189       1999-08-29           OH   
102                 279   41         389238       2001-06-06           IL   

    policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \

In [10]:
# Column groups handed to the preprocessing pipeline.
# NOTE(review): the numerical group is read from X_train while cat_cols was
# derived from `data`; policy_number is not removed here - confirm intended.
numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore

# Ordinal features mapped to their listed categories.
# NOTE(review): presumably the list order is the encoding order and every
# level present in the data must appear here - verify against `pr`.
ordinal_spec = {
    'insured_sex': ['MALE', 'FEMALE'],
    'insured_education_level': ['High School', 'Masters','JD', 'MD'],
    'incident_type': ['Single Vehicle Collision', 'Multi-vehicle Collision'],
    'incident_severity': ['Minor Damage', 'Major Damage', 'Total Loss'],
}
ordinal_columns = list(ordinal_spec)
ordinal_categories = list(ordinal_spec.values())

# Every categorical column that is not ordinal is treated as nominal.
nominal_columns = [name for name in cat_cols if name not in ordinal_columns]


In [11]:
# List the columns that will be handled as nominal categoricals.
print(nominal_columns)

['policy_state', 'policy_csl', 'insured_zip', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_date', 'collision_type', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'auto_make', 'auto_model']


In [12]:
# Re-import the preprocessing module so that edits made to it are picked up.
importlib.reload(pr)

# Build the preprocessor from the training split; the returned object is
# reused for the other splits in the next cell.
pipeline_kwargs = dict(
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols,
)
preprocessor, X_train_processed = pr.prepreprocessing_pipeline(X=X_train, **pipeline_kwargs)

In [13]:
# Re-import the preprocessing module so that edits made to it are picked up.
importlib.reload(pr)

# Mean of the training target.
global_mean = y_train.mean()

# Run the validation and test features through the preprocessor that was
# built from the training split.
X_val_processed, X_test_processed = (
    pr.apply_preprocessing_pipeline(features, preprocessor=preprocessor)
    for features in (X_val, X_test)
)

In [14]:
# Re-import the preprocessing module so that edits made to it are picked up.
importlib.reload(pr)

# Pass each (features, target) pair through the project's null-removal helper
# and rebind both names to what it returns.
(X_train_processed, y_train), (X_val_processed, y_val), (X_test_processed, y_test) = (
    pr.remove_null(features, labels)
    for features, labels in (
        (X_train_processed, y_train),
        (X_val_processed, y_val),
        (X_test_processed, y_test),
    )
)

In [15]:
# Wrap each processed split in a DataFrame, then stack them in train, test,
# validation order under a fresh 0..n-1 index.
train_data, test_data, val_data = (
    pd.DataFrame(split)
    for split in (X_train_processed, X_test_processed, X_val_processed)
)
processed_data = pd.concat([train_data, test_data, val_data], ignore_index=True)

In [16]:
# (rows, columns) of the stacked splits.
processed_data.shape

(1000, 38)

In [17]:
# Total count of missing cells in the stacked splits.
print(processed_data.isnull().sum().sum())

0


In [18]:
# Missing-cell count per split; positions 0, 1, 2 are train, test, validation.
for i, df in enumerate([train_data, test_data, val_data]):
    print(f"Null values in {i}: {df.isnull().sum().sum()}")

Null values in 0: 0
Null values in 1: 0
Null values in 2: 0


In [19]:
# Wrap each target split in a Series.
y_train_data, y_test_data, y_val_data = map(pd.Series, (y_train, y_test, y_val))

In [20]:
# Display the training target series.
y_train_data

541    10790
440    57330
482    59670
422    91260
778    60480
       ...  
106    79800
270    85300
860    58850
435    73400
102    28800
Name: total_claim_amount, Length: 700, dtype: int64

In [21]:
# Write the processed feature frames and target series to data/processed as
# CSV files, without the index column.
data_folder = PROJECT_ROOT/"data"/"processed"

# to_csv does not create missing directories, so make sure the folder exists
# before writing.
data_folder.mkdir(parents=True, exist_ok=True)

processed_outputs = {
    "X_train.csv": train_data,
    "X_test.csv": test_data,
    "X_val.csv": val_data,
    "y_train.csv": y_train_data,
    "y_test.csv": y_test_data,
    "y_val.csv": y_val_data,
}
for file_name, split in processed_outputs.items():
    split.to_csv(data_folder/file_name, index=False)

In [28]:
from sklearn.model_selection import train_test_split
# Second, two-way (90/10) split written to data/processed/reg.
# Note that this rebinds X_train, X_test, y_train, y_test, preprocessor and
# X_train_processed, replacing the objects from the earlier three-way split.
target = 'total_claim_amount'

# Fixed seed so the split, and therefore the CSV files written below, is the
# same on every run.
REG_SPLIT_SEED = 42

# Separate features from the target by exact column name. (`col not in target`
# against a string is a substring test, which would also discard any column
# whose name happens to occur inside 'total_claim_amount'.)
X, y = data.drop(columns=[target]), data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=REG_SPLIT_SEED
)

# Build the preprocessor from the training part and reuse it for the test part.
# NOTE(review): the column groups passed here were derived in earlier cells,
# partly from the previous X_train - confirm they still match this frame.
preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)
X_test_processed = pr.apply_preprocessing_pipeline(X_test, preprocessor)
train_reg, test_reg = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)
y_train_reg, y_test_reg = pd.Series(y_train), pd.Series(y_test)

# to_csv does not create missing directories, so make sure the folder exists
# before writing.
reg_folder = data_folder/"reg"
reg_folder.mkdir(parents=True, exist_ok=True)

train_reg.to_csv(reg_folder/"X_train.csv", index=False)
test_reg.to_csv(reg_folder/"X_test.csv", index=False)
y_train_reg.to_csv(reg_folder/"y_train.csv", index=False)
y_test_reg.to_csv(reg_folder/"y_test.csv", index=False)


In [31]:
# Rebind X_train to the processed training features, wrapped in a DataFrame.
X_train = pd.DataFrame(X_train_processed)

In [33]:
# Rebind X_test to the processed test features, wrapped in a DataFrame.
X_test = pd.DataFrame(X_test_processed)

In [34]:
# Display the processed test features.
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,-0.443599,-0.546246,1.352950,-1.043294,-1.068675,-0.477676,-0.721443,0.750179,-1.248779,-1.696979,...,50.0,167.0,12.0,249.0,178.0,201.0,142.0,0.0,67.0,25.0
1,-1.117755,-1.306764,1.108465,-0.223238,-0.197397,-0.477676,1.421328,1.193963,-2.278065,1.639596,...,41.0,150.0,14.0,231.0,351.0,215.0,105.0,0.0,60.0,29.0
2,-0.581887,-0.872182,-0.241858,1.416875,0.370154,-0.477676,-0.834652,1.006347,-0.502816,-0.826568,...,50.0,152.0,8.0,249.0,174.0,104.0,136.0,0.0,64.0,35.0
3,-0.305310,-0.220309,-1.406183,-0.223238,-1.296223,-0.477676,-0.644206,1.652179,-0.786138,-1.116705,...,49.0,167.0,24.0,249.0,351.0,215.0,105.0,0.0,66.0,29.0
4,0.126841,0.105628,0.936796,-1.043294,-0.477050,-0.477676,-0.672785,1.583627,0.953247,-0.826568,...,48.0,150.0,7.0,420.0,351.0,215.0,121.0,0.0,61.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.189933,1.083437,-0.157110,-1.043294,1.946996,-0.477676,-0.394151,1.381579,0.458329,-1.261773,...,44.0,150.0,13.0,420.0,197.0,201.0,127.0,0.0,73.0,29.0
96,-0.097877,0.214273,-1.278436,-1.043294,-1.663762,-0.477676,-0.871339,-0.895069,0.953247,-1.551910,...,46.0,150.0,13.0,249.0,178.0,215.0,142.0,0.0,73.0,18.0
97,0.299702,-0.003018,-0.155693,-1.043294,0.602649,-0.477676,-0.818788,-0.895069,0.953247,-1.696979,...,58.0,152.0,15.0,420.0,174.0,201.0,133.0,0.0,63.0,14.0
98,0.697281,0.214273,-1.399137,-0.223238,0.767374,-0.477676,1.503775,1.663003,0.953247,1.639596,...,36.0,141.0,21.0,420.0,197.0,215.0,133.0,0.0,60.0,29.0


In [32]:
# Display the processed training features.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,-0.936251,-1.415410,0.434765,1.416875,1.065207,-0.477676,1.553427,-0.534269,0.953247,-0.391362,...,52.0,152.0,15.0,231.0,174.0,215.0,121.0,1.0,71.0,27.0
1,2.175238,2.278538,0.037976,-0.223238,-1.997005,-0.477676,1.560946,-0.895069,0.953247,-0.826568,...,49.0,152.0,16.0,420.0,351.0,201.0,136.0,1.0,58.0,24.0
2,-0.538672,-0.980828,-0.214724,-1.043294,0.668317,-0.477676,1.434925,1.500643,-1.331265,1.494527,...,31.0,141.0,23.0,249.0,178.0,201.0,121.0,1.0,60.0,31.0
3,-0.607816,-0.980828,1.155860,1.416875,0.794292,-0.477676,1.575327,-0.895069,0.953247,1.059322,...,35.0,128.0,23.0,420.0,197.0,215.0,133.0,1.0,73.0,18.0
4,-1.160970,-1.306764,0.655822,-0.223238,-0.996948,1.262845,-0.477367,0.764611,0.953247,1.059322,...,49.0,162.0,20.0,420.0,351.0,201.0,136.0,1.0,61.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.420704,0.214273,-1.109566,1.416875,0.256051,-0.477676,-0.410688,2.038236,0.953247,-0.246294,...,35.0,167.0,17.0,420.0,178.0,215.0,142.0,1.0,61.0,23.0
896,-1.031324,-0.980828,-1.384911,-0.223238,-1.872678,-0.477676,1.587635,1.717124,-1.808251,-0.391362,...,49.0,162.0,14.0,420.0,197.0,215.0,133.0,1.0,68.0,26.0
897,-1.238757,-1.524055,0.822679,1.416875,-1.082402,-0.477676,-0.562950,-0.895069,0.953247,-1.551910,...,35.0,150.0,16.0,420.0,174.0,201.0,133.0,1.0,70.0,39.0
898,-0.590530,-1.089473,1.197881,-1.043294,-0.195377,-0.477676,1.399107,1.359931,-1.446029,-0.391362,...,49.0,167.0,16.0,420.0,351.0,239.0,136.0,1.0,52.0,13.0


In [75]:
# Re-import the preprocessing module so that edits made to it are picked up.
importlib.reload(pr)
# Run the project's clean_data helper on the full frame.
# NOTE: this rebinds processed_data, which earlier in the notebook held the
# concatenated processed splits.
processed_data = pr.clean_data(data=data)

In [76]:
# Total count of missing cells left after clean_data.
print(processed_data.isna().sum().sum())

0


In [78]:
# Export the cleaned frame to the dashboard assets folder.
# NOTE(review): unlike the other CSV exports in this notebook, index=False is
# not passed, so the row index is written as the first column - confirm the
# dashboard expects that.
processed_data.to_csv(PROJECT_ROOT/"dashboard"/"assets"/"dashboard_data.csv")

In [79]:
# Display the cleaned frame that was exported for the dashboard.
processed_data

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,month_sin,month_cos
0,328.0,48.0,521585.0,OH,250/500,1000.0,1406.91,0.0,466132,MALE,...,71610.0,6510.0,13020.0,52080.0,Saab,92x,2004.0,1,-8.660254e-01,5.000000e-01
1,228.0,42.0,342868.0,IN,250/500,2000.0,1197.22,5000000.0,468176,MALE,...,5070.0,780.0,780.0,3510.0,Mercedes,E400,2007.0,1,1.224647e-16,-1.000000e+00
2,134.0,29.0,687698.0,OH,100/300,2000.0,1413.14,5000000.0,430632,FEMALE,...,34650.0,7700.0,3850.0,23100.0,Dodge,RAM,2007.0,0,-1.000000e+00,-1.836970e-16
3,256.0,41.0,227811.0,IL,250/500,2000.0,1415.74,6000000.0,608117,FEMALE,...,63400.0,6340.0,6340.0,50720.0,Chevrolet,Tahoe,2014.0,1,5.000000e-01,-8.660254e-01
4,228.0,44.0,367455.0,IL,500/1000,1000.0,1583.91,6000000.0,610706,MALE,...,6500.0,1300.0,650.0,4550.0,Accura,RSX,2009.0,0,1.224647e-16,-1.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3.0,38.0,941851.0,OH,500/1000,1000.0,1310.80,0.0,431289,FEMALE,...,87200.0,17440.0,8720.0,61040.0,Honda,Accord,2006.0,0,-5.000000e-01,-8.660254e-01
996,285.0,41.0,186934.0,IL,100/300,1000.0,1436.79,0.0,608177,FEMALE,...,108480.0,18080.0,18080.0,72320.0,Volkswagen,Passat,2015.0,0,5.000000e-01,8.660254e-01
997,130.0,34.0,918516.0,OH,250/500,500.0,1383.49,3000000.0,442797,FEMALE,...,67500.0,7500.0,7500.0,52500.0,Suburu,Impreza,1996.0,0,8.660254e-01,5.000000e-01
998,458.0,62.0,533940.0,IL,500/1000,2000.0,1356.92,5000000.0,441714,MALE,...,46980.0,5220.0,5220.0,36540.0,Audi,A5,1998.0,0,-5.000000e-01,8.660254e-01
