In [None]:
from pathlib import Path
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Path to project root
PROJECT_ROOT = Path.home() / "Desktop" / "Insurance_Project" / "prototype"
# Add root to Python path
sys.path.append(str(PROJECT_ROOT))

In [217]:
from pathlib import Path
import os
import importlib
from src.data import preprocessing as pr
import re
file_path = PROJECT_ROOT/"data"/"raw"/"insurance_claims.csv"
data = pr.load_data(file_path)
# Feature engineering specific to the prototype dataset
data = data.drop('_c39', axis=1)
data.replace('?', np.nan, inplace=True)

for char in ['Y', 'YES']:
    data.replace(char, True, inplace=True)
for char in ['N', 'NO']:
    data.replace(char, False, inplace=True)


  data.replace(char, False, inplace=True)


In [218]:
null_count = int(data.isna().sum().sum()) 
print(f"Null count: {null_count}")

Null count: 972


In [219]:
missing_values = pd.DataFrame(data.isnull().sum().sort_values(ascending=False)) # type: ignore
missing_values

Unnamed: 0,0
property_damage,360
police_report_available,343
collision_type,178
authorities_contacted,91
months_as_customer,0
age,0
policy_number,0
policy_deductable,0
policy_csl,0
policy_state,0


In [220]:
target = 'total_claim_amount'
X_train, X_test, X_val, y_train, y_test, y_val = pr.split_data(data, target)# type: ignore

In [221]:
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], yearfirst=True)
data['month'] = data['policy_bind_date'].dt.month
data['month_sin'] = np.sin(2 * np.pi * data['month']/12)
data['month_cos'] = np.cos(2 * np.pi * data['month']/12)

data = data.drop('month', axis=1)

In [222]:

data['insured_zip'] = data['insured_zip'].astype(object)
date_cols = data.select_dtypes(include="datetime").columns# type: ignore
bool_cols = []
for col in data.columns:
    unique_vals = set(data[col].dropna().unique())
    if unique_vals.issubset({True, False}):
        data[col] = data[col].astype('boolean')  # nullable bool
        bool_cols.append(col)

for col in bool_cols:
    print(f"Bool Column --------- {col}")
print("\n----------------------------------------------------\n")

for col in date_cols:
    print(f"Date Column --------- {col}")
print("\n----------------------------------------------------\n")
num_cols = data.loc[:,~data.columns.isin(date_cols)].select_dtypes(include="number").columns# type: ignore
for col in num_cols:
    print(f"Numerical Column --------- {col}")

print("\n----------------------------------------------------\n")
cat_cols = data.select_dtypes(include="object").columns# type: ignore
cat_cols = [x for x in cat_cols if x not in bool_cols]
for col in cat_cols:
    print(f"Categorical Column --------- {col}")

num_cols.drop("policy_number")

data = data.drop('policy_bind_date', axis=1)
# Cyclical interpretation of datetime feature

Bool Column --------- property_damage
Bool Column --------- police_report_available
Bool Column --------- fraud_reported

----------------------------------------------------

Date Column --------- policy_bind_date

----------------------------------------------------

Numerical Column --------- months_as_customer
Numerical Column --------- age
Numerical Column --------- policy_number
Numerical Column --------- policy_deductable
Numerical Column --------- policy_annual_premium
Numerical Column --------- umbrella_limit
Numerical Column --------- capital-gains
Numerical Column --------- capital-loss
Numerical Column --------- incident_hour_of_the_day
Numerical Column --------- number_of_vehicles_involved
Numerical Column --------- bodily_injuries
Numerical Column --------- witnesses
Numerical Column --------- total_claim_amount
Numerical Column --------- injury_claim
Numerical Column --------- property_claim
Numerical Column --------- vehicle_claim
Numerical Column --------- auto_year
Nu

In [223]:
print(X_train)

     months_as_customer  age  policy_number policy_bind_date policy_state  \
541                 239   41         743092       2013-11-11           OH   
440                 108   31         492224       2005-12-09           IN   
482                 116   30         996253       2001-11-29           IN   
422                   8   21         355085       2012-10-09           IN   
778                 161   38         192524       2004-01-02           IL   
..                  ...  ...            ...              ...          ...   
106                 464   61         632627       1990-10-07           OH   
270                 369   55         577810       2013-04-15           OH   
860                 230   42         175960       2004-11-16           IN   
435                 102   28         810189       1999-08-29           OH   
102                 279   41         389238       2001-06-06           IL   

    policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \

In [224]:
# data = data.drop('policy_number', axis=1)
# data = data.drop('incident_city', axis=1)
# data = data.drop('incident_location', axis=1)
# data = data.drop('insured_zip', axis=1)
# data = data.drop('insured_hobbies', axis=1)
# data = data.drop('injury_claim', axis=1)
# data = data.drop('property_claim', axis=1)
# data = data.drop('vehicle_claim', axis=1)
# data = data.drop('incident_date', axis=1)




In [225]:

# numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
# ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level', 'incident_type','incident_severity',]
# ordinal_categories = [['OH', 'IN', 'IL'],
#                       ['100/300', '250/500', '500/1000'],
#                       ['MALE', 'FEMALE'],
#                       ['High School', 'Masters','JD', 'MD'],
#                       ['Single Vehicle Collision', 'Multi-vehicle Collision'],
#                       ['Minor Damage', 'Major Damage', 'Total Loss']]
# nominal_columns = [x for x in cat_cols if x not in ordinal_columns]


In [226]:
data = data.drop('policy_number', axis=1)
data = data.drop('insured_hobbies', axis=1)
data = data.drop('injury_claim', axis=1)
data = data.drop('property_claim', axis=1)
data = data.drop('vehicle_claim', axis=1)
data = data.drop('incident_date', axis=1)
data = data.drop('collision_type', axis=1)
data = data.drop('number_of_vehicles_involved', axis=1)
data = data.drop('incident_severity', axis=1)
data = data.drop('incident_type', axis=1)
data = data.drop('bodily_injuries', axis=1)
data = data.drop('property_damage', axis=1)
data = data.drop('police_report_available', axis=1)
data = data.drop('authorities_contacted', axis=1)
data = data.drop('incident_hour_of_the_day', axis=1)
data = data.drop('fraud_reported', axis=1)
data = data.drop('capital-gains', axis=1)
data = data.drop('capital-loss', axis=1)



In [227]:
numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level']
ordinal_categories = [['OH', 'IN', 'IL'],
                      ['100/300', '250/500', '500/1000'],
                      ['MALE', 'FEMALE'],
                      ['High School', 'Masters','JD', 'MD'],]
nominal_columns = [x for x in cat_cols if x not in ordinal_columns]


In [228]:
print(numerical_columns)

Index(['months_as_customer', 'age', 'policy_number', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip',
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_year'],
      dtype='object')


In [229]:
importlib.reload(pr)
preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)

In [230]:
importlib.reload(pr)
global_mean = y_train.mean()
X_val_processed = pr.apply_preprocessing_pipeline(X_val,
                                                  preprocessor=preprocessor)

X_test_processed = pr.apply_preprocessing_pipeline(X_test,
                                                  preprocessor=preprocessor)

In [231]:
importlib.reload(pr)
X_train_processed, y_train = pr.remove_null(X_train_processed, y_train)
X_val_processed, y_val = pr.remove_null(X_val_processed, y_val)
X_test_processed, y_test = pr.remove_null(X_test_processed, y_test)

In [232]:
train_data = pd.DataFrame(X_train_processed)
test_data = pd.DataFrame(X_test_processed)
val_data = pd.DataFrame(X_val_processed)
processed_data = pd.concat([
    train_data,
    test_data,
    val_data
], ignore_index=True)   

In [233]:
processed_data.shape

(1000, 38)

In [234]:
print(processed_data.isnull().sum().sum())

0


In [235]:
i = 0
for df in [train_data, test_data, val_data]:
    print(f"Null values in {i}: {df.isnull().sum().sum()}")
    i += 1

Null values in 0: 0
Null values in 1: 0
Null values in 2: 0


In [236]:
y_train_data = pd.Series(y_train)
y_test_data = pd.Series(y_test)
y_val_data = pd.Series(y_val)

In [237]:
y_train_data

541    10790
440    57330
482    59670
422    91260
778    60480
       ...  
106    79800
270    85300
860    58850
435    73400
102    28800
Name: total_claim_amount, Length: 700, dtype: int64

In [238]:
data_folder = PROJECT_ROOT/"data"/"processed"
cluster_folder = PROJECT_ROOT/"data"/"processed"/"indexed"
train_data.to_csv(data_folder/"X_train.csv", index=False)
test_data.to_csv(data_folder/"X_test.csv", index=False)
val_data.to_csv(data_folder/"X_val.csv",index=False)
y_train_data.to_csv(data_folder/"y_train.csv",index=False)
y_test_data.to_csv(data_folder/"y_test.csv",index=False)
y_val_data.to_csv(data_folder/"y_val.csv",index=False)

#Cluster data
train_data.to_csv(cluster_folder/"X_train.csv", index=True)
test_data.to_csv(cluster_folder/"X_test.csv", index=True)

In [239]:
# numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
# ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level', 'incident_type','incident_severity',]
# ordinal_categories = [['OH', 'IN', 'IL'],
#                       ['100/300', '250/500', '500/1000'],
#                       ['MALE', 'FEMALE'],
#                       ['High School', 'Masters','JD', 'MD'],
#                       ['Single Vehicle Collision', 'Multi-vehicle Collision'],
#                       ['Minor Damage', 'Major Damage', 'Total Loss']]
# nominal_columns = [x for x in cat_cols if x not in ordinal_columns]


In [240]:
numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level']
ordinal_categories = [['OH', 'IN', 'IL'],
                      ['100/300', '250/500', '500/1000'],
                      ['MALE', 'FEMALE'],
                      ['High School', 'Masters','JD', 'MD'],]
nominal_columns = [x for x in cat_cols if x not in ordinal_columns]

In [241]:
numerical_columns = [col for col in numerical_columns if col in data.columns]
ordinal_columns = [col for col in ordinal_columns if col in data.columns]
nominal_columns = [col for col in nominal_columns if col in data.columns]
bool_cols = [col for col in bool_cols if col in data.columns]

In [242]:
print(numerical_columns)

['months_as_customer', 'age', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'witnesses', 'auto_year']


In [243]:
from sklearn.model_selection import train_test_split
target = 'total_claim_amount'
X, y = data[[col for col in data.columns if col not in target]], data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)
#X_test_processed = pr.apply_preprocessing_pipeline(X_test, preprocessor)
train_reg, test_reg = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)
y_train_reg, y_test_reg = pd.Series(y_train), pd.Series(y_test)

train_reg.to_csv(data_folder/"reg"/"X_train.csv", index=False)
test_reg.to_csv(data_folder/"reg"/"X_test.csv", index=False)
y_train_reg.to_csv(data_folder/"reg"/"y_train.csv", index=False)
y_test_reg.to_csv(data_folder/"reg"/"y_test.csv", index=False)


In [244]:
(data['total_claim_amount'] >= 20000).value_counts(normalize=True)


total_claim_amount
True     0.82
False    0.18
Name: proportion, dtype: float64

In [245]:
importlib.reload(pr)

target = 'total_claim_amount'

X_train, X_test, y_train, y_test = pr.split_data_for_classification(data, target)

preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)
X_train_processed, y_train = pr.oversampling(X_train_processed, y_train)
X_test_processed = pr.apply_preprocessing_pipeline(X_test, preprocessor)
train_class, test_class = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)
y_train_class, y_test_class = pd.Series(y_train), pd.Series(y_test)

train_class.to_csv(data_folder/"classification"/"X_train.csv", index=False)
test_class.to_csv(data_folder/"classification"/"X_test.csv", index=False)
y_train_class.to_csv(data_folder/"classification"/"y_train.csv", index=False)
y_test_class.to_csv(data_folder/"classification"/"y_test.csv", index=False)


severe
0    0.799
1    0.201
Name: proportion, dtype: float64


In [246]:
y_train_class.value_counts(normalize=True)

severe
0    0.5
1    0.5
Name: proportion, dtype: float64

In [247]:
y_test_class.value_counts(normalize=True)

severe
0    0.8
1    0.2
Name: proportion, dtype: float64

In [248]:
print(X.shape)

(1000, 21)


In [249]:
features = X.columns
features

Index(['months_as_customer', 'age', 'policy_state', 'policy_csl',
       'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'insured_zip', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_relationship', 'incident_state',
       'incident_city', 'incident_location', 'witnesses', 'auto_make',
       'auto_model', 'auto_year', 'month_sin', 'month_cos'],
      dtype='object')

In [250]:
X_train = pd.DataFrame(X_train_processed)

In [251]:
X_test = pd.DataFrame(X_test_processed)

In [252]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.661052,0.686729,1.414868,0.308389,2.316550,1.695667,-1.345233,0.953545,0.060746,0.097496,0.941697,1.760205,-7.941524,-1.014514,-1.976802,0.726371,1.137201,-1.0,1.311045,1.863488
1,-0.633414,-1.102792,-0.211417,-1.107000,-0.454075,-0.282159,1.345233,-0.376210,0.060746,0.097496,0.941697,-0.979414,-7.941524,-0.141605,-0.309494,0.726371,0.133788,-1.0,0.709341,0.411958
2,-1.009509,-0.990947,-0.211417,0.721864,-0.454075,1.605387,-1.345233,-0.376210,0.060746,-1.147138,0.941697,-0.979414,-7.941524,0.876789,0.211540,0.726371,-2.430488,-1.0,0.107638,0.619319
3,0.337436,0.127503,-1.024560,-0.070905,-0.454075,-0.438129,-0.448411,-1.207307,-1.154168,-1.147138,0.941697,-0.294509,-7.941524,0.876789,0.211540,0.288456,0.802730,-1.0,0.909909,-0.210126
4,-1.123212,-1.550173,-0.211417,0.530784,-0.454075,-0.385204,1.345233,0.953545,1.275659,1.342130,0.941697,-0.979414,-7.941524,-0.432575,0.211540,0.288456,0.802730,-1.0,0.909909,-0.210126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1.228168,-1.438328,-1.024560,0.969209,2.316550,1.597574,0.448411,-0.209991,1.275659,1.342130,-1.061913,-0.979414,-7.941524,1.313243,-0.309494,-1.268574,-2.430488,-1.0,-2.098608,-1.246933
396,0.354929,0.127503,1.414868,0.045127,1.854779,1.447440,-0.448411,-0.043771,-1.154168,1.342130,0.941697,0.390396,-7.941524,-0.287090,-0.309494,-1.268574,-2.430488,-1.0,-2.098608,-1.246933
397,-0.318544,-0.319877,-0.211417,-1.221719,-0.454075,-0.720065,-1.345233,0.454887,1.275659,-1.147138,0.941697,0.390396,-7.941524,-0.287090,0.315746,-1.268574,0.802730,-1.0,-0.694633,-1.246933
398,1.177089,1.134109,-0.211417,0.089972,-0.454075,-0.704424,-0.448411,-0.209991,0.060746,1.342130,-1.061913,1.760205,-7.941524,-1.014514,-0.309494,-1.268574,-0.312173,-1.0,-0.092930,-0.002765


In [253]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.387003,1.245954,-1.024560,-0.231305,1.393008,1.680462,-0.448411,-1.705965,1.275659,0.097496,-1.061913,-0.979414,-0.130189,-0.141605,-0.309494,0.726371,0.245279,0.0,-0.694633,0.204597
1,-0.073645,-0.319877,1.414868,0.356974,-0.454075,1.569884,1.345233,0.288668,0.060746,-1.147138,0.941697,-0.979414,-0.130189,1.458728,-0.309494,0.726371,0.133788,0.0,0.107638,-0.832210
2,-1.744205,-1.102792,-1.024560,0.499030,1.854779,-0.936787,0.448411,0.454887,1.275659,0.097496,-1.061913,-0.294509,-0.130189,0.003880,0.315746,0.726371,0.133788,0.0,-0.494065,1.863488
3,0.748516,0.686729,1.414868,-0.139506,-0.454075,-0.471977,-0.448411,0.122448,-1.154168,0.097496,0.941697,0.390396,-0.130189,-1.887423,-0.309494,0.288456,-0.312173,0.0,-1.697472,-1.869017
4,0.005073,0.127503,1.414868,2.031566,-0.454075,-0.655679,-0.448411,-1.373527,-1.154168,0.097496,-1.061913,1.075300,-0.130189,1.458728,0.211540,0.726371,1.137201,0.0,-0.494065,0.204597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,-0.538513,-0.311993,-1.024560,0.392699,-0.454075,-0.445783,-1.329430,-0.880726,0.060746,0.119428,0.941697,1.063231,-0.130189,-1.159999,1.441822,0.718654,-0.314137,0.0,0.107638,-0.806632
954,0.538311,0.189224,1.414868,0.740297,-0.454075,1.685209,-0.347932,-0.043771,-0.881933,-1.147138,0.941697,-0.902678,-0.130189,-1.304269,-1.789999,0.337520,-0.249717,0.0,0.552042,0.295795
955,1.505306,1.563437,-1.024560,0.383527,-0.637793,-0.795113,0.091605,-0.638953,-0.422614,-0.397688,-1.061913,-0.567002,-0.130189,1.195919,-0.309494,-1.283224,1.137201,0.0,0.590721,0.119873
956,-0.876324,-0.692136,1.414868,1.451320,-0.454075,-0.635692,-0.475791,0.137672,-1.117077,0.059498,-1.061913,-0.937594,-0.130189,-0.251557,-1.976802,0.301825,-0.420259,0.0,0.922156,-1.033241


In [254]:
importlib.reload(pr)
processed_data = pr.clean_data(data=data)
pipe, processed_data_clustering = pr.prepreprocessing_pipeline_for_clustering(data, numerical_columns, ordinal_columns, ordinal_categories, nominal_columns, bool_cols)


In [262]:
processed_data_clustering.head(5)

Unnamed: 0,num__months_as_customer,num__age,num__policy_deductable,num__policy_annual_premium,num__umbrella_limit,num__insured_zip,num__witnesses,num__auto_year,ord__policy_state,ord__policy_csl,...,nom__auto_model_Pathfinder,nom__auto_model_RAM,nom__auto_model_RSX,nom__auto_model_Silverado,nom__auto_model_TL,nom__auto_model_Tahoe,nom__auto_model_Ultima,nom__auto_model_Wrangler,nom__auto_model_X5,nom__auto_model_X6
0,1.07814,0.990836,-0.222383,0.616705,-0.479476,-0.489529,0.461838,-0.18344,-1.187173,0.060937,...,-0.178862,-0.211972,-0.110208,-0.149983,-0.142857,-0.156813,-0.153432,-0.209383,-0.153432,-0.127515
1,0.208995,0.334073,1.412784,-0.242521,1.69798,-0.461008,-1.3387,0.315491,0.016856,0.060937,...,-0.178862,-0.211972,-0.110208,-0.149983,-0.142857,-0.156813,-0.153432,-0.209383,-0.153432,-0.127515
2,-0.608002,-1.088913,1.412784,0.642233,1.69798,-0.984885,1.362107,0.315491,-1.187173,-1.182668,...,-0.178862,4.717607,-0.110208,-0.149983,-0.142857,-0.156813,-0.153432,-0.209383,-0.153432,-0.127515
3,0.452355,0.224613,1.412784,0.652886,2.133471,1.491682,0.461838,1.479664,1.220886,0.060937,...,-0.178862,-0.211972,-0.110208,-0.149983,-0.142857,6.377042,-0.153432,-0.209383,-0.153432,-0.127515
4,0.208995,0.552994,-0.222383,1.34198,2.133471,1.527808,-0.438431,0.648112,1.220886,1.304542,...,-0.178862,-0.211972,9.073772,-0.149983,-0.142857,-0.156813,-0.153432,-0.209383,-0.153432,-0.127515


In [263]:
print(processed_data_clustering.isna().sum().sum())

0


In [266]:
processed_data_clustering.to_csv(PROJECT_ROOT/"data"/"clustering"/"input_clustering.csv")

In [256]:
print(processed_data.isna().sum().sum())

0


In [257]:
processed_data.to_csv(PROJECT_ROOT/"dashboard"/"assets"/"dashboard_data.csv")

In [258]:
processed_data

Unnamed: 0,months_as_customer,age,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,...,incident_city,incident_location,witnesses,total_claim_amount,auto_make,auto_model,auto_year,month_sin,month_cos,severe
0,328.0,48.0,OH,250/500,1000.0,1406.91,0.0,466132,MALE,MD,...,Columbus,9935 4th Drive,2.0,71610.0,Saab,92x,2004.0,-8.660254e-01,5.000000e-01,0
1,228.0,42.0,IN,250/500,2000.0,1197.22,5000000.0,468176,MALE,MD,...,Riverwood,6608 MLK Hwy,0.0,5070.0,Mercedes,E400,2007.0,1.224647e-16,-1.000000e+00,0
2,134.0,29.0,OH,100/300,2000.0,1413.14,5000000.0,430632,FEMALE,PhD,...,Columbus,7121 Francis Lane,3.0,34650.0,Dodge,RAM,2007.0,-1.000000e+00,-1.836970e-16,0
3,256.0,41.0,IL,250/500,2000.0,1415.74,6000000.0,608117,FEMALE,PhD,...,Arlington,6956 Maple Drive,2.0,63400.0,Chevrolet,Tahoe,2014.0,5.000000e-01,-8.660254e-01,0
4,228.0,44.0,IL,500/1000,1000.0,1583.91,6000000.0,610706,MALE,Associate,...,Arlington,3041 3rd Ave,1.0,6500.0,Accura,RSX,2009.0,1.224647e-16,-1.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3.0,38.0,OH,500/1000,1000.0,1310.80,0.0,431289,FEMALE,Masters,...,Northbrook,6045 Andromedia St,1.0,87200.0,Honda,Accord,2006.0,-5.000000e-01,-8.660254e-01,1
996,285.0,41.0,IL,100/300,1000.0,1436.79,0.0,608177,FEMALE,PhD,...,Northbend,3092 Texas Drive,3.0,108480.0,Volkswagen,Passat,2015.0,5.000000e-01,8.660254e-01,1
997,130.0,34.0,OH,250/500,500.0,1383.49,3000000.0,442797,FEMALE,Masters,...,Arlington,7629 5th St,3.0,67500.0,Suburu,Impreza,1996.0,8.660254e-01,5.000000e-01,0
998,458.0,62.0,IL,500/1000,2000.0,1356.92,5000000.0,441714,MALE,Associate,...,Arlington,6128 Elm Lane,1.0,46980.0,Audi,A5,1998.0,-5.000000e-01,8.660254e-01,0


In [259]:
importlib.reload(pr)

<module 'src.data.preprocessing' from 'C:\\Users\\phili\\Desktop\\Insurance_Project\\src\\data\\preprocessing.py'>

In [260]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

preprocessor_pickle = pr.unfit_prepreprocessing_pipeline(
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols)
preprocessor_pickle.fit(X)

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['OH', 'IN', ...], ['100/300', '250/500', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function to_...0016535AC09A0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,min_group_size,
,combine_min_nan_groups,True
,min_group_name,
,normalize,False


In [261]:
import joblib
joblib.dump(preprocessor_pickle, PROJECT_ROOT/"models"/"preprocessor.pkl")
joblib.dump(features, PROJECT_ROOT/"models"/"features.pkl")


['C:\\Users\\phili\\Desktop\\Insurance_Project\\models\\features.pkl']