In [1]:
from pathlib import Path
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Path to project root
PROJECT_ROOT = Path.home() / "Desktop" / "Insurance_Project" / "prototype"
# Add root to Python path
sys.path.append(str(PROJECT_ROOT))

In [2]:
from pathlib import Path
import os
import importlib
from src.data import preprocessing as pr
import re
file_path = PROJECT_ROOT/"data"/"raw"/"sample_insurance_claims.csv"
data = pr.load_data(file_path)
# Feature engineering specific to the prototype dataset
data = data.drop('_c39', axis=1)
data.replace('?', np.nan, inplace=True)

for char in ['Y', 'YES']:
    data.replace(char, True, inplace=True)
for char in ['N', 'NO']:
    data.replace(char, False, inplace=True)


  data.replace(char, False, inplace=True)


In [3]:
null_count = int(data.isna().sum().sum()) 
print(f"Null count: {null_count}")

Null count: 283


In [4]:
missing_values = pd.DataFrame(data.isnull().sum().sort_values(ascending=False)) # type: ignore
missing_values

Unnamed: 0,0
police_report_available,79
property_damage,76
authorities_contacted,75
collision_type,53
months_as_customer,0
age,0
policy_number,0
policy_deductable,0
policy_csl,0
policy_state,0


In [5]:
target = 'total_claim_amount'
X_train, X_test, X_val, y_train, y_test, y_val = pr.split_data(data, target)# type: ignore

In [6]:
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], yearfirst=True)
data['month'] = data['policy_bind_date'].dt.month
data['month_sin'] = np.sin(2 * np.pi * data['month']/12)
data['month_cos'] = np.cos(2 * np.pi * data['month']/12)

data = data.drop('month', axis=1)

In [7]:

data['insured_zip'] = data['insured_zip'].astype(object)
date_cols = data.select_dtypes(include="datetime").columns# type: ignore
bool_cols = []
for col in data.columns:
    unique_vals = set(data[col].dropna().unique())
    if unique_vals.issubset({True, False}):
        data[col] = data[col].astype('boolean')  # nullable bool
        bool_cols.append(col)

for col in bool_cols:
    print(f"Bool Column --------- {col}")
print("\n----------------------------------------------------\n")

for col in date_cols:
    print(f"Date Column --------- {col}")
print("\n----------------------------------------------------\n")
num_cols = data.loc[:,~data.columns.isin(date_cols)].select_dtypes(include="number").columns# type: ignore
for col in num_cols:
    print(f"Numerical Column --------- {col}")

print("\n----------------------------------------------------\n")
cat_cols = data.select_dtypes(include="object").columns# type: ignore
cat_cols = [x for x in cat_cols if x not in bool_cols]
for col in cat_cols:
    print(f"Categorical Column --------- {col}")

num_cols.drop("policy_number")

data = data.drop('policy_bind_date', axis=1)
# Cyclical interpretation of datetime feature

Bool Column --------- property_damage
Bool Column --------- police_report_available
Bool Column --------- fraud_reported

----------------------------------------------------

Date Column --------- policy_bind_date

----------------------------------------------------

Numerical Column --------- months_as_customer
Numerical Column --------- age
Numerical Column --------- policy_number
Numerical Column --------- policy_deductable
Numerical Column --------- policy_annual_premium
Numerical Column --------- umbrella_limit
Numerical Column --------- capital-gains
Numerical Column --------- capital-loss
Numerical Column --------- incident_hour_of_the_day
Numerical Column --------- number_of_vehicles_involved
Numerical Column --------- bodily_injuries
Numerical Column --------- witnesses
Numerical Column --------- total_claim_amount
Numerical Column --------- injury_claim
Numerical Column --------- property_claim
Numerical Column --------- vehicle_claim
Numerical Column --------- auto_year
Nu

In [11]:
data = data.drop('policy_number', axis=1)
data = data.drop('insured_hobbies', axis=1)
data = data.drop('injury_claim', axis=1)
data = data.drop('property_claim', axis=1)
data = data.drop('vehicle_claim', axis=1)
data = data.drop('incident_date', axis=1)
data = data.drop('collision_type', axis=1)
data = data.drop('number_of_vehicles_involved', axis=1)
data = data.drop('incident_severity', axis=1)
data = data.drop('incident_type', axis=1)
data = data.drop('bodily_injuries', axis=1)
data = data.drop('property_damage', axis=1)
data = data.drop('police_report_available', axis=1)
data = data.drop('authorities_contacted', axis=1)
data = data.drop('incident_hour_of_the_day', axis=1)
data = data.drop('fraud_reported', axis=1)
data = data.drop('capital-gains', axis=1)
data = data.drop('capital-loss', axis=1)



In [12]:
numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level']
ordinal_categories = [['OH', 'IN', 'IL'],
                      ['100/300', '250/500', '500/1000'],
                      ['MALE', 'FEMALE'],
                      ['High School', 'Masters','JD', 'MD'],]
nominal_columns = [x for x in cat_cols if x not in ordinal_columns]


In [13]:
print(numerical_columns)

Index(['months_as_customer', 'age', 'policy_number', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip',
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_year'],
      dtype='object')


In [14]:
importlib.reload(pr)
preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)

In [15]:
importlib.reload(pr)
global_mean = y_train.mean()
X_val_processed = pr.apply_preprocessing_pipeline(X_val,
                                                  preprocessor=preprocessor)

X_test_processed = pr.apply_preprocessing_pipeline(X_test,
                                                  preprocessor=preprocessor)

In [16]:
importlib.reload(pr)
X_train_processed, y_train = pr.remove_null(X_train_processed, y_train)
X_val_processed, y_val = pr.remove_null(X_val_processed, y_val)
X_test_processed, y_test = pr.remove_null(X_test_processed, y_test)

In [17]:
train_data = pd.DataFrame(X_train_processed)
test_data = pd.DataFrame(X_test_processed)
val_data = pd.DataFrame(X_val_processed)
processed_data = pd.concat([
    train_data,
    test_data,
    val_data
], ignore_index=True)   

In [18]:
processed_data.shape

(200, 38)

In [19]:
print(processed_data.isnull().sum().sum())

0


In [20]:
i = 0
for df in [train_data, test_data, val_data]:
    print(f"Null values in {i}: {df.isnull().sum().sum()}")
    i += 1

Null values in 0: 0
Null values in 1: 0
Null values in 2: 0


In [21]:
y_train_data = pd.Series(y_train)
y_test_data = pd.Series(y_test)
y_val_data = pd.Series(y_val)

In [22]:
y_train_data

169    30228.90
97     14767.84
31     34084.48
12     66448.23
35     34809.35
         ...   
106    21627.23
14     76029.15
92     94420.86
179    89929.40
102    66283.50
Name: total_claim_amount, Length: 140, dtype: float64

In [23]:
data_folder = PROJECT_ROOT/"data"/"processed"
cluster_folder = PROJECT_ROOT/"data"/"processed"/"indexed"
train_data.to_csv(data_folder/"X_train.csv", index=False)
test_data.to_csv(data_folder/"X_test.csv", index=False)
val_data.to_csv(data_folder/"X_val.csv",index=False)
y_train_data.to_csv(data_folder/"y_train.csv",index=False)
y_test_data.to_csv(data_folder/"y_test.csv",index=False)
y_val_data.to_csv(data_folder/"y_val.csv",index=False)

#Cluster data
train_data.to_csv(cluster_folder/"X_train.csv", index=True)
test_data.to_csv(cluster_folder/"X_test.csv", index=True)

In [24]:
# numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
# ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level', 'incident_type','incident_severity',]
# ordinal_categories = [['OH', 'IN', 'IL'],
#                       ['100/300', '250/500', '500/1000'],
#                       ['MALE', 'FEMALE'],
#                       ['High School', 'Masters','JD', 'MD'],
#                       ['Single Vehicle Collision', 'Multi-vehicle Collision'],
#                       ['Minor Damage', 'Major Damage', 'Total Loss']]
# nominal_columns = [x for x in cat_cols if x not in ordinal_columns]


In [25]:
numerical_columns = X_train.select_dtypes(include='number').columns# type: ignore
ordinal_columns = ['policy_state','policy_csl','insured_sex', 'insured_education_level']
ordinal_categories = [['OH', 'IN', 'IL'],
                      ['100/300', '250/500', '500/1000'],
                      ['MALE', 'FEMALE'],
                      ['High School', 'Masters','JD', 'MD'],]
nominal_columns = [x for x in cat_cols if x not in ordinal_columns]

In [26]:
numerical_columns = [col for col in numerical_columns if col in data.columns]
ordinal_columns = [col for col in ordinal_columns if col in data.columns]
nominal_columns = [col for col in nominal_columns if col in data.columns]
bool_cols = [col for col in bool_cols if col in data.columns]

In [27]:
print(numerical_columns)

['months_as_customer', 'age', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'witnesses', 'auto_year']


In [28]:
from sklearn.model_selection import train_test_split
target = 'total_claim_amount'
X, y = data[[col for col in data.columns if col not in target]], data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)
#X_test_processed = pr.apply_preprocessing_pipeline(X_test, preprocessor)
train_reg, test_reg = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)
y_train_reg, y_test_reg = pd.Series(y_train), pd.Series(y_test)

train_reg.to_csv(data_folder/"reg"/"X_train.csv", index=False)
test_reg.to_csv(data_folder/"reg"/"X_test.csv", index=False)
y_train_reg.to_csv(data_folder/"reg"/"y_train.csv", index=False)
y_test_reg.to_csv(data_folder/"reg"/"y_test.csv", index=False)


In [29]:
(data['total_claim_amount'] >= 20000).value_counts(normalize=True)


total_claim_amount
True     0.75
False    0.25
Name: proportion, dtype: float64

In [30]:
importlib.reload(pr)

target = 'total_claim_amount'

X_train, X_test, y_train, y_test = pr.split_data_for_classification(data, target)

preprocessor, X_train_processed = pr.prepreprocessing_pipeline(
    X = X_train,
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols
)
X_train_processed, y_train = pr.oversampling(X_train_processed, y_train)
X_test_processed = pr.apply_preprocessing_pipeline(X_test, preprocessor)
train_class, test_class = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)
y_train_class, y_test_class = pd.Series(y_train), pd.Series(y_test)

train_class.to_csv(data_folder/"classification"/"X_train.csv", index=False)
test_class.to_csv(data_folder/"classification"/"X_test.csv", index=False)
y_train_class.to_csv(data_folder/"classification"/"y_train.csv", index=False)
y_test_class.to_csv(data_folder/"classification"/"y_test.csv", index=False)


severe
0    0.8
1    0.2
Name: proportion, dtype: float64


In [31]:
y_train_class.value_counts(normalize=True)

severe
0    0.5
1    0.5
Name: proportion, dtype: float64

In [32]:
y_test_class.value_counts(normalize=True)

severe
0    0.8
1    0.2
Name: proportion, dtype: float64

In [33]:
print(X.shape)

(200, 21)


In [34]:
features = X.columns
features

Index(['months_as_customer', 'age', 'policy_state', 'policy_csl',
       'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'insured_zip', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_relationship', 'incident_state',
       'incident_city', 'incident_location', 'witnesses', 'auto_make',
       'auto_model', 'auto_year', 'month_sin', 'month_cos'],
      dtype='object')

In [35]:
X_train = pd.DataFrame(X_train_processed)

In [36]:
X_test = pd.DataFrame(X_test_processed)

In [37]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.013650,-0.762360,-1.152973,0.297376,-1.276893,0.485397,0.576997,1.301198,0.066568,-1.217759,-1.246271,-0.790378,-1.0,-0.217423,0.595466,1.283516,-1.689917,-1.0,-1.782084,-0.155196
1,0.607063,-0.048077,1.288617,0.750109,0.965456,-0.207701,0.576997,0.357018,1.841718,-1.217759,-1.246271,-0.174499,-1.0,1.388162,-0.154072,-2.185446,0.672030,-1.0,-0.524142,0.745942
2,-0.125723,0.611261,1.288617,-0.501699,0.591731,-0.826577,1.276388,1.655266,-0.821007,-1.217759,-1.246271,-0.790378,-1.0,-0.217423,-0.154072,-0.797861,-1.689917,-1.0,-1.782084,-0.455576
3,1.339849,0.776096,-1.152973,0.973547,-1.276893,1.584137,0.576997,-0.823207,-0.821007,-1.217759,0.802394,1.673137,-1.0,-0.418121,0.595466,0.589723,0.672030,-1.0,-1.782084,1.046322
4,0.865693,-0.762360,-1.152973,1.034298,0.965456,-0.779201,0.576997,0.120973,-0.821007,-1.217759,0.802394,-0.790378,-1.0,-1.823008,0.845313,-0.797861,-1.689917,-1.0,-0.524142,-1.957473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,-1.453359,0.006868,1.288617,-0.434063,-1.276893,-1.258795,-0.122393,1.655266,1.841718,0.031225,0.802394,-0.174499,-1.0,-0.217423,-1.902994,-0.797861,0.503320,-1.0,-0.524142,-0.455576
76,0.322570,-1.696422,-1.152973,1.435627,0.591731,1.341705,-1.521174,1.065153,1.841718,1.280209,-1.246271,-0.790378,-1.0,1.388162,-1.902994,-0.797861,0.503320,-1.0,-0.524142,1.046322
77,-1.298181,0.281592,-0.339110,-0.041613,-1.276893,0.514702,0.576997,0.475041,1.841718,-1.217759,-1.246271,-0.174499,-1.0,-0.016725,-1.902994,-0.104069,0.503320,-1.0,0.733799,0.745942
78,0.908798,1.380489,-0.339110,-0.737319,0.965456,0.514420,1.276388,0.593063,0.066568,-1.217759,0.802394,-0.174499,-1.0,-0.217423,-1.902994,0.589723,0.503320,-1.0,-1.782084,-0.155196


In [38]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.582637,1.325544,-0.339110,1.531052,-1.276893,0.142354,-1.521174,-1.295297,1.841718,1.280209,0.802394,-0.174499,0.0,-0.016725,0.845313,0.589723,-1.689917,0.0,1.153113,-0.455576
1,0.288085,0.336537,-1.152973,1.222563,-1.276893,1.629889,-1.521174,-0.233094,-0.821007,1.280209,0.802394,-0.790378,0.0,-0.217423,-1.902994,-0.104069,0.672030,0.0,-0.524142,-0.455576
2,1.572616,0.336537,-1.152973,0.749050,0.965456,-0.519521,0.576997,1.301198,-0.821007,-1.217759,0.802394,-0.790378,0.0,1.388162,-1.902994,-0.797861,0.503320,0.0,-1.782084,-1.957473
3,-0.246417,0.226647,-0.339110,0.674030,0.591731,0.851228,1.276388,1.301198,1.841718,1.280209,0.802394,-0.790378,0.0,1.388162,0.845313,-0.104069,0.672030,0.0,-1.782084,0.745942
4,-1.306802,1.160710,-0.339110,0.049170,0.965456,0.946038,-0.821784,-1.413320,0.954143,1.280209,-1.246271,-0.174499,0.0,-0.217423,-1.902994,0.589723,-1.689917,0.0,0.733799,0.745942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,-0.361548,-0.562005,-0.339110,-0.850394,0.745101,-0.131046,0.702352,-1.080406,1.477472,-0.481337,0.802394,0.220607,0.0,-0.016725,0.435182,0.020282,0.503320,0.0,0.411409,0.923051
188,1.253632,0.089895,-0.538058,-0.358352,-0.820110,0.494516,0.576997,0.092123,-0.604040,-0.912447,0.301600,-0.188174,0.0,-0.266483,-0.581594,0.759320,0.544561,0.0,-0.514832,0.525660
189,1.477938,0.072540,-1.152973,0.917471,0.879948,-0.549119,0.256959,0.734128,-0.617931,-1.217759,0.802394,-0.649466,0.0,1.388162,-1.331351,-0.797861,0.503320,0.0,-1.206455,-1.957473
190,-1.673498,-1.180648,1.288617,0.371201,-1.276893,-0.537742,-0.176232,-0.487223,0.954143,-1.217759,0.644688,-0.600737,0.0,1.264564,0.537767,0.536315,0.659043,0.0,1.153113,-1.841857


In [39]:
importlib.reload(pr)
processed_data = pr.clean_data(data=data)
pipe, processed_data_clustering = pr.prepreprocessing_pipeline_for_clustering(data, numerical_columns, ordinal_columns, ordinal_categories, nominal_columns, bool_cols)


In [40]:
processed_data_clustering.head(5)

Unnamed: 0,num__months_as_customer,num__age,num__policy_deductable,num__policy_annual_premium,num__umbrella_limit,num__insured_zip,num__witnesses,num__auto_year,ord__policy_state,ord__policy_csl,...,nom__auto_make_Accura,nom__auto_make_Chevrolet,nom__auto_make_Dodge,nom__auto_make_Mercedes,nom__auto_make_Saab,nom__auto_model_92x,nom__auto_model_E400,nom__auto_model_RAM,nom__auto_model_RSX,nom__auto_model_Tahoe
0,-0.87188,1.607267,1.343864,0.803389,0.88723,-0.387526,0.024423,-1.565136,0.976092,1.329763,...,-0.492175,-0.42829,2.249579,-0.569652,-0.561951,2.171241,-0.585049,-0.42829,-0.476439,-0.546536
1,1.26153,0.233065,1.343864,-0.445208,0.504803,1.439788,0.024423,0.549913,1.863449,-1.110169,...,-0.492175,-0.42829,-0.444528,-0.569652,1.779513,-0.460566,-0.585049,-0.42829,-0.476439,1.829707
2,0.585083,0.013192,-0.285062,1.495167,0.504803,0.120102,-1.371154,-1.447634,0.976092,1.329763,...,-0.492175,2.334869,-0.444528,-0.569652,-0.561951,2.171241,-0.585049,-0.42829,-0.476439,-0.546536
3,-0.83719,1.002618,-1.099525,1.709736,-1.40733,-0.408681,-1.371154,0.43241,0.088736,-1.110169,...,-0.492175,-0.42829,2.249579,-0.569652,-0.561951,-0.460566,-0.585049,2.334869,-0.476439,-0.546536
4,-1.140724,0.507905,1.343864,-1.19822,0.504803,1.258788,-0.673366,0.314907,-0.798621,0.109797,...,2.031798,-0.42829,-0.444528,-0.569652,-0.561951,-0.460566,-0.585049,2.334869,-0.476439,-0.546536


In [41]:
print(processed_data_clustering.isna().sum().sum())

0


In [42]:
processed_data_clustering.to_csv(PROJECT_ROOT/"data"/"clustering"/"input_clustering.csv")

In [43]:
print(processed_data.isna().sum().sum())

0


In [44]:
processed_data.to_csv(PROJECT_ROOT/"dashboard"/"assets"/"dashboard_data.csv")

In [45]:
processed_data

Unnamed: 0,months_as_customer,age,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,...,incident_city,incident_location,witnesses,total_claim_amount,auto_make,auto_model,auto_year,month_sin,month_cos,severe
0,103.0,79.0,IN,500/1000,2000.0,1695.17,6000000.0,515608,FEMALE,High School,...,Columbus,8911 4th Drive,2.0,16752.00,Dodge,92x,1991.0,5.000000e-01,-0.866025,0
1,349.0,54.0,IL,100/300,2000.0,1290.22,5000000.0,669708,FEMALE,MD,...,Arlington,7988 Francis Lane,2.0,22567.31,Saab,Tahoe,2009.0,5.000000e-01,0.866025,0
2,271.0,50.0,IN,500/1000,1000.0,1919.53,5000000.0,558417,FEMALE,MD,...,Columbus,9838 3rd Ave,0.0,32096.68,Chevrolet,92x,1992.0,1.224647e-16,-1.000000,0
3,107.0,68.0,OH,100/300,500.0,1989.12,0.0,513824,FEMALE,MD,...,Columbus,9897 4th Drive,0.0,26309.15,Dodge,RAM,2008.0,8.660254e-01,-0.500000,0
4,72.0,59.0,CA,250/500,2000.0,1046.00,5000000.0,654444,MALE,Associate,...,Arlington,9567 Francis Lane,1.0,9215.33,Accura,RAM,2007.0,-8.660254e-01,-0.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,233.0,50.0,NY,500/1000,1000.0,1796.79,6000000.0,548554,MALE,PhD,...,Riverwood,2020 MLK Hwy,4.0,89186.16,Saab,RSX,2007.0,-2.449294e-16,1.000000,1
196,259.0,45.0,NY,500/1000,2000.0,1214.23,6000000.0,685555,FEMALE,MD,...,Arlington,2193 MLK Hwy,1.0,46649.52,Mercedes,E400,2005.0,-5.000000e-01,-0.866025,0
197,359.0,64.0,CA,100/300,500.0,1728.60,0.0,684186,FEMALE,MD,...,Arlington,7171 4th Drive,3.0,32342.72,Dodge,E400,1997.0,8.660254e-01,0.500000,0
198,198.0,50.0,TX,500/1000,500.0,1235.31,0.0,452930,FEMALE,Associate,...,Columbus,5037 MLK Hwy,2.0,47907.90,Saab,92x,1993.0,-5.000000e-01,0.866025,0


In [46]:
importlib.reload(pr)

<module 'src.data.preprocessing' from 'C:\\Users\\phili\\Desktop\\Insurance_Project\\prototype\\src\\data\\preprocessing.py'>

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

preprocessor_pickle = pr.unfit_prepreprocessing_pipeline(
    numerical_cols=numerical_columns,
    ordinal_cols=ordinal_columns,
    ordinal_categories=ordinal_categories,
    nominal_cols=nominal_columns,
    bool_cols=bool_cols)
preprocessor_pickle.fit(X)

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['OH', 'IN', ...], ['100/300', '250/500', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function to_...002077FD90900>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,min_group_size,
,combine_min_nan_groups,True
,min_group_name,
,normalize,False


In [48]:
import joblib
joblib.dump(preprocessor_pickle, PROJECT_ROOT/"models"/"preprocessor.pkl")
joblib.dump(features, PROJECT_ROOT/"models"/"features.pkl")


['C:\\Users\\phili\\Desktop\\Insurance_Project\\prototype\\models\\features.pkl']