In [None]:
import pandas as pd
import numpy as np

from category_encoders import TargetEncoder

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno # for missing values
import sys
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# data partition
from sklearn.model_selection import train_test_split


sys.path.append('../helper_functions')
# Import functions that are stored in the helper_functions directory. We do this to keep the notebook clean and easy to read
from helper_functions import *



sys.path.append('../pipeline_scripts')
# Import the custom transformers for incoherences
from incoherences_custom_transformers import (
    IncoCarrierType, 
    IncoWCIOBodyCode, 
    IncoZeroAWW, 
    IncoZeroBirthYEAR, 
    IncoZeroAgeAtInjury, 
    IncoFilterAgeAtInjury, 
    IncoDependents, 
    IncoCorrectAge, 
    IncoSwapAccidentDate, 
    IncoCovidIndicator
)



# import custom_transformers for missing values
from missing_values_transformers import (FillNaNValues,
                                         ImputeBirthYearFromAccident, 
                                         ImputeBirthYearWithMedian, 
                                         ImputeProportionalTransformer,
                                         FillMissingDescriptionsWithCode,
                                         ImputeAccidentDate,
                                         ImputeAgeAtInjury)

In [None]:
# Create the pipeline with the preprocessor and custom transformers
missing_pipeline = Pipeline([
    ('fill_ime4', FillNaNValues(column='IME-4 Count', fill_value=0)),  # Custom transformer for 'IME-4 Count'
    ('fill_zip_code', FillNaNValues(column='Zip Code', fill_value='UNKNOWN')),
    ('impute_birth_year_from_accident', ImputeBirthYearFromAccident()),
    ('impute_birth_year_with_median_age_and_birth', ImputeBirthYearWithMedian()),
    ('impute_medical_fee_region', ImputeProportionalTransformer(column='Medical Fee Region')),
    ('impute_industry_code', ImputeProportionalTransformer(column='Industry Code')),
    ('fill_missing_descriptions', FillMissingDescriptionsWithCode(
        code_column='Industry Code', description_column='Industry Code Description')),
    ('impute_accident_date_with_assembly', ImputeAccidentDate()),
    ('impute_age_at_injury',ImputeAgeAtInjury()),
    ('impute_alternative_dispute_resolution', ImputeProportionalTransformer(column='Alternative Dispute Resolution'))
])

X_train = missing_pipeline.fit_transform(X_train)
X_val = missing_pipeline.transform(X_val)
X_test = missing_pipeline.transform(X_test)