In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input locations, all relative to the notebook's working directory.

# CSV that maps each field code to its declared data type.
field_types_file = "../utilfiles/codes_after_reduction.csv"

# Training sample (not one-hot encoded).
trn_rd_path = "../utilfiles/trn_sample_NoOneHot.csv"

# Validation data: the whole set, and a smaller sample of it.
validation_path = "../utilfiles/vld_without_onehot.csv"
sample_val_path = "../utilfiles/val_sample_without_onehot.csv"

In [3]:
# Read the field catalogue and map every field code to its declared data type.
field_types_df = pd.read_csv(field_types_file)
field_types = dict(zip(field_types_df['codes'], field_types_df['dataType']))


def _fields_of_type(type_label):
    """Return the field codes whose declared data type equals `type_label`, in catalogue order."""
    return [code for code, declared in field_types.items() if declared == type_label]


# One list per declared data type.
categorical_single_fields = _fields_of_type('Categorical (single)')
categorical_multiple_fields = _fields_of_type('Categorical (multiple)')
numerical_fields = _fields_of_type('Integer')
continuous_fields = _fields_of_type('Continuous')

# Coarser groupings used by the imputation / encoding steps:
# all categorical fields, and all numeric (integer + continuous) fields.
categorical_original_fields = categorical_single_fields + categorical_multiple_fields
numeric_original_fields = numerical_fields + continuous_fields

In [4]:
# Read the training CSV at `trn_rd_path` into a DataFrame (default integer index,
# since no index column is specified).
trn_df = pd.read_csv(trn_rd_path)

In [5]:
# Two imputers, created unfitted here and fitted later on the training data:
#   - numeric columns: missing values are replaced by the column median;
#   - categorical columns: missing values are replaced by the constant -3.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(fill_value=-3, strategy='constant')

In [6]:
# Fit both imputers on the training data and rebuild labelled DataFrames from
# the arrays they return.
trn_cat_raw = trn_df[categorical_original_fields]
trn_num_raw = trn_df[numeric_original_fields]

cat_no_nulls_X = pd.DataFrame(
    cat_imputer.fit_transform(trn_cat_raw),
    columns=trn_cat_raw.columns,
).astype(int)

# NOTE(review): numeric_original_fields also contains the 'Continuous' fields,
# so this int cast discards any fractional part of those columns (and of the
# imputed medians) — confirm that this truncation is intended.
num_no_nulls_X = pd.DataFrame(
    num_imputer.fit_transform(trn_num_raw),
    columns=trn_num_raw.columns,
).astype(int)

# Column order: numeric, then categorical, then the untouched target.
# The rebuilt frames carry a fresh default index, so this column-wise concat
# relies on trn_df also having a default index for the rows to line up.
trn_df_imputed = pd.concat(
    [num_no_nulls_X, cat_no_nulls_X, trn_df['target']],
    axis=1,
)

In [7]:
# Use one LabelEncoder per categorical column to encode its values, and keep
# every fitted encoder (keyed by column name) so other splits can reuse it.
labelEncoders = {}
for col in categorical_original_fields:
    le = labelEncoders[col] = LabelEncoder()
    # Overwrites the column in place with the encoded values.
    trn_df_imputed[col] = le.fit_transform(trn_df_imputed[col])
# After the loop, `le` remains bound to the encoder of the last column only.

# Alias, not a copy: trn_df_final and trn_df_imputed are the same object.
trn_df_final = trn_df_imputed

In [15]:
# Write the imputed, label-encoded training frame to CSV, omitting the index column.
trn_df_final.to_csv('../utilfiles/trn_sample_NoOneHot_noNulls.csv',index=False)

In [9]:
# Toggle: True loads the sample validation file, False loads the whole
# validation set (10% of the entire dataset).
sample_val = True

val_df = pd.read_csv(sample_val_path if sample_val else validation_path)


In [12]:
# Apply the already-fitted imputers to the validation data (transform only, no
# refit) and rebuild labelled DataFrames from the returned arrays.
val_cat_raw = val_df[categorical_original_fields]
val_num_raw = val_df[numeric_original_fields]

cat_no_nulls_df_val = pd.DataFrame(
    cat_imputer.transform(val_cat_raw),
    columns=val_cat_raw.columns,
).astype(int)

# NOTE(review): numeric_original_fields also contains the 'Continuous' fields,
# so this int cast discards any fractional part of those columns — confirm
# that this truncation is intended.
num_no_nulls_df_val = pd.DataFrame(
    num_imputer.transform(val_num_raw),
    columns=val_num_raw.columns,
).astype(int)

# Column order: numeric, then categorical, then the untouched target.
# The rebuilt frames carry a fresh default index, so this column-wise concat
# relies on val_df also having a default index for the rows to line up.
val_df_imputed = pd.concat(
    [num_no_nulls_df_val, cat_no_nulls_df_val, val_df['target']],
    axis=1,
)

In [13]:
# Encode each categorical validation column with the encoder stored for that
# column in `labelEncoders` (transform only; the encoders are not refitted).
# NOTE(review): LabelEncoder.transform raises ValueError for a value the encoder
# did not see during fit, so a category present only in validation stops this cell.
for col in categorical_original_fields:
    fitted_encoder = labelEncoders[col]
    val_df_imputed[col] = fitted_encoder.transform(val_df_imputed[col])

# Alias, not a copy: val_df_final and val_df_imputed are the same object.
val_df_final = val_df_imputed

In [16]:
# Write the imputed, label-encoded validation frame to CSV, omitting the index column.
# NOTE(review): the file name says "val_sample" regardless of the `sample_val`
# flag, so a run on the whole validation set writes to this same path — confirm
# that this is intended.
val_df_final.to_csv('../utilfiles/val_sample_NoOneHot_noNulls.csv',index=False)

In [17]:
import joblib

# Persist the fitted preprocessing objects so the same imputation and encoding
# can be re-applied elsewhere.
#
# The complete {column name -> fitted LabelEncoder} dict is saved. Dumping the
# loop variable `le` would store only the encoder of the last categorical
# column, leaving every other column without an encoder after reload.
joblib.dump(labelEncoders, r'../utilfiles/LabelEncoders.pkl')
joblib.dump(cat_imputer, r'../utilfiles/catImputer.pkl')
joblib.dump(num_imputer, r'../utilfiles/numImputer.pkl')

['../utilfiles/numImputer.pkl']