# Preprocessing

Neural networks require preprocessing, such as encoding categorical columns and standardizing numerical columns, in order to avoid overfitting and improve performance.

## Preparing environment

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [24]:
# Load the reduced train and test splits from the processed-data directory.
train_df, test_df = (
    pd.read_csv(paths.data_processed_dir(csv_name))
    for csv_name in ('reduced_train.csv', 'reduced_test.csv')
)

In [25]:
# Split the columns into three groups and separate the target.

# Columns that will be one-hot encoded.
categorical = [
    'gender',
    'gender_boss',
    'seniority',
    'recruitment_channel',
    'join_year',
    'marital_estatus',
    'join_month_boss',
    'join_year_boss',
    'performance',
]

# Columns that will be standardized: everything that is not categorical and
# whose name contains none of the substrings 'embedding', 'id' or 'resign'.
numerical = [
    col
    for col in train_df.columns
    if col not in categorical
    and not any(marker in col for marker in ('embedding', 'id', 'resign'))
]

# Whatever is left over (apart from the target) belongs to neither group.
other_cols = [
    col
    for col in train_df.columns
    if not (col in categorical or col in numerical or 'resign' in col)
]

# Pull the target out of the training frame; the frame is modified in place.
y = train_df.pop('resign')

## Preprocessing

In [26]:
# Per-column-group preprocessing: standardize the numeric columns, one-hot
# encode the categorical ones (dense output, first level of each feature
# dropped), and pass every column listed in neither group through unchanged.
numeric_step = ('num', StandardScaler(), numerical)
categorical_step = ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical)
preprocessor = ColumnTransformer([numeric_step, categorical_step], remainder='passthrough')

In [27]:
# Fit the preprocessor on the training frame and transform it in the same call.
train_transformed = preprocessor.fit_transform(train_df)
# Reuse the already-fitted preprocessor on the test frame (no re-fitting), so
# the test data is transformed with what was learned from the training data.
test_transformed = preprocessor.transform(test_df)

In [28]:
# Recover the names of the one-hot columns from the fitted encoder, then build
# the full label list in the order used when the frames are rebuilt:
# numeric columns, one-hot columns, remaining columns.
categorical_names = preprocessor.named_transformers_['cat'].get_feature_names_out(
    input_features=categorical
)
all_column_names = [*numerical, *categorical_names, *other_cols]

In [29]:
# Wrap the transformed arrays back into labelled DataFrames; the training
# frame additionally gets the target column appended on the right.
train_features = pd.DataFrame(data=train_transformed, columns=all_column_names)
train_preprocessed = pd.concat([train_features, y], axis='columns')
test_preprocessed = pd.DataFrame(data=test_transformed, columns=all_column_names)

## Saving preprocessed data

In [35]:
# Write both preprocessed frames to the processed-data directory without the
# DataFrame index column.
for frame, csv_name in (
    (train_preprocessed, 'train_processed.csv'),
    (test_preprocessed, 'test_processed.csv'),
):
    frame.to_csv(paths.data_processed_dir(csv_name), index=False)