# ETL Process to clean data before analysis

## Preparing environment

In [250]:
import pandas as pd

In [251]:
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Data Dictionary

* id_employee - Employee ID
* id_last_boss - Boss ID
* seniority - 1: For people who don't have anyone in charge, 2: For those who have people in charge
* work_modality - Work modality specified on employee contract
* office_distance -  Distance in kilometers from employee home to work.
* low_health_days -  Number of days pf justified sick leave since employee started to work for the company.
* gender - Employee gender
* recruitment_channel - Indicates the means by which the application process took place. 
* average_permanence - Mean time in years that the employee has worked in previous laboral experience.
* birth_date - Birth date in format dd/mm/yyyy.
* salary - It is the amount corresponding to the monthly remuneration received.
* psi_score - Is the score obtained in psychometric test during selection process.
* join_date - Date when the employee started to work for the company.
* marital_estatus - Marital Estatus.
* performance_score - Is the score obtained by the employee it his last quarterly performance test if he had.
* resign - "0" if employee continues working or "1" if employee left the company on his first 6 months.

## Extracting data

In [252]:
train_df = pd.read_csv(paths.data_raw_dir('train_data.csv'), sep=';')

test_df = pd.read_csv(paths.data_raw_dir('test_data.csv'), sep=';')

## Checking data

In [253]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2152 non-null   int64  
 1   id_last_boss         2061 non-null   float64
 2   seniority            2152 non-null   int64  
 3   work_modality        2152 non-null   object 
 4   office_distance      2152 non-null   float64
 5   low_health_days      2152 non-null   int64  
 6   gender               2152 non-null   object 
 7   recruitment_channel  2152 non-null   object 
 8   average_permanence   2152 non-null   int64  
 9   birth_date           2152 non-null   object 
 10  salary               2152 non-null   int64  
 11  performance_score    2084 non-null   float64
 12  psi_score            2152 non-null   int64  
 13  join_date            2152 non-null   object 
 14  marital_estatus      2152 non-null   object 
 15  resign               2152 non-null   i

In [254]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2020 non-null   int64  
 1   id_last_boss         1937 non-null   float64
 2   seniority            2020 non-null   int64  
 3   work_modality        2020 non-null   object 
 4   office_distance      2020 non-null   float64
 5   low_health_days      2020 non-null   int64  
 6   gender               2020 non-null   object 
 7   recruitment_channel  2020 non-null   object 
 8   average_permanence   2020 non-null   int64  
 9   birth_date           2020 non-null   object 
 10  salary               2020 non-null   int64  
 11  performance_score    2020 non-null   int64  
 12  psi_score            2020 non-null   int64  
 13  join_date            2020 non-null   object 
 14  marital_estatus      2020 non-null   object 
dtypes: float64(2), int64(7), object(6)
mem

In [255]:
train_df.describe()

Unnamed: 0,id_employee,id_last_boss,seniority,office_distance,low_health_days,average_permanence,salary,performance_score,psi_score,resign
count,2152.0,2061.0,2152.0,2152.0,2152.0,2152.0,2152.0,2084.0,2152.0,2152.0
mean,102070.160781,102090.326055,1.042286,3.111768,2.709108,6.644981,457021.1,63.472169,75.286245,0.466078
std,1210.999342,43.58601,0.201288,1.78905,2.982975,4.283794,302943.5,23.110594,6.050987,0.498964
min,100001.0,102000.0,1.0,0.12,0.0,1.0,76521.0,5.0,58.0,0.0
25%,101028.75,102054.0,1.0,1.81875,1.0,3.0,260439.8,42.0,71.0,0.0
50%,102048.5,102091.0,1.0,2.51,2.0,6.0,373422.5,65.0,75.0,0.0
75%,103135.5,102127.0,1.0,4.15125,3.0,9.0,674193.0,90.0,79.0,1.0
max,104171.0,102172.0,2.0,21.05,35.0,25.0,1900000.0,99.0,98.0,1.0


In [256]:
test_df.describe()

Unnamed: 0,id_employee,id_last_boss,seniority,office_distance,low_health_days,average_permanence,salary,performance_score,psi_score
count,2020.0,1937.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0
mean,102103.807921,102089.646877,1.041089,3.059394,2.662376,6.713366,463823.6,89.14604,75.44505
std,1197.751058,43.691244,0.198545,1.637131,2.817351,4.30758,306360.1,4.851837,6.00944
min,100000.0,102000.0,1.0,0.11,0.0,1.0,75517.0,80.0,58.0
25%,101071.25,102052.0,1.0,1.805,1.0,3.0,266311.0,85.0,71.0
50%,102117.5,102089.0,1.0,2.5425,2.0,6.0,374720.0,90.0,76.0
75%,103126.25,102126.0,1.0,4.0,3.0,9.0,676307.5,92.0,79.0
max,104172.0,102172.0,2.0,14.045,23.0,27.0,1900000.0,99.0,98.0


Observing the statistics summary, we can see that there are some outliers in `office_distance`, `low_health_days`, `average_permanence` and `salary` for both train and test data. They will be analyzed and actions will be taken during the EDA.

## Looking for typo errors in object data

In [257]:
train_df.describe(include='object')

Unnamed: 0,work_modality,gender,recruitment_channel,birth_date,join_date,marital_estatus
count,2152,2152,2152,2152,2152,2152
unique,2,2,5,1980,1595,4
top,Presencial,Hombre,Portal Web,24/11/1978,3/10/2013,Soltero
freq,1530,1095,986,3,5,925


In [258]:
test_df.describe(include='object')

Unnamed: 0,work_modality,gender,recruitment_channel,birth_date,join_date,marital_estatus
count,2020,2020,2020,2020,2020,2020
unique,2,2,5,1883,1527,4
top,Presencial,Mujer,Portal Web,27/04/1972,29/09/2017,Soltero
freq,1408,1037,869,3,5,800


In [259]:
# Checking work_modality categories

print(train_df.work_modality.unique())
print(test_df.work_modality.unique())

['Híbrida' 'Presencial']
['Híbrida' 'Presencial']


In [260]:
# Checking recruitment_channel categories

print(train_df.recruitment_channel.unique())
print(test_df.recruitment_channel.unique())

['Ferias & Networking' 'Referidos' 'Linkedin' 'Portal Web' 'Headhunter']
['Ferias & Networking' 'Portal Web' 'Linkedin' 'Referidos' 'Headhunter']


In [261]:
# Checking marital_estatus categories

print(train_df.marital_estatus.unique())
print(test_df.marital_estatus.unique())

['Soltero' 'Viudo' 'Divorciado' 'Casado']
['Soltero' 'Casado' 'Viudo' 'Divorciado']


From the data dictionary we can see that `seniority` and `id_last_boss` are categories too. However, id_ultimo_jefe has many categories, so it will be analyzed during the hypothesis testing phase.

In [262]:
cat_cols = ['work_modality', 'gender', 'recruitment_channel', 'marital_estatus', 'seniority']

# Defining a function to check categories proportion

def check_categories_proportion(df, col):
    
    """
    Calculate and print the proportion of each category in a specified column of a DataFrame.

    This function takes a pandas DataFrame and a column name, calculates the proportion 
    (as a percentage) of each unique value (category) in the specified column, and prints 
    the resulting proportions.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    col (str): The name of the column for which to calculate category proportions.

    Returns:
    None: This function prints the category proportions and does not return any value.

    Example:
    >>> import pandas as pd
    >>> data = {'category': ['A', 'B', 'A', 'C', 'B', 'A']}
    >>> df = pd.DataFrame(data)
    >>> check_categories_proportion(df, 'category')
    A    50.0
    B    33.3
    C    16.7
    Name: category, dtype: float64
    """
    
    proportion = df[col].value_counts(normalize=True) * 100
    print(proportion)

In [263]:
for col in cat_cols:
    check_categories_proportion(train_df, col)
    print('-'*40)

work_modality
Presencial    71.096654
Híbrida       28.903346
Name: proportion, dtype: float64
----------------------------------------
gender
Hombre    50.8829
Mujer     49.1171
Name: proportion, dtype: float64
----------------------------------------
recruitment_channel
Portal Web             45.817844
Linkedin               22.676580
Referidos              15.706320
Ferias & Networking    14.312268
Headhunter              1.486989
Name: proportion, dtype: float64
----------------------------------------
marital_estatus
Soltero       42.983271
Casado        38.708178
Divorciado    10.315985
Viudo          7.992565
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.771375
2     4.228625
Name: proportion, dtype: float64
----------------------------------------


In [264]:
for col in cat_cols:
    check_categories_proportion(test_df, col)
    print('-'*40)

work_modality
Presencial    69.70297
Híbrida       30.29703
Name: proportion, dtype: float64
----------------------------------------
gender
Mujer     51.336634
Hombre    48.663366
Name: proportion, dtype: float64
----------------------------------------
recruitment_channel
Portal Web             43.019802
Linkedin               25.247525
Referidos              16.336634
Ferias & Networking    14.059406
Headhunter              1.336634
Name: proportion, dtype: float64
----------------------------------------
marital_estatus
Soltero       39.603960
Casado        38.514851
Divorciado    12.425743
Viudo          9.455446
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.891089
2     4.108911
Name: proportion, dtype: float64
----------------------------------------


Categories proportion in both train and test data are similar.

## Extracting info from date columns

In [265]:
# Converting date columns to dtype datetime

train_df['join_date'] = pd.to_datetime(train_df['join_date'], dayfirst=True)
train_df['birth_date'] = pd.to_datetime(train_df['birth_date'], dayfirst=True)

test_df['join_date'] = pd.to_datetime(test_df['join_date'], dayfirst=True)
test_df['birth_date'] = pd.to_datetime(test_df['birth_date'], dayfirst=True)

In [266]:
# Calculating age of join

train_df['join_age'] = (train_df['join_date'] - train_df['birth_date']).dt.days // 365

test_df['join_age'] = (test_df['join_date'] - test_df['birth_date']).dt.days // 365

In [267]:
# Obtaining season info

train_df['join_year'] = train_df['join_date'].dt.year
train_df['join_month'] = train_df['join_date'].dt.month

test_df['join_year'] = test_df['join_date'].dt.year
test_df['join_month'] = test_df['join_date'].dt.month

In [268]:
# Discarding original dates

train_df.drop(['birth_date', 'join_date'], axis=1, inplace=True)
test_df.drop(['birth_date', 'join_date'], axis=1, inplace=True)

In [269]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2152 non-null   int64  
 1   id_last_boss         2061 non-null   float64
 2   seniority            2152 non-null   int64  
 3   work_modality        2152 non-null   object 
 4   office_distance      2152 non-null   float64
 5   low_health_days      2152 non-null   int64  
 6   gender               2152 non-null   object 
 7   recruitment_channel  2152 non-null   object 
 8   average_permanence   2152 non-null   int64  
 9   salary               2152 non-null   int64  
 10  performance_score    2084 non-null   float64
 11  psi_score            2152 non-null   int64  
 12  marital_estatus      2152 non-null   object 
 13  resign               2152 non-null   int64  
 14  join_age             2152 non-null   int64  
 15  join_year            2152 non-null   i

In [270]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2020 non-null   int64  
 1   id_last_boss         1937 non-null   float64
 2   seniority            2020 non-null   int64  
 3   work_modality        2020 non-null   object 
 4   office_distance      2020 non-null   float64
 5   low_health_days      2020 non-null   int64  
 6   gender               2020 non-null   object 
 7   recruitment_channel  2020 non-null   object 
 8   average_permanence   2020 non-null   int64  
 9   salary               2020 non-null   int64  
 10  performance_score    2020 non-null   int64  
 11  psi_score            2020 non-null   int64  
 12  marital_estatus      2020 non-null   object 
 13  join_age             2020 non-null   int64  
 14  join_year            2020 non-null   int32  
 15  join_month           2020 non-null   i

## Missing values treatment

From context of the business, is well known that people with performance_score greater than or equal to 80 is considered people with high perfomance, and everyone else is considered low performance. There are some missing values in this column on train data, and observing the statistics summary, most of the employees from the train data are low performance, while all the employees in the test data are high performance; this lead us to impute those missing values with 80, in order to help the people to generalize better the high performance employees. Additionally, performance categorical column will be added with 'high' for employees with performance score greater or equal to 80 and 'low' for everyone else.

In [271]:
train_df['performance_score'] = train_df['performance_score'].fillna(80)

Since id_ultimo_jefe is a categorical column, we can't impute it. So we gonna fill missing values with bfill method.

In [272]:
train_df['id_last_boss'] = train_df['id_last_boss'].fillna(method='bfill')

test_df['id_last_boss'] = test_df['id_last_boss'].fillna(method='bfill')

  train_df['id_last_boss'] = train_df['id_last_boss'].fillna(method='bfill')
  test_df['id_last_boss'] = test_df['id_last_boss'].fillna(method='bfill')


## Changing id_last_boss dtype

In [273]:
train_df['id_last_boss'] = train_df['id_last_boss'].astype(int)

test_df['id_last_boss'] = test_df['id_last_boss'].astype(int)

In [274]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2152 non-null   int64  
 1   id_last_boss         2152 non-null   int32  
 2   seniority            2152 non-null   int64  
 3   work_modality        2152 non-null   object 
 4   office_distance      2152 non-null   float64
 5   low_health_days      2152 non-null   int64  
 6   gender               2152 non-null   object 
 7   recruitment_channel  2152 non-null   object 
 8   average_permanence   2152 non-null   int64  
 9   salary               2152 non-null   int64  
 10  performance_score    2152 non-null   float64
 11  psi_score            2152 non-null   int64  
 12  marital_estatus      2152 non-null   object 
 13  resign               2152 non-null   int64  
 14  join_age             2152 non-null   int64  
 15  join_year            2152 non-null   i

In [275]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_employee          2020 non-null   int64  
 1   id_last_boss         2020 non-null   int32  
 2   seniority            2020 non-null   int64  
 3   work_modality        2020 non-null   object 
 4   office_distance      2020 non-null   float64
 5   low_health_days      2020 non-null   int64  
 6   gender               2020 non-null   object 
 7   recruitment_channel  2020 non-null   object 
 8   average_permanence   2020 non-null   int64  
 9   salary               2020 non-null   int64  
 10  performance_score    2020 non-null   int64  
 11  psi_score            2020 non-null   int64  
 12  marital_estatus      2020 non-null   object 
 13  join_age             2020 non-null   int64  
 14  join_year            2020 non-null   int32  
 15  join_month           2020 non-null   i

## Adding performance column

In [276]:
train_df['performance'] = ['high' if score >= 80 else 'low' for score in train_df['performance_score']]
test_df['performance'] = ['high' if score >= 80 else 'low' for score in test_df['performance_score']]

## Adding new features

Boss id are also present in employee id, so we can use this information to create new features that can help to improve the results

In [277]:
expanded_train_df = pd.merge(train_df, train_df.drop(columns=['resign']), how='left',
                             left_on='id_last_boss', right_on='id_employee',
                             suffixes=('_employee', '_boss'))

In [278]:
expanded_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2152 non-null   int64  
 1   id_last_boss_employee         2152 non-null   int32  
 2   seniority_employee            2152 non-null   int64  
 3   work_modality_employee        2152 non-null   object 
 4   office_distance_employee      2152 non-null   float64
 5   low_health_days_employee      2152 non-null   int64  
 6   gender_employee               2152 non-null   object 
 7   recruitment_channel_employee  2152 non-null   object 
 8   average_permanence_employee   2152 non-null   int64  
 9   salary_employee               2152 non-null   int64  
 10  performance_score_employee    2152 non-null   float64
 11  psi_score_employee            2152 non-null   int64  
 12  marital_estatus_employee      2152 non-null   object 
 13  res

In [279]:
expanded_test_df = pd.merge(test_df, test_df, how='left',
                             left_on='id_last_boss', right_on='id_employee',
                             suffixes=('_employee', '_boss'))

In [280]:
expanded_test_df[['id_employee_employee', 'id_last_boss_employee', 'id_employee_boss', 'id_last_boss_boss']]

Unnamed: 0,id_employee_employee,id_last_boss_employee,id_employee_boss,id_last_boss_boss
0,100486,102115,102115.0,102116.0
1,103752,102074,102074.0,102028.0
2,103937,102150,102150.0,102085.0
3,101744,102172,,
4,101037,102060,102060.0,102041.0
...,...,...,...,...
2015,103755,102161,102161.0,102039.0
2016,103976,102171,,
2017,104115,102144,,
2018,103920,102152,,


In [281]:
expanded_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2020 non-null   int64  
 1   id_last_boss_employee         2020 non-null   int32  
 2   seniority_employee            2020 non-null   int64  
 3   work_modality_employee        2020 non-null   object 
 4   office_distance_employee      2020 non-null   float64
 5   low_health_days_employee      2020 non-null   int64  
 6   gender_employee               2020 non-null   object 
 7   recruitment_channel_employee  2020 non-null   object 
 8   average_permanence_employee   2020 non-null   int64  
 9   salary_employee               2020 non-null   int64  
 10  performance_score_employee    2020 non-null   int64  
 11  psi_score_employee            2020 non-null   int64  
 12  marital_estatus_employee      2020 non-null   object 
 13  joi

## Missing Values treatment

In order to impute missing values from new features from boss, let's make some notations and assumptions:

- id_employee_boss, id_last_boss_boss, marital_estatus_boss, join_month_boss are not relevant
- gender_boss will be imputed using ffill method
- seniority_boss is not relevant since all boss have seniority 2
- work_modality_boss, recruitment_channel_boss will be assumed to be the same from employee
- Numerical features will be imputed using KNN Imputer
- performance_boss will be determined after imputing numerical values

In [282]:
y = expanded_train_df['resign']

In [283]:
# Dropping columns

expanded_train_df.drop(columns=['id_employee_boss', 'id_last_boss_boss',
                                'marital_estatus_boss', 'join_month_boss',
                                'seniority_boss', 'performance_boss'], inplace=True)

expanded_test_df.drop(columns=['id_employee_boss', 'id_last_boss_boss',
                                'marital_estatus_boss', 'join_month_boss',
                                'seniority_boss', 'performance_boss'], inplace=True)

In [284]:
# Imputing gender_boss

expanded_train_df['gender_boss'].fillna(method='ffill', inplace=True)
expanded_test_df['gender_boss'].fillna(method='ffill', inplace=True)

  expanded_train_df['gender_boss'].fillna(method='ffill', inplace=True)
  expanded_test_df['gender_boss'].fillna(method='ffill', inplace=True)


In [285]:
# Dropping resign

expanded_train_df.drop(columns=['resign'], inplace=True)

In [286]:
# Imputing missing values from work_modality_boss and recruitment_channel_boss

no_miss_train = expanded_train_df.dropna()
miss_train = expanded_train_df[expanded_train_df.isna().any(axis=1)]

miss_train['work_modality_boss'] = miss_train['work_modality_employee']
miss_train['recruitment_channel_boss'] = miss_train['recruitment_channel_employee']

expanded_train_df_clean = pd.concat([no_miss_train, miss_train], axis=0).sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miss_train['work_modality_boss'] = miss_train['work_modality_employee']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miss_train['recruitment_channel_boss'] = miss_train['recruitment_channel_employee']


In [287]:
# Imputing missing values from work_modality_boss and recruitment_channel_boss

no_miss_test = expanded_test_df.dropna()
miss_test = expanded_test_df[expanded_test_df.isna().any(axis=1)]

miss_test['work_modality_boss'] = miss_test['work_modality_employee']
miss_test['recruitment_channel_boss'] = miss_test['recruitment_channel_employee']

expanded_test_df_clean = pd.concat([no_miss_test, miss_test], axis=0).sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miss_test['work_modality_boss'] = miss_test['work_modality_employee']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miss_test['recruitment_channel_boss'] = miss_test['recruitment_channel_employee']


In [288]:
# Preparing missing values imputation

from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.impute import KNNImputer

In [289]:
# Saving ids

train_ids = expanded_train_df_clean[['id_employee_employee', 'id_last_boss_employee']]
test_ids = expanded_test_df_clean[['id_employee_employee', 'id_last_boss_employee']]

In [290]:
# Dropping columns

expanded_train_df_clean.drop(columns=['id_employee_employee', 'id_last_boss_employee'], inplace=True)
expanded_test_df_clean.drop(columns=['id_employee_employee', 'id_last_boss_employee'], inplace=True)

In [291]:
expanded_train_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2152 entries, 0 to 2151
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   seniority_employee            2152 non-null   int64  
 1   work_modality_employee        2152 non-null   object 
 2   office_distance_employee      2152 non-null   float64
 3   low_health_days_employee      2152 non-null   int64  
 4   gender_employee               2152 non-null   object 
 5   recruitment_channel_employee  2152 non-null   object 
 6   average_permanence_employee   2152 non-null   int64  
 7   salary_employee               2152 non-null   int64  
 8   performance_score_employee    2152 non-null   float64
 9   psi_score_employee            2152 non-null   int64  
 10  marital_estatus_employee      2152 non-null   object 
 11  join_age_employee             2152 non-null   int64  
 12  join_year_employee            2152 non-null   int32  
 13  join_mon

In [292]:
# Defining categorical and numerical columns

cat_cols = ['seniority_employee', 'work_modality_employee', 'gender_employee', 'recruitment_channel_employee',
            'marital_estatus_employee', 'join_year_employee', 'join_month_employee', 'performance_employee', 
            'work_modality_boss', 'gender_boss', 'recruitment_channel_boss']

num_cols = ['office_distance_employee', 'low_health_days_employee', 'average_permanence_employee', 'salary_employee',
            'performance_score_employee', 'psi_score_employee', 'join_age_employee', 'office_distance_boss', 'low_health_days_boss',
            'average_permanence_boss', 'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss', 'join_year_boss']

In [293]:
expanded_test_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2020 entries, 0 to 2019
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   seniority_employee            2020 non-null   int64  
 1   work_modality_employee        2020 non-null   object 
 2   office_distance_employee      2020 non-null   float64
 3   low_health_days_employee      2020 non-null   int64  
 4   gender_employee               2020 non-null   object 
 5   recruitment_channel_employee  2020 non-null   object 
 6   average_permanence_employee   2020 non-null   int64  
 7   salary_employee               2020 non-null   int64  
 8   performance_score_employee    2020 non-null   int64  
 9   psi_score_employee            2020 non-null   int64  
 10  marital_estatus_employee      2020 non-null   object 
 11  join_age_employee             2020 non-null   int64  
 12  join_year_employee            2020 non-null   int32  
 13  join_mon

In [294]:
# Encoding categorical columns

oh_encoder = OneHotEncoder(sparse_output=False, drop='first')

train_clean_encoded = pd.DataFrame(oh_encoder.fit_transform(expanded_train_df_clean[cat_cols]))
train_clean_encoded.columns = oh_encoder.get_feature_names_out(cat_cols)

test_clean_encoded = pd.DataFrame(oh_encoder.transform(expanded_test_df_clean[cat_cols]))
test_clean_encoded.columns = oh_encoder.get_feature_names_out(cat_cols)

In [295]:
# Transforming numerical columns

pt = PowerTransformer(method='yeo-johnson')

train_clean_scaled = pd.DataFrame(pt.fit_transform(expanded_train_df_clean[num_cols]), columns=num_cols)

test_clean_scaled = pd.DataFrame(pt.transform(expanded_test_df_clean[num_cols]), columns=num_cols)

In [296]:
# Preparing df to make numerical imputation

train_clean_final = pd.concat([train_clean_encoded, train_clean_scaled], axis=1)
test_clean_final = pd.concat([test_clean_encoded, test_clean_scaled], axis=1)

In [297]:
# Imputing missing values

knn_imp = KNNImputer()

train_clean_final = pd.DataFrame(knn_imp.fit_transform(train_clean_final), columns=train_clean_final.columns)
test_clean_final = pd.DataFrame(knn_imp.transform(test_clean_final), columns=train_clean_final.columns)

In [298]:
train_clean_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 55 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   seniority_employee_2                     2152 non-null   float64
 1   work_modality_employee_Presencial        2152 non-null   float64
 2   gender_employee_Mujer                    2152 non-null   float64
 3   recruitment_channel_employee_Headhunter  2152 non-null   float64
 4   recruitment_channel_employee_Linkedin    2152 non-null   float64
 5   recruitment_channel_employee_Portal Web  2152 non-null   float64
 6   recruitment_channel_employee_Referidos   2152 non-null   float64
 7   marital_estatus_employee_Divorciado      2152 non-null   float64
 8   marital_estatus_employee_Soltero         2152 non-null   float64
 9   marital_estatus_employee_Viudo           2152 non-null   float64
 10  join_year_employee_2013                  2152 no

In [299]:
test_clean_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 55 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   seniority_employee_2                     2020 non-null   float64
 1   work_modality_employee_Presencial        2020 non-null   float64
 2   gender_employee_Mujer                    2020 non-null   float64
 3   recruitment_channel_employee_Headhunter  2020 non-null   float64
 4   recruitment_channel_employee_Linkedin    2020 non-null   float64
 5   recruitment_channel_employee_Portal Web  2020 non-null   float64
 6   recruitment_channel_employee_Referidos   2020 non-null   float64
 7   marital_estatus_employee_Divorciado      2020 non-null   float64
 8   marital_estatus_employee_Soltero         2020 non-null   float64
 9   marital_estatus_employee_Viudo           2020 non-null   float64
 10  join_year_employee_2013                  2020 no

In [300]:
# Making inverse transformations

train_cat_cols = pd.DataFrame(oh_encoder.inverse_transform(train_clean_final[train_clean_encoded.columns]), columns=cat_cols)
test_cat_cols = pd.DataFrame(oh_encoder.inverse_transform(test_clean_final[train_clean_encoded.columns]), columns=cat_cols)

train_num_cols = pd.DataFrame(pt.inverse_transform(train_clean_final[num_cols]), columns=num_cols)
test_num_cols = pd.DataFrame(pt.inverse_transform(test_clean_final[num_cols]), columns=num_cols)

From original data it's known that all numerical columns but office_distance are integers.

In [301]:
# Changing dtypes

float_cols = ['office_distance_employee', 'office_distance_boss']
int_cols = [col for col in num_cols if col not in float_cols]

train_num_cols[int_cols] = train_num_cols[int_cols].astype(int)
test_num_cols[int_cols] = test_num_cols[int_cols].astype(int)

In [302]:
# Concatenating all features

train_final = pd.concat([train_ids, train_cat_cols, train_num_cols, y], axis=1)
test_final = pd.concat([test_ids, test_cat_cols, test_num_cols], axis=1)

In [303]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2152 entries, 0 to 2151
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2152 non-null   int64  
 1   id_last_boss_employee         2152 non-null   int32  
 2   seniority_employee            2152 non-null   object 
 3   work_modality_employee        2152 non-null   object 
 4   gender_employee               2152 non-null   object 
 5   recruitment_channel_employee  2152 non-null   object 
 6   marital_estatus_employee      2152 non-null   object 
 7   join_year_employee            2152 non-null   object 
 8   join_month_employee           2152 non-null   object 
 9   performance_employee          2152 non-null   object 
 10  work_modality_boss            2152 non-null   object 
 11  gender_boss                   2149 non-null   object 
 12  recruitment_channel_boss      2152 non-null   object 
 13  office_d

In [304]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2020 entries, 0 to 2019
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2020 non-null   int64  
 1   id_last_boss_employee         2020 non-null   int32  
 2   seniority_employee            2020 non-null   object 
 3   work_modality_employee        2020 non-null   object 
 4   gender_employee               2020 non-null   object 
 5   recruitment_channel_employee  2020 non-null   object 
 6   marital_estatus_employee      2020 non-null   object 
 7   join_year_employee            2020 non-null   object 
 8   join_month_employee           2020 non-null   object 
 9   performance_employee          2020 non-null   object 
 10  work_modality_boss            2020 non-null   object 
 11  gender_boss                   2020 non-null   object 
 12  recruitment_channel_boss      2020 non-null   object 
 13  office_d

## Creating new features

These new features allow us to make new features such as salary difference, years of join difference, age difference and boss_performance

In [305]:
# Creating new features

train_final['salary_diff'] = train_final['salary_boss'] - train_final['salary_employee']
train_final['age_diff'] = train_final['join_age_boss'] - train_final['join_age_employee']
train_final['performance_boss'] = ['high' if value >= 80 else 'low' for value in train_final['performance_score_boss']]
train_final['join_year_diff'] = train_final['join_year_boss'] - train_final['join_year_employee']
train_final['join_year_diff'] = train_final['join_year_diff'].astype(int)

test_final['salary_diff'] = test_final['salary_boss'] - test_final['salary_employee']
test_final['age_diff'] = test_final['join_age_boss'] - test_final['join_age_employee']
test_final['performance_boss'] = ['high' if value >= 80 else 'low' for value in test_final['performance_score_boss']]
test_final['join_year_diff'] = test_final['join_year_boss'] - test_final['join_year_employee']
test_final['join_year_diff'] = test_final['join_year_diff'].astype(int)

In [306]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2152 entries, 0 to 2151
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2152 non-null   int64  
 1   id_last_boss_employee         2152 non-null   int32  
 2   seniority_employee            2152 non-null   object 
 3   work_modality_employee        2152 non-null   object 
 4   gender_employee               2152 non-null   object 
 5   recruitment_channel_employee  2152 non-null   object 
 6   marital_estatus_employee      2152 non-null   object 
 7   join_year_employee            2152 non-null   object 
 8   join_month_employee           2152 non-null   object 
 9   performance_employee          2152 non-null   object 
 10  work_modality_boss            2152 non-null   object 
 11  gender_boss                   2149 non-null   object 
 12  recruitment_channel_boss      2152 non-null   object 
 13  office_d

In [307]:
train_final.describe()

Unnamed: 0,id_employee_employee,id_last_boss_employee,office_distance_employee,low_health_days_employee,average_permanence_employee,salary_employee,performance_score_employee,psi_score_employee,join_age_employee,office_distance_boss,...,average_permanence_boss,salary_boss,performance_score_boss,psi_score_boss,join_age_boss,join_year_boss,resign,salary_diff,age_diff,join_year_diff
count,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,...,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0,2152.0
mean,102070.160781,102090.369424,3.111768,2.197955,5.97816,457020.9,63.198885,75.281134,34.785781,2.932792,...,7.380576,1379830.0,58.6171,73.160781,39.078532,2017.08039,0.466078,922808.7,4.292751,-0.476766
std,1210.999342,43.775341,1.78905,2.907506,4.248552,302943.5,22.845863,6.063469,11.149553,1.389564,...,3.75439,299822.0,19.048892,4.646418,9.756918,2.515327,0.498964,427838.7,15.26596,4.386109
min,100001.0,102000.0,0.12,0.0,1.0,76521.0,4.0,58.0,9.0,0.92,...,0.0,405434.0,5.0,59.0,17.0,2012.0,0.0,-1210566.0,-37.0,-11.0
25%,101028.75,102054.0,1.81875,1.0,2.0,260439.8,43.0,71.0,27.0,2.070322,...,4.0,1261511.0,45.0,70.0,32.0,2015.0,0.0,696554.0,-7.0,-3.0
50%,102048.5,102090.5,2.51,1.0,5.0,373422.5,66.0,75.0,36.0,2.555081,...,7.0,1500664.0,58.0,73.0,39.0,2017.0,0.0,1002160.0,4.0,0.0
75%,103135.5,102127.0,4.15125,2.0,8.0,674192.8,88.0,79.0,44.0,3.3,...,9.0,1615999.0,74.0,76.0,46.0,2019.0,1.0,1249090.0,15.0,3.0
max,104171.0,102172.0,21.05,35.0,24.0,1900000.0,98.0,98.0,64.0,7.84,...,21.0,1899999.0,93.0,87.0,64.0,2022.0,1.0,1612250.0,48.0,10.0


In [308]:
train_final.describe(include='object')

Unnamed: 0,seniority_employee,work_modality_employee,gender_employee,recruitment_channel_employee,marital_estatus_employee,join_year_employee,join_month_employee,performance_employee,work_modality_boss,gender_boss,recruitment_channel_boss,performance_boss
count,2152,2152,2152,2152,2152,2152,2152,2152,2152,2149,2152,2152
unique,2,2,2,5,4,12,12,2,2,2,5,2
top,1,Presencial,Hombre,Portal Web,Soltero,2018,5,low,Presencial,Hombre,Portal Web,low
freq,2061,1530,1095,986,925,223,201,1387,1815,1198,988,1841


In [309]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2020 entries, 0 to 2019
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_employee_employee          2020 non-null   int64  
 1   id_last_boss_employee         2020 non-null   int32  
 2   seniority_employee            2020 non-null   object 
 3   work_modality_employee        2020 non-null   object 
 4   gender_employee               2020 non-null   object 
 5   recruitment_channel_employee  2020 non-null   object 
 6   marital_estatus_employee      2020 non-null   object 
 7   join_year_employee            2020 non-null   object 
 8   join_month_employee           2020 non-null   object 
 9   performance_employee          2020 non-null   object 
 10  work_modality_boss            2020 non-null   object 
 11  gender_boss                   2020 non-null   object 
 12  recruitment_channel_boss      2020 non-null   object 
 13  office_d

In [310]:
test_final.describe()

Unnamed: 0,id_employee_employee,id_last_boss_employee,office_distance_employee,low_health_days_employee,average_permanence_employee,salary_employee,performance_score_employee,psi_score_employee,join_age_employee,office_distance_boss,low_health_days_boss,average_permanence_boss,salary_boss,performance_score_boss,psi_score_boss,join_age_boss,join_year_boss,salary_diff,age_diff,join_year_diff
count,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0
mean,102103.807921,102089.980198,3.059394,2.160891,6.043564,463823.3,88.182178,75.438119,35.150495,2.807874,1.194554,7.603465,1396413.0,72.469307,73.617822,39.180198,2017.271287,932589.6,4.029703,-0.314851
std,1197.751058,43.819213,1.637131,2.742247,4.286436,306360.1,4.786836,6.026656,11.225287,1.169214,2.032736,4.471658,282238.1,17.833542,5.558058,8.604928,2.459109,422175.9,14.862057,4.252538
min,100000.0,102000.0,0.11,0.0,1.0,75517.0,80.0,58.0,9.0,0.73,0.0,0.0,488000.0,20.0,60.0,16.0,2012.0,-1412000.0,-38.0,-11.0
25%,101071.25,102052.0,1.805,1.0,2.0,266310.8,84.0,71.0,27.0,2.021268,0.0,4.0,1294810.0,57.0,70.0,34.0,2016.0,709996.5,-7.0,-3.0
50%,102117.5,102091.0,2.5425,1.0,5.0,374720.0,89.0,76.0,37.0,2.569255,1.0,7.0,1482532.0,79.0,74.0,39.0,2017.0,1004806.0,4.0,-1.0
75%,103126.25,102127.0,4.0,2.0,8.0,676307.5,91.0,79.0,44.0,3.323196,2.0,10.0,1615999.0,89.0,77.0,45.0,2019.0,1247610.0,14.0,3.0
max,104172.0,102172.0,14.045,23.0,26.0,1900000.0,98.0,98.0,63.0,7.16,17.0,26.0,1899999.0,97.0,94.0,62.0,2022.0,1803516.0,48.0,10.0


In [311]:
test_final.describe(include='object')

Unnamed: 0,seniority_employee,work_modality_employee,gender_employee,recruitment_channel_employee,marital_estatus_employee,join_year_employee,join_month_employee,performance_employee,work_modality_boss,gender_boss,recruitment_channel_boss,performance_boss
count,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
unique,2,2,2,5,4,12,12,1,2,2,5,2
top,1,Presencial,Mujer,Portal Web,Soltero,2013,4,high,Presencial,Hombre,Portal Web,low
freq,1937,1408,1037,869,800,203,201,2020,1609,1161,922,1060


## Saving clean data

In [312]:
train_final.to_csv(paths.data_interim_dir('train_clean.csv'), index=False, sep=',')

test_final.to_csv(paths.data_interim_dir('test_clean.csv'), index=False, sep=',')