# ETL Process to clean data before analysis

## Preparing environment

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Data Dictionary

* id_employee - Employee ID
* id_last_boss - Boss ID
* seniority - 1: For people who don't have anyone in charge, 2: For those who have people in charge
* work_modality - Work modality specified on employee contract
* office_distance -  Distance in kilometers from employee home to work.
* low_health_days -  Number of days of justified sick leave since the employee started working for the company.
* gender - Employee gender
* recruitment_channel - Indicates the means by which the application process took place. 
* average_permanence - Mean time in years that the employee stayed in previous jobs.
* birth_date - Birth date in format dd/mm/yyyy.
* salary - It is the amount corresponding to the monthly remuneration received.
* psi_score - Is the score obtained in psychometric test during selection process.
* join_date - Date when the employee started to work for the company.
* marital_estatus - Marital status.
* performance_score - Score obtained by the employee in their last quarterly performance test, if they had one.
* resign - "0" if the employee continues working or "1" if the employee left the company within their first 6 months.

## Extracting data

In [3]:
# Both raw files are read with the same options: ';' as separator and
# birth_date / join_date parsed as day-first dates.
train_df, test_df = (
    pd.read_csv(
        paths.data_raw_dir(file_name),
        sep=';',
        parse_dates=['birth_date', 'join_date'],
        dayfirst=True,
    )
    for file_name in ('train_data.csv', 'test_data.csv')
)

## Checking data

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id_employee          2152 non-null   int64         
 1   id_last_boss         2061 non-null   float64       
 2   seniority            2152 non-null   int64         
 3   work_modality        2152 non-null   object        
 4   office_distance      2152 non-null   float64       
 5   low_health_days      2152 non-null   int64         
 6   gender               2152 non-null   object        
 7   recruitment_channel  2152 non-null   object        
 8   average_permanence   2152 non-null   int64         
 9   birth_date           2152 non-null   datetime64[ns]
 10  salary               2152 non-null   int64         
 11  performance_score    2084 non-null   float64       
 12  psi_score            2152 non-null   int64         
 13  join_date            2152 non-nul

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id_employee          2020 non-null   int64         
 1   id_last_boss         1937 non-null   float64       
 2   seniority            2020 non-null   int64         
 3   work_modality        2020 non-null   object        
 4   office_distance      2020 non-null   float64       
 5   low_health_days      2020 non-null   int64         
 6   gender               2020 non-null   object        
 7   recruitment_channel  2020 non-null   object        
 8   average_permanence   2020 non-null   int64         
 9   birth_date           2020 non-null   datetime64[ns]
 10  salary               2020 non-null   int64         
 11  performance_score    2020 non-null   int64         
 12  psi_score            2020 non-null   int64         
 13  join_date            2020 non-nul

In [6]:
train_df.describe()

Unnamed: 0,id_employee,id_last_boss,seniority,office_distance,low_health_days,average_permanence,birth_date,salary,performance_score,psi_score,join_date,resign
count,2152.0,2061.0,2152.0,2152.0,2152.0,2152.0,2152,2152.0,2084.0,2152.0,2152,2152.0
mean,102070.160781,102090.326055,1.042286,3.111768,2.709108,6.644981,1982-08-05 21:32:07.137546432,457021.1,63.472169,75.286245,2018-01-18 17:42:36.133828864,0.466078
min,100001.0,102000.0,1.0,0.12,0.0,1.0,1957-01-25 00:00:00,76521.0,5.0,58.0,2012-01-04 00:00:00,0.0
25%,101028.75,102054.0,1.0,1.81875,1.0,3.0,1973-12-16 18:00:00,260439.8,42.0,71.0,2014-12-28 12:00:00,0.0
50%,102048.5,102091.0,1.0,2.51,2.0,6.0,1981-05-07 12:00:00,373422.5,65.0,75.0,2018-02-18 00:00:00,0.0
75%,103135.5,102127.0,1.0,4.15125,3.0,9.0,1991-04-29 18:00:00,674193.0,90.0,79.0,2021-02-23 00:00:00,1.0
max,104171.0,102172.0,2.0,21.05,35.0,25.0,2003-02-19 00:00:00,1900000.0,99.0,98.0,2023-12-24 00:00:00,1.0
std,1210.999342,43.58601,0.201288,1.78905,2.982975,4.283794,,302943.5,23.110594,6.050987,,0.498964


In [7]:
test_df.describe()

Unnamed: 0,id_employee,id_last_boss,seniority,office_distance,low_health_days,average_permanence,birth_date,salary,performance_score,psi_score,join_date
count,2020.0,1937.0,2020.0,2020.0,2020.0,2020.0,2020,2020.0,2020.0,2020.0,2020
mean,102103.807921,102089.646877,1.041089,3.059394,2.662376,6.713366,1982-04-05 03:26:43.960396032,463823.6,89.14604,75.44505,2018-01-28 08:31:50.495049472
min,100000.0,102000.0,1.0,0.11,0.0,1.0,1956-08-16 00:00:00,75517.0,80.0,58.0,2012-01-02 00:00:00
25%,101071.25,102052.0,1.0,1.805,1.0,3.0,1973-05-27 12:00:00,266311.0,85.0,71.0,2014-12-14 00:00:00
50%,102117.5,102089.0,1.0,2.5425,2.0,6.0,1981-03-23 00:00:00,374720.0,90.0,76.0,2018-02-15 12:00:00
75%,103126.25,102126.0,1.0,4.0,3.0,9.0,1990-11-19 12:00:00,676307.5,92.0,79.0,2021-03-25 06:00:00
max,104172.0,102172.0,2.0,14.045,23.0,27.0,2003-01-14 00:00:00,1900000.0,99.0,98.0,2023-12-24 00:00:00
std,1197.751058,43.691244,0.198545,1.637131,2.817351,4.30758,,306360.1,4.851837,6.00944,


Observing the statistics summary, we can see that there are some outliers in `office_distance`, `low_health_days`, `average_permanence` and `salary` for both train and test data. They will be analyzed and actions will be taken during the EDA.

## Looking for typo errors in object data

In [8]:
train_df.describe(include='object')

Unnamed: 0,work_modality,gender,recruitment_channel,marital_estatus
count,2152,2152,2152,2152
unique,2,2,5,4
top,Presencial,Hombre,Portal Web,Soltero
freq,1530,1095,986,925


In [9]:
test_df.describe(include='object')

Unnamed: 0,work_modality,gender,recruitment_channel,marital_estatus
count,2020,2020,2020,2020
unique,2,2,5,4
top,Presencial,Mujer,Portal Web,Soltero
freq,1408,1037,869,800


In [10]:
# Distinct work_modality labels: train on the first line, test on the second
print(
    train_df['work_modality'].unique(),
    test_df['work_modality'].unique(),
    sep='\n',
)

['Híbrida' 'Presencial']
['Híbrida' 'Presencial']


In [11]:
# Distinct recruitment_channel labels: train on the first line, test on the second
print(
    train_df['recruitment_channel'].unique(),
    test_df['recruitment_channel'].unique(),
    sep='\n',
)

['Ferias & Networking' 'Referidos' 'Linkedin' 'Portal Web' 'Headhunter']
['Ferias & Networking' 'Portal Web' 'Linkedin' 'Referidos' 'Headhunter']


In [12]:
# Distinct marital_estatus labels: train on the first line, test on the second
print(
    train_df['marital_estatus'].unique(),
    test_df['marital_estatus'].unique(),
    sep='\n',
)

['Soltero' 'Viudo' 'Divorciado' 'Casado']
['Soltero' 'Casado' 'Viudo' 'Divorciado']


From the data dictionary we can see that `seniority` and `id_last_boss` are categories too. However, id_last_boss has many categories, so it will be analyzed during the hypothesis testing phase.

In [13]:
cat_cols = [
    'work_modality',
    'gender',
    'recruitment_channel',
    'marital_estatus',
    'seniority',
]

# Helper that prints the share of each category in a column


def check_categories_proportion(df, col):
    """
    Print the percentage share of every category found in one column.

    The shares are computed with ``value_counts(normalize=True)`` and scaled
    to percentages, so they are listed from the most to the least frequent
    category.

    Parameters:
    df (pandas.DataFrame): Frame that holds the column to inspect.
    col (str): Name of the column whose categories are summarised.

    Returns:
    None: The percentages are printed; nothing is returned.

    Example:
    For a column holding ['A', 'B', 'A', 'C', 'B', 'A'] the printed Series
    lists A at 50.0, B at about 33.3 and C at about 16.7. The header and
    footer of the printout depend on the installed pandas version.
    """
    percentages = 100 * df[col].value_counts(normalize=True)
    print(percentages)

In [14]:
# Category shares (in %) for every categorical column of the training set
for col in cat_cols:
    check_categories_proportion(df=train_df, col=col)
    print(40 * '-')

work_modality
Presencial    71.096654
Híbrida       28.903346
Name: proportion, dtype: float64
----------------------------------------
gender
Hombre    50.8829
Mujer     49.1171
Name: proportion, dtype: float64
----------------------------------------
recruitment_channel
Portal Web             45.817844
Linkedin               22.676580
Referidos              15.706320
Ferias & Networking    14.312268
Headhunter              1.486989
Name: proportion, dtype: float64
----------------------------------------
marital_estatus
Soltero       42.983271
Casado        38.708178
Divorciado    10.315985
Viudo          7.992565
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.771375
2     4.228625
Name: proportion, dtype: float64
----------------------------------------


In [15]:
# Category shares (in %) for every categorical column of the test set
for col in cat_cols:
    check_categories_proportion(df=test_df, col=col)
    print(40 * '-')

work_modality
Presencial    69.70297
Híbrida       30.29703
Name: proportion, dtype: float64
----------------------------------------
gender
Mujer     51.336634
Hombre    48.663366
Name: proportion, dtype: float64
----------------------------------------
recruitment_channel
Portal Web             43.019802
Linkedin               25.247525
Referidos              16.336634
Ferias & Networking    14.059406
Headhunter              1.336634
Name: proportion, dtype: float64
----------------------------------------
marital_estatus
Soltero       39.603960
Casado        38.514851
Divorciado    12.425743
Viudo          9.455446
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.891089
2     4.108911
Name: proportion, dtype: float64
----------------------------------------


Categories proportion in both train and test data are similar.

## Extracting info from date columns

In [16]:
# Calculating age at join date, in completed calendar years


def _completed_years(start, end):
    """
    Return the whole calendar years elapsed between two datetime Series.

    Parameters:
    start (pandas.Series): Earlier datetimes (here: birth dates).
    end (pandas.Series): Later datetimes (here: join dates).

    Returns:
    pandas.Series: Integer years; a year is counted only once the
    (month, day) anniversary of `start` has been reached in `end`.
    """
    years = end.dt.year - start.dt.year
    # The anniversary has not happened yet when (month, day) of `end`
    # comes before (month, day) of `start`.
    before_anniversary = (end.dt.month < start.dt.month) | (
        (end.dt.month == start.dt.month) & (end.dt.day < start.dt.day)
    )
    return years - before_anniversary.astype(int)


# `days // 365` ignores leap days, so it overstates the age by one year for
# people who joined shortly before their birthday; count calendar years instead.
train_df['join_age'] = _completed_years(train_df['birth_date'], train_df['join_date'])

test_df['join_age'] = _completed_years(test_df['birth_date'], test_df['join_date'])

In [17]:
# Year and month of joining, stored as separate columns (season info)

train_df = train_df.assign(
    join_year=train_df['join_date'].dt.year,
    join_month=train_df['join_date'].dt.month,
)

test_df = test_df.assign(
    join_year=test_df['join_date'].dt.year,
    join_month=test_df['join_date'].dt.month,
)

In [18]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id_employee          2152 non-null   int64         
 1   id_last_boss         2061 non-null   float64       
 2   seniority            2152 non-null   int64         
 3   work_modality        2152 non-null   object        
 4   office_distance      2152 non-null   float64       
 5   low_health_days      2152 non-null   int64         
 6   gender               2152 non-null   object        
 7   recruitment_channel  2152 non-null   object        
 8   average_permanence   2152 non-null   int64         
 9   birth_date           2152 non-null   datetime64[ns]
 10  salary               2152 non-null   int64         
 11  performance_score    2084 non-null   float64       
 12  psi_score            2152 non-null   int64         
 13  join_date            2152 non-nul

In [19]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id_employee          2020 non-null   int64         
 1   id_last_boss         1937 non-null   float64       
 2   seniority            2020 non-null   int64         
 3   work_modality        2020 non-null   object        
 4   office_distance      2020 non-null   float64       
 5   low_health_days      2020 non-null   int64         
 6   gender               2020 non-null   object        
 7   recruitment_channel  2020 non-null   object        
 8   average_permanence   2020 non-null   int64         
 9   birth_date           2020 non-null   datetime64[ns]
 10  salary               2020 non-null   int64         
 11  performance_score    2020 non-null   int64         
 12  psi_score            2020 non-null   int64         
 13  join_date            2020 non-nul

## Missing values treatment

Before imputing missing values, let's explore them by concatenating both DataFrames. From the `info()` output we know that there are null values in `id_last_boss` in both the train and test datasets, and in `performance_score` in the train dataset only.

Additionally, since bosses are employees of the company as well, they are included in the employee list and they have seniority 2.

In [20]:
# Train (without its target column) and test are stacked into one frame
# with a fresh 0..n-1 index, since they have similar characteristics.

concat_df = pd.concat(
    [train_df.drop(columns='resign'), test_df],
    ignore_index=True,
)

In [21]:
# Number of rows having 0, 1, 2, ... missing cells
(
    concat_df
    .isna()
    .sum(axis='columns')
    .value_counts()
    .to_frame()
)

Unnamed: 0,count
0,3932
1,238
2,2


There are only 2 rows with 2 missing values, and 238 rows with exactly 1 missing value. Let's explore the null values in `id_last_boss`.

In [22]:
# Rows whose id_last_boss is missing
concat_df.loc[concat_df['id_last_boss'].isna()]

Unnamed: 0,id_employee,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,birth_date,salary,performance_score,psi_score,join_date,marital_estatus,join_age,join_year,join_month
105,102159,,2,Híbrida,0.98,2,Hombre,Referidos,14,1977-11-19,808000,90.0,86,2022-08-18,Soltero,44,2022,8
186,102023,,2,Híbrida,4.90,1,Hombre,Portal Web,4,1996-11-29,539000,80.0,65,2019-12-29,Casado,23,2019,12
485,102068,,2,Híbrida,2.17,3,Mujer,Referidos,4,1989-07-05,896000,41.0,81,2012-08-25,Soltero,23,2012,8
502,102016,,2,Híbrida,2.51,2,Mujer,Portal Web,6,1973-02-07,1616000,40.0,80,2017-08-31,Casado,44,2017,8
585,102173,,2,Híbrida,2.27,1,Hombre,Portal Web,6,1968-08-04,1616000,22.0,82,2018-04-05,Divorciado,49,2018,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4142,102084,,2,Presencial,2.62,3,Mujer,Portal Web,14,1972-11-15,1616000,89.0,76,2023-07-22,Divorciado,50,2023,7
4143,102071,,2,Presencial,0.87,1,Hombre,Portal Web,11,1972-03-13,1616000,85.0,72,2023-11-11,Viudo,51,2023,11
4144,102045,,2,Presencial,3.19,3,Hombre,Portal Web,11,1970-03-21,1616000,97.0,65,2023-09-26,Casado,53,2023,9
4145,102066,,2,Presencial,3.62,3,Hombre,Referidos,5,1968-02-27,1616000,97.0,84,2023-12-06,Casado,55,2023,12


In [23]:
# Seniority distribution among the rows with a missing id_last_boss
concat_df.loc[concat_df['id_last_boss'].isna(), 'seniority'].value_counts()

seniority
2    174
Name: count, dtype: int64

All the seniority values are 2, which means that all these employees are bosses that lead their own division and don't have any superior. This means that missing values can be imputed with their own IDs.

In [24]:
# Employees with no boss recorded get their own employee id as id_last_boss.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and no longer updates concat_df under copy-on-write.
concat_df['id_last_boss'] = concat_df['id_last_boss'].fillna(concat_df['id_employee'])

In [25]:
concat_df[concat_df['seniority'] == 2]

Unnamed: 0,id_employee,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,birth_date,salary,performance_score,psi_score,join_date,marital_estatus,join_age,join_year,join_month
105,102159,102159.0,2,Híbrida,0.98,2,Hombre,Referidos,14,1977-11-19,808000,90.0,86,2022-08-18,Soltero,44,2022,8
186,102023,102023.0,2,Híbrida,4.90,1,Hombre,Portal Web,4,1996-11-29,539000,80.0,65,2019-12-29,Casado,23,2019,12
485,102068,102068.0,2,Híbrida,2.17,3,Mujer,Referidos,4,1989-07-05,896000,41.0,81,2012-08-25,Soltero,23,2012,8
502,102016,102016.0,2,Híbrida,2.51,2,Mujer,Portal Web,6,1973-02-07,1616000,40.0,80,2017-08-31,Casado,44,2017,8
585,102173,102173.0,2,Híbrida,2.27,1,Hombre,Portal Web,6,1968-08-04,1616000,22.0,82,2018-04-05,Divorciado,49,2018,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4142,102084,102084.0,2,Presencial,2.62,3,Mujer,Portal Web,14,1972-11-15,1616000,89.0,76,2023-07-22,Divorciado,50,2023,7
4143,102071,102071.0,2,Presencial,0.87,1,Hombre,Portal Web,11,1972-03-13,1616000,85.0,72,2023-11-11,Viudo,51,2023,11
4144,102045,102045.0,2,Presencial,3.19,3,Hombre,Portal Web,11,1970-03-21,1616000,97.0,65,2023-09-26,Casado,53,2023,9
4145,102066,102066.0,2,Presencial,3.62,3,Hombre,Referidos,5,1968-02-27,1616000,97.0,84,2023-12-06,Casado,55,2023,12


In [26]:
# Number of rows having 0, 1, 2, ... missing cells, after filling id_last_boss
(
    concat_df
    .isna()
    .sum(axis='columns')
    .value_counts()
    .to_frame()
)

Unnamed: 0,count
0,4104
1,68


Since there are few missing values, we can impute them using an imputation method. In order to preserve the distribution of the sample, we can use an IterativeImputer with a RandomForestRegressor. 

In [27]:
# Column roles used by the imputation step

num_cols = [
    'office_distance',
    'low_health_days',
    'average_permanence',
    'salary',
    'performance_score',
    'psi_score',
    'join_age',
]
cat_cols = [
    'id_last_boss',
    'seniority',
    'work_modality',
    'gender',
    'recruitment_channel',
    'marital_estatus',
]

# join_year and join_month are categories as well, but they are already numeric

In [28]:
# Columns set aside because they take no part in the imputation

feat_not_used = ['id_employee', 'birth_date', 'join_date']

concat_fnu = concat_df.loc[:, feat_not_used]

# Target column of the training set, kept as a one-column DataFrame
y = train_df.loc[:, ['resign']]

In [29]:
# Casting id_last_boss to integer ids

concat_df = concat_df.astype({'id_last_boss': int})

In [30]:
# Feature matrix for the imputer: every column except the set-aside ones

X = concat_df.drop(columns=feat_not_used)

In [31]:
# One LabelEncoder per categorical column, kept so the codes can be decoded later

label_encoders = {name: LabelEncoder() for name in cat_cols}

# Values are cast to str before fitting, so every encoder learns string labels

for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col].astype(str))

In [32]:
X.head()

Unnamed: 0,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month
0,74,0,0,1.76,1,1,0,1,140011,99.0,75,2,18,2018,1
1,115,0,0,0.76,2,0,0,2,182774,99.0,81,2,24,2021,7
2,60,0,0,4.95,3,1,4,11,682106,96.0,74,3,45,2016,7
3,62,0,0,13.03,2,0,2,2,270232,96.0,82,2,22,2014,7
4,62,0,0,13.045,2,0,2,2,266804,95.0,82,2,22,2014,7


In [33]:
# Initializing the IterativeImputer with RandomForestRegressor as the estimator.
# The forest gets its own random_state so that repeated runs produce the same
# imputed values; the imputer's random_state is not passed to the estimator.
# min_value / max_value are scalars, so they bound every imputed feature by
# the observed performance_score range.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(random_state=42),
    max_iter=20,
    min_value=X.performance_score.min(),
    max_value=X.performance_score.max(),
    random_state=42,
)

# Impute missing values, keeping X's index so the result stays aligned with X
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [34]:
X_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4172 entries, 0 to 4171
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_last_boss         4172 non-null   float64
 1   seniority            4172 non-null   float64
 2   work_modality        4172 non-null   float64
 3   office_distance      4172 non-null   float64
 4   low_health_days      4172 non-null   float64
 5   gender               4172 non-null   float64
 6   recruitment_channel  4172 non-null   float64
 7   average_permanence   4172 non-null   float64
 8   salary               4172 non-null   float64
 9   performance_score    4172 non-null   float64
 10  psi_score            4172 non-null   float64
 11  marital_estatus      4172 non-null   float64
 12  join_age             4172 non-null   float64
 13  join_year            4172 non-null   float64
 14  join_month           4172 non-null   float64
dtypes: float64(15)
memory usage: 489.0 KB


No missing values anymore

In [35]:
X_imputed.head()

Unnamed: 0,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month
0,74.0,0.0,0.0,1.76,1.0,1.0,0.0,1.0,140011.0,99.0,75.0,2.0,18.0,2018.0,1.0
1,115.0,0.0,0.0,0.76,2.0,0.0,0.0,2.0,182774.0,99.0,81.0,2.0,24.0,2021.0,7.0
2,60.0,0.0,0.0,4.95,3.0,1.0,4.0,11.0,682106.0,96.0,74.0,3.0,45.0,2016.0,7.0
3,62.0,0.0,0.0,13.03,2.0,0.0,2.0,2.0,270232.0,96.0,82.0,2.0,22.0,2014.0,7.0
4,62.0,0.0,0.0,13.045,2.0,0.0,2.0,2.0,266804.0,95.0,82.0,2.0,22.0,2014.0,7.0


In [36]:
# Converting the features back to integers (office_distance stays float).
# Imputed values can be fractional, and astype(int) alone truncates them,
# so they are rounded to the nearest integer first.

for col in X_imputed.columns:
    if col != 'office_distance':
        X_imputed[col] = X_imputed[col].round().astype(int)

X_imputed.head()

Unnamed: 0,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month
0,74,0,0,1.76,1,1,0,1,140011,99,75,2,18,2018,1
1,115,0,0,0.76,2,0,0,2,182774,99,81,2,24,2021,7
2,60,0,0,4.95,3,1,4,11,682106,96,74,3,45,2016,7
3,62,0,0,13.03,2,0,2,2,270232,96,82,2,22,2014,7
4,62,0,0,13.045,2,0,2,2,266804,95,82,2,22,2014,7


In [37]:
# Turning the integer codes back into the original category labels

for col in cat_cols:
    decoder = label_encoders[col]
    X_imputed[col] = decoder.inverse_transform(X_imputed[col])

X_imputed.head()

Unnamed: 0,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month
0,102074,1,Híbrida,1.76,1,Mujer,Ferias & Networking,1,140011,99,75,Soltero,18,2018,1
1,102115,1,Híbrida,0.76,2,Hombre,Ferias & Networking,2,182774,99,81,Soltero,24,2021,7
2,102060,1,Híbrida,4.95,3,Mujer,Referidos,11,682106,96,74,Viudo,45,2016,7
3,102062,1,Híbrida,13.03,2,Hombre,Linkedin,2,270232,96,82,Soltero,22,2014,7
4,102062,1,Híbrida,13.045,2,Hombre,Linkedin,2,266804,95,82,Soltero,22,2014,7


In [38]:
# Putting the set-aside columns back, in front of the imputed features

concat_df_final = pd.concat([concat_fnu, X_imputed], axis='columns')
concat_df_final.head()

Unnamed: 0,id_employee,birth_date,join_date,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month
0,100247,1999-11-25,2018-01-25,102074,1,Híbrida,1.76,1,Mujer,Ferias & Networking,1,140011,99,75,Soltero,18,2018,1
1,103355,1996-10-16,2021-07-24,102115,1,Híbrida,0.76,2,Hombre,Ferias & Networking,2,182774,99,81,Soltero,24,2021,7
2,100669,1971-01-29,2016-07-13,102060,1,Híbrida,4.95,3,Mujer,Referidos,11,682106,96,74,Viudo,45,2016,7
3,103760,1992-04-28,2014-07-21,102062,1,Híbrida,13.03,2,Hombre,Linkedin,2,270232,96,82,Soltero,22,2014,7
4,100965,1992-05-03,2014-07-22,102062,1,Híbrida,13.045,2,Hombre,Linkedin,2,266804,95,82,Soltero,22,2014,7


In [39]:
concat_df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4172 entries, 0 to 4171
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id_employee          4172 non-null   int64         
 1   birth_date           4172 non-null   datetime64[ns]
 2   join_date            4172 non-null   datetime64[ns]
 3   id_last_boss         4172 non-null   object        
 4   seniority            4172 non-null   object        
 5   work_modality        4172 non-null   object        
 6   office_distance      4172 non-null   float64       
 7   low_health_days      4172 non-null   int32         
 8   gender               4172 non-null   object        
 9   recruitment_channel  4172 non-null   object        
 10  average_permanence   4172 non-null   int32         
 11  salary               4172 non-null   int32         
 12  performance_score    4172 non-null   int32         
 13  psi_score            4172 non-nul

## Creating new features

In [40]:
# Performance flag: 'high' for a performance_score of 80 or more, otherwise 'low'
concat_df_final['performance'] = concat_df_final['performance_score'].map(
    lambda score: 'high' if score >= 80 else 'low'
)

# Age group at joining: up to 26 -> 'young', 27 to 59 -> 'adult', above -> 'old_adult'
concat_df_final['join_age_group'] = concat_df_final['join_age'].map(
    lambda age: 'young' if age <= 26 else ('adult' if age <= 59 else 'old_adult')
)

In [41]:
concat_df_final.head()

Unnamed: 0,id_employee,birth_date,join_date,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,average_permanence,salary,performance_score,psi_score,marital_estatus,join_age,join_year,join_month,performance,join_age_group
0,100247,1999-11-25,2018-01-25,102074,1,Híbrida,1.76,1,Mujer,Ferias & Networking,1,140011,99,75,Soltero,18,2018,1,high,young
1,103355,1996-10-16,2021-07-24,102115,1,Híbrida,0.76,2,Hombre,Ferias & Networking,2,182774,99,81,Soltero,24,2021,7,high,young
2,100669,1971-01-29,2016-07-13,102060,1,Híbrida,4.95,3,Mujer,Referidos,11,682106,96,74,Viudo,45,2016,7,high,adult
3,103760,1992-04-28,2014-07-21,102062,1,Híbrida,13.03,2,Hombre,Linkedin,2,270232,96,82,Soltero,22,2014,7,high,young
4,100965,1992-05-03,2014-07-22,102062,1,Híbrida,13.045,2,Hombre,Linkedin,2,266804,95,82,Soltero,22,2014,7,high,young


## Joining dataframe to create new features

Boss ids are also present in employee id, so we can use this information to create new features that can help to improve the results

In [42]:
# Changing id_last_boss dtype so it can be matched against id_employee
concat_df_final['id_last_boss'] = concat_df_final['id_last_boss'].astype(int)

# Self-join: each row receives the record of its boss, with the boss-side
# columns suffixed by '_boss'. how='left' keeps every employee row.
expanded_df = pd.merge(concat_df_final, concat_df_final, how='left',
                       left_on='id_last_boss', right_on='id_employee',
                       suffixes=('', '_boss'))

# The train/test split further down is positional, so the merge must not add
# rows. It would if a boss id appeared more than once in id_employee.
assert len(expanded_df) == len(concat_df_final), (
    'Self-merge changed the number of rows: a boss id matches several id_employee rows'
)

In [43]:
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4172 entries, 0 to 4171
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id_employee               4172 non-null   int64         
 1   birth_date                4172 non-null   datetime64[ns]
 2   join_date                 4172 non-null   datetime64[ns]
 3   id_last_boss              4172 non-null   int32         
 4   seniority                 4172 non-null   object        
 5   work_modality             4172 non-null   object        
 6   office_distance           4172 non-null   float64       
 7   low_health_days           4172 non-null   int32         
 8   gender                    4172 non-null   object        
 9   recruitment_channel       4172 non-null   object        
 10  average_permanence        4172 non-null   int32         
 11  salary                    4172 non-null   int32         
 12  performance_score   

In [44]:
expanded_df.head()

Unnamed: 0,id_employee,birth_date,join_date,id_last_boss,seniority,work_modality,office_distance,low_health_days,gender,recruitment_channel,...,average_permanence_boss,salary_boss,performance_score_boss,psi_score_boss,marital_estatus_boss,join_age_boss,join_year_boss,join_month_boss,performance_boss,join_age_group_boss
0,100247,1999-11-25,2018-01-25,102074,1,Híbrida,1.76,1,Mujer,Ferias & Networking,...,4,1616000,80,83,Viudo,43,2015,12,high,adult
1,103355,1996-10-16,2021-07-24,102115,1,Híbrida,0.76,2,Hombre,Ferias & Networking,...,4,1616000,94,76,Soltero,35,2018,12,high,adult
2,100669,1971-01-29,2016-07-13,102060,1,Híbrida,4.95,3,Mujer,Referidos,...,16,1616000,82,62,Casado,45,2013,10,high,adult
3,103760,1992-04-28,2014-07-21,102062,1,Híbrida,13.03,2,Hombre,Linkedin,...,6,692000,38,72,Soltero,21,2014,10,low,young
4,100965,1992-05-03,2014-07-22,102062,1,Híbrida,13.045,2,Hombre,Linkedin,...,6,692000,38,72,Soltero,21,2014,10,low,young


In [45]:
# Removing boss-side columns that carry no extra information

expanded_df = expanded_df.drop(columns=[
    'id_employee_boss',   # Is the same id_last_boss
    'id_last_boss_boss',  # Is the same id_last_boss
    'seniority_boss',     # It's 2 for all bosses
])

## Creating new features

The boss columns added by the merge allow us to derive new features, such as differences in numerical columns or whether an employee joined before or after their boss.

In [46]:
# Creating difference columns (boss value minus employee value)

expanded_df['salary_diff'] = expanded_df['salary_boss'] - expanded_df['salary']
expanded_df['join_days_diff'] = (expanded_df['join_date_boss'] - expanded_df['join_date']).dt.days
# 0 when the boss joined strictly later than the employee, 1 otherwise
expanded_df['joined_after_boss'] = np.where(expanded_df['join_days_diff'] > 0, 0, 1)

birth_days_diff = (expanded_df['birth_date_boss'] - expanded_df['birth_date']).dt.days
expanded_df['age_diff'] = birth_days_diff // 365
# The flag is decided on the exact day difference. Using the floor-divided
# years marked an employee born up to 364 days before the boss (age_diff == 0)
# as younger. 0 when the boss was born strictly later, 1 otherwise.
expanded_df['younger_than_boss'] = np.where(birth_days_diff > 0, 0, 1)

In [47]:
# Removing the raw date columns of both the employee and the boss

expanded_df = expanded_df.drop(
    columns=['join_date', 'join_date_boss', 'birth_date_boss', 'birth_date']
)

In [48]:
# Mean of each numerical employee feature among the rows sharing the same
# id_last_boss, broadcast back to every row of that group

boss_groups = expanded_df.groupby('id_last_boss')

expanded_df = expanded_df.assign(
    avg_od_epb=boss_groups['office_distance'].transform('mean'),
    avg_lhd_epb=boss_groups['low_health_days'].transform('mean'),
    avg_avgp_epb=boss_groups['average_permanence'].transform('mean'),
    avg_sal_epb=boss_groups['salary'].transform('mean'),
    avg_ps_epb=boss_groups['performance_score'].transform('mean'),
    avg_psis_epb=boss_groups['psi_score'].transform('mean'),
    avg_ja_epb=boss_groups['join_age'].transform('mean'),
)

In [49]:
# Number of rows per id_last_boss, minus 1 because bosses carry their own id
# in id_last_boss

freq_enc_boss = expanded_df['id_last_boss'].value_counts().sub(1)

expanded_df = expanded_df.assign(
    boss_employees_in_charge=expanded_df['id_last_boss'].map(freq_enc_boss)
)

In [50]:
# Separating train and test df.
# The training rows were stacked first, so the first len(y) rows of
# expanded_df are the training set. The boundary is derived from y instead of
# the hard-coded 2151 / 2152, which would break if the raw files changed.

n_train = len(y)

train_final = pd.concat([expanded_df.iloc[:n_train], y], axis=1)
test_final = expanded_df.iloc[n_train:].reset_index(drop=True)

In [51]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 47 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id_employee               2152 non-null   int64  
 1   id_last_boss              2152 non-null   int32  
 2   seniority                 2152 non-null   object 
 3   work_modality             2152 non-null   object 
 4   office_distance           2152 non-null   float64
 5   low_health_days           2152 non-null   int32  
 6   gender                    2152 non-null   object 
 7   recruitment_channel       2152 non-null   object 
 8   average_permanence        2152 non-null   int32  
 9   salary                    2152 non-null   int32  
 10  performance_score         2152 non-null   int32  
 11  psi_score                 2152 non-null   int32  
 12  marital_estatus           2152 non-null   object 
 13  join_age                  2152 non-null   int32  
 14  join_yea

In [52]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id_employee               2020 non-null   int64  
 1   id_last_boss              2020 non-null   int32  
 2   seniority                 2020 non-null   object 
 3   work_modality             2020 non-null   object 
 4   office_distance           2020 non-null   float64
 5   low_health_days           2020 non-null   int32  
 6   gender                    2020 non-null   object 
 7   recruitment_channel       2020 non-null   object 
 8   average_permanence        2020 non-null   int32  
 9   salary                    2020 non-null   int32  
 10  performance_score         2020 non-null   int32  
 11  psi_score                 2020 non-null   int32  
 12  marital_estatus           2020 non-null   object 
 13  join_age                  2020 non-null   int32  
 14  join_yea

## Saving clean data

In [53]:
# Writing both cleaned sets as comma-separated files, without the index column
train_final.to_csv(
    paths.data_interim_dir('train_clean.csv'),
    sep=',',
    index=False,
)

test_final.to_csv(
    paths.data_interim_dir('test_clean.csv'),
    sep=',',
    index=False,
)