# ETL Process to clean data before analysis

## Preparing environment

In [1]:
import pandas as pd

In [8]:
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Data Dictionary

* id_colaborador - Employee ID
* id_ultimo_jefe - Boss ID
* seniority - 1: employees who have no one reporting to them; 2: employees who have people in their charge.
* modalidad_trabajo - Work modality specified on employee contract
* distancia_oficina -  Distance in kilometers from employee home to work.
* dias_baja_salud - Number of days of justified sick leave since the employee started working for the company.
* genero - Employee gender
* canal_reclutamiento - Indicates the means by which the application process took place. 
* permanencia_promedio - Mean time, in years, that the employee stayed in each of their previous jobs.
* fecha_nacimiento - Birth date in format dd/mm/yyyy.
* salario - It is the amount corresponding to the monthly remuneration received.
* psi_score - Is the score obtained in psychometric test during selection process.
* fecha_incorporacion - Date when the employee started to work for the company.
* estado_civil - Marital Status.
* performance_score - Score obtained by the employee in their last quarterly performance test, if they took one.
* abandono_6meses - "0" if the employee continues working, or "1" if the employee left the company within their first 6 months.

## Extracting data

In [11]:
# Both raw files are read with the same ';' delimiter; the file locations are
# resolved through the project's paths.data_raw_dir helper.
train_df, test_df = (
    pd.read_csv(paths.data_raw_dir(file_name), sep=';')
    for file_name in ('train_data.csv', 'test_data.csv')
)

## Checking data

In [12]:
# Inspect column dtypes and non-null counts of the raw training set.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2152 non-null   int64  
 1   id_ultimo_jefe        2061 non-null   float64
 2   seniority             2152 non-null   int64  
 3   modalidad_trabajo     2152 non-null   object 
 4   distancia_oficina     2152 non-null   float64
 5   dias_baja_salud       2152 non-null   int64  
 6   genero                2152 non-null   object 
 7   canal_reclutamiento   2152 non-null   object 
 8   permanencia_promedio  2152 non-null   int64  
 9   fecha_nacimiento      2152 non-null   object 
 10  salario               2152 non-null   int64  
 11  performance_score     2084 non-null   float64
 12  psi_score             2152 non-null   int64  
 13  fecha_incorporacion   2152 non-null   object 
 14  estado_civil          2152 non-null   object 
 15  abandono_6meses      

In [13]:
# Inspect column dtypes and non-null counts of the raw test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2020 non-null   int64  
 1   id_ultimo_jefe        1937 non-null   float64
 2   seniority             2020 non-null   int64  
 3   modalidad_trabajo     2020 non-null   object 
 4   distancia_oficina     2020 non-null   float64
 5   dias_baja_salud       2020 non-null   int64  
 6   genero                2020 non-null   object 
 7   canal_reclutamiento   2020 non-null   object 
 8   permanencia_promedio  2020 non-null   int64  
 9   fecha_nacimiento      2020 non-null   object 
 10  salario               2020 non-null   int64  
 11  performance_score     2020 non-null   int64  
 12  psi_score             2020 non-null   int64  
 13  fecha_incorporacion   2020 non-null   object 
 14  estado_civil          2020 non-null   object 
dtypes: float64(2), int64(

In [14]:
# Summary statistics of the numeric columns in the training set.
train_df.describe()

Unnamed: 0,id_colaborador,id_ultimo_jefe,seniority,distancia_oficina,dias_baja_salud,permanencia_promedio,salario,performance_score,psi_score,abandono_6meses
count,2152.0,2061.0,2152.0,2152.0,2152.0,2152.0,2152.0,2084.0,2152.0,2152.0
mean,102070.160781,102090.326055,1.042286,3.111768,2.709108,6.644981,457021.1,63.472169,75.286245,0.466078
std,1210.999342,43.58601,0.201288,1.78905,2.982975,4.283794,302943.5,23.110594,6.050987,0.498964
min,100001.0,102000.0,1.0,0.12,0.0,1.0,76521.0,5.0,58.0,0.0
25%,101028.75,102054.0,1.0,1.81875,1.0,3.0,260439.8,42.0,71.0,0.0
50%,102048.5,102091.0,1.0,2.51,2.0,6.0,373422.5,65.0,75.0,0.0
75%,103135.5,102127.0,1.0,4.15125,3.0,9.0,674193.0,90.0,79.0,1.0
max,104171.0,102172.0,2.0,21.05,35.0,25.0,1900000.0,99.0,98.0,1.0


In [15]:
# Summary statistics of the numeric columns in the test set.
test_df.describe()

Unnamed: 0,id_colaborador,id_ultimo_jefe,seniority,distancia_oficina,dias_baja_salud,permanencia_promedio,salario,performance_score,psi_score
count,2020.0,1937.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0
mean,102103.807921,102089.646877,1.041089,3.059394,2.662376,6.713366,463823.6,89.14604,75.44505
std,1197.751058,43.691244,0.198545,1.637131,2.817351,4.30758,306360.1,4.851837,6.00944
min,100000.0,102000.0,1.0,0.11,0.0,1.0,75517.0,80.0,58.0
25%,101071.25,102052.0,1.0,1.805,1.0,3.0,266311.0,85.0,71.0
50%,102117.5,102089.0,1.0,2.5425,2.0,6.0,374720.0,90.0,76.0
75%,103126.25,102126.0,1.0,4.0,3.0,9.0,676307.5,92.0,79.0
max,104172.0,102172.0,2.0,14.045,23.0,27.0,1900000.0,99.0,98.0


Observing the statistics summary, we can see that there are some outliers in `distancia_oficina`, `dias_baja_salud`, `permanencia_promedio` and `salario` for both train and test data. They will be analyzed and actions will be taken during the EDA.

## Looking for typos in object data

In [18]:
# Count, number of unique values and most frequent value of each object-dtype column (training set).
train_df.describe(include='object')

Unnamed: 0,modalidad_trabajo,genero,canal_reclutamiento,fecha_nacimiento,fecha_incorporacion,estado_civil
count,2152,2152,2152,2152,2152,2152
unique,2,2,5,1980,1595,4
top,Presencial,Hombre,Portal Web,24/11/1978,3/10/2013,Soltero
freq,1530,1095,986,3,5,925


In [19]:
# Count, number of unique values and most frequent value of each object-dtype column (test set).
test_df.describe(include='object')

Unnamed: 0,modalidad_trabajo,genero,canal_reclutamiento,fecha_nacimiento,fecha_incorporacion,estado_civil
count,2020,2020,2020,2020,2020,2020
unique,2,2,5,1883,1527,4
top,Presencial,Mujer,Portal Web,27/04/1972,29/09/2017,Soltero
freq,1408,1037,869,3,5,800


In [20]:
# Checking modalidad_trabajo categories (train first, then test)

for frame in (train_df, test_df):
    print(frame['modalidad_trabajo'].unique())

['Híbrida' 'Presencial']
['Híbrida' 'Presencial']


In [21]:
# Checking canal_reclutamiento categories (train first, then test)

for frame in (train_df, test_df):
    print(frame['canal_reclutamiento'].unique())

['Ferias & Networking' 'Referidos' 'Linkedin' 'Portal Web' 'Headhunter']
['Ferias & Networking' 'Portal Web' 'Linkedin' 'Referidos' 'Headhunter']


In [22]:
# Checking estado_civil categories (train first, then test)

for frame in (train_df, test_df):
    print(frame['estado_civil'].unique())

['Soltero' 'Viudo' 'Divorciado' 'Casado']
['Soltero' 'Casado' 'Viudo' 'Divorciado']


From the data dictionary we can see that `seniority` and `id_ultimo_jefe` are categorical too. However, `id_ultimo_jefe` has many categories, so it will be analyzed during the hypothesis testing phase.

In [23]:
# Columns profiled as categorical; seniority is stored as an integer code (1 or 2).
cat_cols = ['modalidad_trabajo', 'genero', 'canal_reclutamiento', 'estado_civil', 'seniority']

# Defining a function to check categories proportion

def check_categories_proportion(df, col):
    """
    Print and return the percentage share of each category in a column.

    The share of every unique value in ``df[col]`` is computed with
    ``value_counts(normalize=True)`` and scaled to a percentage. Missing
    values are excluded from the count (pandas' default), so the printed
    percentages add up to 100 over the non-null rows only.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    col (str): The name of the column for which to calculate category proportions.

    Returns:
    pandas.Series: Percentage per category, indexed by category and sorted
        from most to least frequent. The same Series is printed, so callers
        that only want the printout can ignore the return value.

    Raises:
    KeyError: If ``col`` is not a column of ``df``.

    Example (output as rendered by pandas >= 2.0):
    >>> import pandas as pd
    >>> data = {'category': ['A', 'B', 'A', 'C', 'B', 'A']}
    >>> df = pd.DataFrame(data)
    >>> shares = check_categories_proportion(df, 'category')
    category
    A    50.000000
    B    33.333333
    C    16.666667
    Name: proportion, dtype: float64
    """
    proportion = df[col].value_counts(normalize=True) * 100
    print(proportion)
    # Returning the Series lets callers reuse or assert on the numbers
    # instead of re-computing them or parsing the printed text.
    return proportion

In [25]:
# Category shares for every categorical column of the training set,
# with a 40-dash rule printed after each column.
separator = '-' * 40
for cat_col in cat_cols:
    check_categories_proportion(train_df, cat_col)
    print(separator)

modalidad_trabajo
Presencial    71.096654
Híbrida       28.903346
Name: proportion, dtype: float64
----------------------------------------
genero
Hombre    50.8829
Mujer     49.1171
Name: proportion, dtype: float64
----------------------------------------
canal_reclutamiento
Portal Web             45.817844
Linkedin               22.676580
Referidos              15.706320
Ferias & Networking    14.312268
Headhunter              1.486989
Name: proportion, dtype: float64
----------------------------------------
estado_civil
Soltero       42.983271
Casado        38.708178
Divorciado    10.315985
Viudo          7.992565
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.771375
2     4.228625
Name: proportion, dtype: float64
----------------------------------------


In [26]:
# Category shares for every categorical column of the test set,
# with a 40-dash rule printed after each column.
separator = '-' * 40
for cat_col in cat_cols:
    check_categories_proportion(test_df, cat_col)
    print(separator)

modalidad_trabajo
Presencial    69.70297
Híbrida       30.29703
Name: proportion, dtype: float64
----------------------------------------
genero
Mujer     51.336634
Hombre    48.663366
Name: proportion, dtype: float64
----------------------------------------
canal_reclutamiento
Portal Web             43.019802
Linkedin               25.247525
Referidos              16.336634
Ferias & Networking    14.059406
Headhunter              1.336634
Name: proportion, dtype: float64
----------------------------------------
estado_civil
Soltero       39.603960
Casado        38.514851
Divorciado    12.425743
Viudo          9.455446
Name: proportion, dtype: float64
----------------------------------------
seniority
1    95.891089
2     4.108911
Name: proportion, dtype: float64
----------------------------------------


The category proportions in the train and test data are similar.

## Extracting info from date columns

In [27]:
# Converting date columns to dtype datetime
# (dayfirst=True because the raw dates are written day/month/year)

for frame in (train_df, test_df):
    for date_col in ('fecha_incorporacion', 'fecha_nacimiento'):
        frame[date_col] = pd.to_datetime(frame[date_col], dayfirst=True)

In [29]:
# Calculating age at the joining date
#
# Age is counted in completed calendar years. Dividing the elapsed days by 365
# ignores leap days, so it rounds the age up when the joining date falls a few
# days before the birthday (e.g. born 15/06/1980, joined 10/06/2020 -> 14605
# days -> 40 instead of 39).

def completed_years(start, end):
    """
    Return the number of whole calendar years between two datetime Series.

    Parameters:
    start (pandas.Series): Earlier dates (datetime64), e.g. birth dates.
    end (pandas.Series): Later dates (datetime64), aligned with ``start``.

    Returns:
    pandas.Series: Year difference, reduced by one for rows where the
        month/day of ``end`` comes before the month/day of ``start``.
    """
    anniversary_pending = (end.dt.month < start.dt.month) | (
        (end.dt.month == start.dt.month) & (end.dt.day < start.dt.day)
    )
    return end.dt.year - start.dt.year - anniversary_pending.astype('int64')


train_df['age'] = completed_years(train_df['fecha_nacimiento'], train_df['fecha_incorporacion'])

test_df['age'] = completed_years(test_df['fecha_nacimiento'], test_df['fecha_incorporacion'])

In [30]:
# Obtaining season info: year and month of the joining date

for frame in (train_df, test_df):
    joined_on = frame['fecha_incorporacion'].dt
    frame['join_year'] = joined_on.year
    frame['join_month'] = joined_on.month

In [31]:
# Discarding original dates (both frames are modified in place)

for frame in (train_df, test_df):
    frame.drop(columns=['fecha_nacimiento', 'fecha_incorporacion'], inplace=True)

In [32]:
# Re-check the training schema after replacing the date columns with derived features.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2152 non-null   int64  
 1   id_ultimo_jefe        2061 non-null   float64
 2   seniority             2152 non-null   int64  
 3   modalidad_trabajo     2152 non-null   object 
 4   distancia_oficina     2152 non-null   float64
 5   dias_baja_salud       2152 non-null   int64  
 6   genero                2152 non-null   object 
 7   canal_reclutamiento   2152 non-null   object 
 8   permanencia_promedio  2152 non-null   int64  
 9   salario               2152 non-null   int64  
 10  performance_score     2084 non-null   float64
 11  psi_score             2152 non-null   int64  
 12  estado_civil          2152 non-null   object 
 13  abandono_6meses       2152 non-null   int64  
 14  age                   2152 non-null   int64  
 15  join_year            

In [33]:
# Re-check the test schema after replacing the date columns with derived features.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2020 non-null   int64  
 1   id_ultimo_jefe        1937 non-null   float64
 2   seniority             2020 non-null   int64  
 3   modalidad_trabajo     2020 non-null   object 
 4   distancia_oficina     2020 non-null   float64
 5   dias_baja_salud       2020 non-null   int64  
 6   genero                2020 non-null   object 
 7   canal_reclutamiento   2020 non-null   object 
 8   permanencia_promedio  2020 non-null   int64  
 9   salario               2020 non-null   int64  
 10  performance_score     2020 non-null   int64  
 11  psi_score             2020 non-null   int64  
 12  estado_civil          2020 non-null   object 
 13  age                   2020 non-null   int64  
 14  join_year             2020 non-null   int32  
 15  join_month           

## Missing values treatment

From the business context, it is well known that employees with a performance_score greater than or equal to 80 are considered high performers, and everyone else is considered a low performer. There are some missing values in this column in the train data. Looking at the statistics summary, most of the employees in the train data are low performers (the median score is 65), while all the employees in the test data are high performers (the minimum score is 80). This leads us to impute those missing values with 80, in order to help the model generalize better to high-performance employees. Additionally, a categorical performance column will be added, with 'high' for employees with a performance score greater than or equal to 80 and 'low' for everyone else.

In [34]:
# Missing training scores are set to 80, the high-performance threshold.
score_missing = train_df['performance_score'].isna()
train_df.loc[score_missing, 'performance_score'] = 80

Since id_ultimo_jefe is a categorical column, we can't impute it with a statistic. So we are going to fill its missing values with 'unknown'.

In [35]:
# Missing boss ids get the 'unknown' label; the column then mixes numeric ids
# with that string, so it becomes object dtype.
for frame in (train_df, test_df):
    frame['id_ultimo_jefe'] = frame['id_ultimo_jefe'].fillna('unknown')

In [36]:
# Confirm the training set has no missing values left after imputation.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2152 non-null   int64  
 1   id_ultimo_jefe        2152 non-null   object 
 2   seniority             2152 non-null   int64  
 3   modalidad_trabajo     2152 non-null   object 
 4   distancia_oficina     2152 non-null   float64
 5   dias_baja_salud       2152 non-null   int64  
 6   genero                2152 non-null   object 
 7   canal_reclutamiento   2152 non-null   object 
 8   permanencia_promedio  2152 non-null   int64  
 9   salario               2152 non-null   int64  
 10  performance_score     2152 non-null   float64
 11  psi_score             2152 non-null   int64  
 12  estado_civil          2152 non-null   object 
 13  abandono_6meses       2152 non-null   int64  
 14  age                   2152 non-null   int64  
 15  join_year            

In [37]:
# Confirm the test set has no missing values left after imputation.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colaborador        2020 non-null   int64  
 1   id_ultimo_jefe        2020 non-null   object 
 2   seniority             2020 non-null   int64  
 3   modalidad_trabajo     2020 non-null   object 
 4   distancia_oficina     2020 non-null   float64
 5   dias_baja_salud       2020 non-null   int64  
 6   genero                2020 non-null   object 
 7   canal_reclutamiento   2020 non-null   object 
 8   permanencia_promedio  2020 non-null   int64  
 9   salario               2020 non-null   int64  
 10  performance_score     2020 non-null   int64  
 11  psi_score             2020 non-null   int64  
 12  estado_civil          2020 non-null   object 
 13  age                   2020 non-null   int64  
 14  join_year             2020 non-null   int32  
 15  join_month           

## Saving clean data

In [38]:
# Write both cleaned frames as comma-separated files without the index column;
# target locations are resolved through the project's paths.data_interim_dir helper.
for frame, file_name in ((train_df, 'train_clean.csv'), (test_df, 'test_clean.csv')):
    frame.to_csv(paths.data_interim_dir(file_name), index=False, sep=',')