In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Ideas
1. Check if hasving missing data is related to not passing the selection process
2. check if different city and available to move have relation with label
3. process vacants and candidates before merging, otherwise we will repeat a lot of operations

## results

**nan values:** 

most people do not have many nans (>60%) in profile_description, title_or_profession, available_to_move, civil_status, has_video, psy_tests.

Vacants have more than 60% missings in the following variables: max_salary, expiration_date, 
experience_and_positions, knowledge_and_skill, titles_and_studies, number_of_quotas

**relational model**

Most candidates apply to several vacants.

Each vacant can be related to many applicants.

**Label**

There are more applicationstage ids (called id) than application ids. This means that one applicant can be in different stages of applying to a vacant which appears as more ids. An applicant could be a postulant, then in 
video interview and finaly a finalist. Applicants that make it to the finalist stage are considered as having
past the test.

**B U T** there are applicants that have and active status and are still in the "Postulante phase", I believe
that these applications are still being reviewed and should be excluded from the training of the model.

There are two types of applicats that will be consideread as **"not passing"**: those with **status == discarded**
and **status == deleted**

There are three types of applicants that will be considered as **passing**: 1. Clients with stage_title **Finalistas**, 2. Clients that have gotten to the **stage_title Video entrevista** AND have status == accepted, clients that have applied for vacants without Video Entrevista (has_video_entrevista==0) and have status == accepted.

The identification of each sample will be done with the application_id, so all intermediate stages of a client will
be discarded, only the final stage shall be kept 

**the column label is 1 for an accpeted candidate, 0 for a rejected candidate and -1 for rows that should be excluded from training**
              
**categorical variables**





# Candidates

In [7]:
candidates_header = ['id','email','first_name','last_name','phone','birthdate','gender','identification_type',
                    'identification_number','country_birth','city','education_level','salary','profile_description',
                    'without_experience','without_studies','title_or_profession','available_to_move',
                     'civil_status','has_video','studies','experiences','psy_tests']


In [8]:
raw_candidates = pd.read_csv('../DataEnviadaInicialmente/Candidates.csv',names=candidates_header)

In [9]:
pd_candidates = raw_candidates.copy()

print("number of users", len(raw_candidates))
print(raw_candidates.dtypes)
print(len(raw_candidates.columns))
raw_candidates.head()

number of users 548364
id                         int64
email                     object
first_name                object
last_name                 object
phone                    float64
birthdate                 object
gender                    object
identification_type      float64
identification_number    float64
country_birth             object
city                      object
education_level           object
salary                   float64
profile_description       object
without_experience          bool
without_studies             bool
title_or_profession       object
available_to_move         object
civil_status              object
has_video                 object
studies                   object
experiences               object
psy_tests                 object
dtype: object
23


Unnamed: 0,id,email,first_name,last_name,phone,birthdate,gender,identification_type,identification_number,country_birth,...,profile_description,without_experience,without_studies,title_or_profession,available_to_move,civil_status,has_video,studies,experiences,psy_tests
0,6,6.email@gmail.com,SantiLopez,L,3755545000.0,1996-09-14,,0.0,4481457000.0,Colombia,...,,False,False,,,,,[],[],[]
1,7,7.email@magnetosystem.com,Simon,Hoyos,3122798000.0,1989-08-12,,0.0,428632800.0,Colombia,...,,False,False,,,,,[],[],[]
2,8,8.email@innventto.com,Sebastian,T,3413520000.0,1991-11-10,,0.0,6673576000.0,Colombia,...,,False,False,,,,,[],[],[]
3,1,1.email@gmail.com,SebastianTorres,T,3452306000.0,1991-11-10,,1.0,7892609000.0,Colombia,...,,False,False,,,,,[],[],[]
4,9,9.email@innventto.com,Felipe,Ocampo,3957730000.0,1994-10-25,,1.0,2050189000.0,Colombia,...,,False,False,,,,,[],[],[]


### Cheking for missing/not reported/nan values



In [10]:
pd_candidates[['studies','experiences','psy_tests']] = pd_candidates[['studies','experiences',
                                                                      'psy_tests']].replace('[]',np.nan)

pd_nan = pd_candidates.isna().sum().reset_index().rename(columns={0:'missing_count'})
pd_nan['missing_percentage'] = pd_nan['missing_count']/ len(raw_candidates)
pd_nan

Unnamed: 0,index,missing_count,missing_percentage
0,id,0,0.0
1,email,0,0.0
2,first_name,9,1.6e-05
3,last_name,31,5.7e-05
4,phone,8096,0.014764
5,birthdate,81329,0.148312
6,gender,75034,0.136832
7,identification_type,51788,0.094441
8,identification_number,46607,0.084993
9,country_birth,21523,0.039249


In [11]:
# studies, experiences and psy_tests have this string '[]' instead of nans, 
cols = ['country_birth','city','education_level','salary','profile_description',
                    'without_experience','without_studies','title_or_profession','available_to_move',
                     'civil_status','has_video','studies',]
pd_candidates[cols]

Unnamed: 0,country_birth,city,education_level,salary,profile_description,without_experience,without_studies,title_or_profession,available_to_move,civil_status,has_video,studies
0,Colombia,Medellin,,,,False,False,,,,,
1,Colombia,Medellin,,,,False,False,,,,,
2,Colombia,Cravo Norte,Bachillerato completo,,,False,False,,,,,
3,Colombia,Puerto Narino,Bachillerato completo,,,False,False,,,,,
4,Colombia,Medellin,Bachillerato completo,,,False,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
548359,Colombia,Bucaramanga,Técnico,877803.0,Soy Técnica en Contabilización de operaciones...,True,False,,,,,"[{""id"":1012030,""institute"":""SENA"",""title"":""Con..."
548360,Colombia,Bogota,Bachillerato completo,900000.0,,False,False,,,,,"[{""id"":740806,""institute"":""Universidad Ecci"",""..."
548361,Colombia,Cartagena,Bachillerato completo,600000.0,,True,False,,,free_union,,"[{""id"":1012028,""institute"":""Institución educat..."
548362,Colombia,Bogota,Bachillerato completo,980000.0,,True,True,,,,,


# Vacants

In [34]:
vacant_header = ['id','title','description','salary_type','min_salary','max_salary','status','created_at',
                    'company','education_level','agree','requirements','publish_date','confidential',
                 'expiration_date','experience_and_positions','knowledge_and_skill','titles_and_studies',
                 'number_of_quotas']


In [35]:
raw_vacants = pd.read_csv('../DataEnviadaInicialmente/Vacants.csv',names=vacant_header)


In [36]:
pd_vacants = raw_vacants.copy()

print("number of users", len(pd_vacants))
print(len(pd_vacants.columns))

print(pd_vacants.dtypes)

pd_vacants.head()

number of users 11693
19
id                            int64
title                        object
description                  object
salary_type                  object
min_salary                    int64
max_salary                  float64
status                        int64
created_at                   object
company                      object
education_level              object
agree                          bool
requirements                 object
publish_date                 object
confidential                   bool
expiration_date              object
experience_and_positions     object
knowledge_and_skill          object
titles_and_studies           object
number_of_quotas            float64
dtype: object


Unnamed: 0,id,title,description,salary_type,min_salary,max_salary,status,created_at,company,education_level,agree,requirements,publish_date,confidential,expiration_date,experience_and_positions,knowledge_and_skill,titles_and_studies,number_of_quotas
0,1,Desarrollador Ruby on Rails,Estamos buscando un desarrollador Ruby on Rail...,personalized,800000,,2,2017-11-17 03:08:20 UTC,Magneto,,True,,2017-11-17 03:08:20 UTC,False,,,,,
1,2,Desarrollador Ruby on Rails,Estamos buscando un desarrollador Ruby on Rail...,personalized,800000,,1,2017-11-17 21:30:58 UTC,Magneto,,True,,2017-11-17 21:30:58 UTC,False,,,,,
2,3,Vacante 1,descriopcion 1,personalized,800000,,1,2017-11-21 13:45:35 UTC,Magneto,,True,,2017-11-21 13:45:35 UTC,False,,,,,
3,6,Practicantes Millenials CLARO,Procesos de reclutamiento de practicantes de c...,personalized,737717,,0,2017-11-22 12:04:41 UTC,Magneto,,False,,2017-11-22 12:04:41 UTC,False,,,,,
4,7,Practicantes Millenials CLARO clon,Procesos de reclutamiento de practicantes de c...,personalized,800000,,1,2017-11-23 12:00:56 UTC,Magneto,,True,,2017-11-23 12:00:56 UTC,False,,,,,


In [37]:
pd_nan = pd_vacants.isna().sum().reset_index().rename(columns={0:'missing_count'})
pd_nan['missing_percentage'] = pd_nan['missing_count']/ len(pd_vacants)
pd_nan

Unnamed: 0,index,missing_count,missing_percentage
0,id,0,0.0
1,title,19,0.001625
2,description,161,0.013769
3,salary_type,0,0.0
4,min_salary,0,0.0
5,max_salary,8190,0.700419
6,status,0,0.0
7,created_at,0,0.0
8,company,0,0.0
9,education_level,3103,0.265372


# Merge using relational model

In [214]:
applications_header = ['id','vacant_id','candidate_id','created_at','status','discard_type']
applicationstages_header = ['id','application_id','stage_id','created_at','status']
stages_header = ['id','title','send_sms','send_email','send_call','stage_type','vacant_id','stage_order']


In [215]:
applications = pd.read_csv('../DataEnviadaInicialmente/Applications.csv',names=applications_header)
applicationstages = pd.read_csv('../DataEnviadaInicialmente/ApplicationStages.csv',names=applicationstages_header)
stages = pd.read_csv('../DataEnviadaInicialmente/Stages.csv',names=stages_header)


In [216]:
#merge_candidate with vacant through applications
pd_candidates = pd_candidates.rename(columns={'id':'candidate_id'})
pd_candidates_app = pd_candidates.merge(applications,how='right', on='candidate_id')

# lo siguiente implica que cada candidato aplica a varias vacantes
print('merge with applications')
print(len(pd_candidates))
print(len(applications))
print(len(pd_candidates_app))

pd_vacants = pd_vacants.rename(columns={'id':'vacant_id'})
pd_candidates_vac = pd_candidates_app.merge(pd_vacants, how='left', on='vacant_id')

print('merge with vacants')
print(len(pd_candidates_app))
print(len(pd_vacants))
print(len(pd_candidates_vac))

pd_candidates_vac.columns

merge with applications
548364
2120287
2120287
merge with vacants
2120287
11693
2120287


Index(['candidate_id', 'email', 'first_name', 'last_name', 'phone',
       'birthdate', 'gender', 'identification_type', 'identification_number',
       'country_birth', 'city', 'education_level_x', 'salary',
       'profile_description', 'without_experience', 'without_studies',
       'title_or_profession', 'available_to_move', 'civil_status', 'has_video',
       'studies', 'experiences', 'psy_tests', 'id', 'vacant_id',
       'created_at_x', 'status_x', 'discard_type', 'title', 'description',
       'salary_type', 'min_salary', 'max_salary', 'status_y', 'created_at_y',
       'company', 'education_level_y', 'agree', 'requirements', 'publish_date',
       'confidential', 'expiration_date', 'experience_and_positions',
       'knowledge_and_skill', 'titles_and_studies', 'number_of_quotas'],
      dtype='object')

In [217]:
pd_candidates_vac = pd_candidates_vac.rename(columns={'education_level_x':'candidate_education_level', 
                                  'education_level_y':'vacant_education_level',
                                 'status_x':'candidate_status','status_y':'vacant_status',
                                 'created_at_x':'application_created_at','created_at_y':'vacant_created_at'})

pd_candidates_vac.columns

Index(['candidate_id', 'email', 'first_name', 'last_name', 'phone',
       'birthdate', 'gender', 'identification_type', 'identification_number',
       'country_birth', 'city', 'candidate_education_level', 'salary',
       'profile_description', 'without_experience', 'without_studies',
       'title_or_profession', 'available_to_move', 'civil_status', 'has_video',
       'studies', 'experiences', 'psy_tests', 'id', 'vacant_id',
       'application_created_at', 'candidate_status', 'discard_type', 'title',
       'description', 'salary_type', 'min_salary', 'max_salary',
       'vacant_status', 'vacant_created_at', 'company',
       'vacant_education_level', 'agree', 'requirements', 'publish_date',
       'confidential', 'expiration_date', 'experience_and_positions',
       'knowledge_and_skill', 'titles_and_studies', 'number_of_quotas'],
      dtype='object')

In [218]:
# merge applicationstage and stage
stages = stages.rename(columns={'id':'stage_id'})
stagemerge = applicationstages.merge(stages,how='left',on='stage_id')
print('merge with stage')
print(len(stages))
print(len(applicationstages))
print(len(stagemerge))


merge with stage
44470
2604399
2604399


In [219]:
pd_candidates_vac = pd_candidates_vac.rename(columns={'id':'application_id'})
pd_dataset = pd_candidates_vac.merge(stagemerge, how='inner',on='application_id')

pd_dataset = pd_dataset.rename(columns={'title_x':'vacant_title','title_y':'stage_title'})

print('merge with all')
print(len(stagemerge))
print(len(pd_candidates_vac))
print(len(pd_dataset))

pd_nan = pd_dataset.isna().sum().reset_index().rename(columns={0:'missing_count'})
pd_nan['missing_percentage'] = pd_nan['missing_count']/ len(pd_dataset)
pd_nan

merge with all
2604399
2120287
2604399


Unnamed: 0,index,missing_count,missing_percentage
0,candidate_id,0,0.0
1,email,0,0.0
2,first_name,16,6e-06
3,last_name,67,2.6e-05
4,phone,26159,0.010044
5,birthdate,291898,0.112079
6,gender,261916,0.100567
7,identification_type,167052,0.064142
8,identification_number,149276,0.057317
9,country_birth,66280,0.025449


In [220]:
# checking if the ids coincide
print(pd_dataset[pd_dataset.vacant_id_x!=pd_dataset.vacant_id_y][['vacant_id_x','vacant_id_y']])
pd_dataset

Empty DataFrame
Columns: [vacant_id_x, vacant_id_y]
Index: []


Unnamed: 0,candidate_id,email,first_name,last_name,phone,birthdate,gender,identification_type,identification_number,country_birth,...,stage_id,created_at,status,stage_title,send_sms,send_email,send_call,stage_type,vacant_id_y,stage_order
0,6,6.email@gmail.com,SantiLopez,L,3.755545e+09,1996-09-14,,0.0,4.481457e+09,Colombia,...,4,2017-11-17 21:33:35 UTC,accepted,Postulantes a vacante,True,True,True,0,2,1.0
1,6,6.email@gmail.com,SantiLopez,L,3.755545e+09,1996-09-14,,0.0,4.481457e+09,Colombia,...,5,2017-11-24 21:33:56 UTC,accepted,Video Entrevista,True,True,True,1,2,2.0
2,6,6.email@gmail.com,SantiLopez,L,3.755545e+09,1996-09-14,,0.0,4.481457e+09,Colombia,...,6,2019-02-18 18:46:00 UTC,active,Finalistas,False,False,False,2,2,3.0
3,6,6.email@gmail.com,SantiLopez,L,3.755545e+09,1996-09-14,,0.0,4.481457e+09,Colombia,...,22,2017-11-24 16:31:17 UTC,accepted,Postulantes a vacante,False,False,False,0,8,1.0
4,6,6.email@gmail.com,SantiLopez,L,3.755545e+09,1996-09-14,,0.0,4.481457e+09,Colombia,...,23,2017-11-24 16:42:51 UTC,active,Video Entrevista,False,False,False,1,8,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604394,1590514,1590514.email@gmail.com,Yeolsy paola,Rojas Romero,3.016980e+09,2000-12-17,female,0.0,3.966227e+09,Colombia,...,67633,2020-02-12 23:58:47 UTC,discarded,Postulantes,True,True,False,0,19846,1.0
2604395,1339046,1339046.email@gmail.com,Ricardo,Zarate,3.633573e+09,2000-01-01,male,0.0,1.463051e+09,Colombia,...,68873,2020-02-12 23:59:00 UTC,discarded,Postulantes,True,True,True,0,20174,1.0
2604396,1590506,1590506.email@gmail.com,Kevin david,Quintana,3.871021e+09,2001-10-26,male,0.0,9.985984e+09,Colombia,...,64268,2020-02-12 23:59:50 UTC,discarded,Postulantes,False,True,False,0,18923,1.0
2604397,1590552,1590552.email@gmail.com,Astrid carolina,Leiva blanco,3.465604e+09,1996-10-21,female,0.0,8.307801e+09,Colombia,...,66786,2020-02-12 23:59:58 UTC,active,Postulantes,False,False,False,0,19624,1.0


In [221]:
pd_dataset[['vacant_title', 'stage_title']]

Unnamed: 0,vacant_title,stage_title
0,Desarrollador Ruby on Rails,Postulantes a vacante
1,Desarrollador Ruby on Rails,Video Entrevista
2,Desarrollador Ruby on Rails,Finalistas
3,Practicantes Millenials CLARO clon,Postulantes a vacante
4,Practicantes Millenials CLARO clon,Video Entrevista
...,...,...
2604394,RECIBO VILLAVICENCIO,Postulantes
2604395,Auxiliar de archivo,Postulantes
2604396,COLABORADOR DE TIENDA ATLÁNTICO-REGIONAL CARIBE-,Postulantes
2604397,1042 EMPACADORES/SIN EXPERIENCIA/Éxito Country...,Postulantes


In [222]:
# checking the proportions notice that id has way more

print(len(pd_dataset.application_id.unique()))
print(len(pd_dataset.id.unique()))
print(len(pd_dataset.vacant_id_x.unique()))
print(len(pd_dataset.candidate_id.unique()))
print(len(stages))
print(len(applicationstages))


2120233
2604399
11691
548357
44470
2604399


In [223]:
# as we can see there are several stages for applications
pd_grouped = pd_dataset.groupby('application_id')[['id']].nunique()
pd_grouped

Unnamed: 0_level_0,id
application_id,Unnamed: 1_level_1
33,3
34,3
35,3
36,2
37,2
...,...
3365945,1
3365946,1
3365947,1
3365948,1


### Labelling

In [224]:
# There is a status called status that tells us wheter the client  
# has advanced to another application stage. So let`s take applications with several stages and compare the 
#status stage type and id 
df_try = pd_grouped.reset_index()
app_list = df_try[df_try['id']>1]['application_id'].unique()
pd_dataset[pd_dataset['application_id']==app_list[0]][['candidate_id',
                                                       'status','stage_title','stage_type','stage_order',
                                                       'expiration_date','created_at']]


Unnamed: 0,candidate_id,status,stage_title,stage_type,stage_order,expiration_date,created_at
0,6,accepted,Postulantes a vacante,0,1.0,,2017-11-17 21:33:35 UTC
1,6,accepted,Video Entrevista,1,2.0,,2017-11-24 21:33:56 UTC
2,6,active,Finalistas,2,3.0,,2019-02-18 18:46:00 UTC


In [225]:
# what about applicants with just 1 status, where they rejected or is the application not closed yet?
# supuestamente status = active significa que todavía está siendo considerado
pd_dataset[pd_dataset['application_id']==3365945][['candidate_id',
                                                       'status','stage_title','stage_type','stage_order',
                                                  'expiration_date','created_at','candidate_status']]

Unnamed: 0,candidate_id,status,stage_title,stage_type,stage_order,expiration_date,created_at,candidate_status
2558664,1565673,active,Postulantes,0,1.0,,2020-02-12 23:59:38 UTC,active


In [226]:
pd_dataset[pd_dataset['status']=='active'][['candidate_id',
                                                       'status','stage_title','stage_type','stage_order',
                                                  'expiration_date','created_at','candidate_status']]

Unnamed: 0,candidate_id,status,stage_title,stage_type,stage_order,expiration_date,created_at,candidate_status
2,6,active,Finalistas,2,3.0,,2019-02-18 18:46:00 UTC,active
4,6,active,Video Entrevista,1,2.0,,2017-11-24 16:42:51 UTC,active
6,6,active,Video Entrevista,1,2.0,,2017-11-27 16:12:22 UTC,active
8,6,active,Video Entrevista,1,2.0,,2017-11-27 21:36:21 UTC,active
10,6,active,Atracción digital,1,2.0,,2017-11-29 17:10:29 UTC,active
...,...,...,...,...,...,...,...,...
2604391,1574935,active,Postulantes,0,1.0,,2020-02-12 23:57:29 UTC,active
2604392,1590535,active,Postulantes,0,1.0,,2020-02-12 23:58:06 UTC,active
2604393,1590540,active,Postulantes,0,1.0,,2020-02-12 23:58:31 UTC,active
2604397,1590552,active,Postulantes,0,1.0,,2020-02-12 23:59:58 UTC,active


In [227]:
for stage in (pd_dataset.stage_title.unique()):
    print(stage)

Postulantes a vacante
Video Entrevista
Finalistas
Atracción digital 
Atracción Digital
Postulantes
otra etapa
Prueba Psicométrica
Campaña de Sensibilizacion
PRUEBAS COMERCIALES
Aptos atracción digital
ENTREVISTA PRESENCIAL
Video entrevista
Pruebas V&A
Citación entrevista
Pruebas
DEG
Invitación Entrevista
Entrega cliente
Pruebas complementarias 
ENTREVISTA GRUPAL 
Entrevista grupal II
En revision
Recordatorio
Invitación a prueba
Postulantes Ejecutivos Comerciales
Atraccion Digital
Pruebas V&A y PSW
Entrevista Presencial 1
Citados a entrevista grupal
Por citar a entrevista grupal
Postulantes Asesores Comerciales
CONTRATACIÓN SMS
DAVIPLATA
DAVIVIENDA
RECHAZO
RECHAZO BANCOLOMBIA
PAGO EXITOSO
Pruebas complementarias PSW
Bogotá
Citaciones
Prueba de habilidades
Postulantes  Recepcionista
Programación de pruebas
Postulantes Auxiliares  Logísticos y Bodega
Prueba
Documentos
Prueba ensayo
INSCRIPCIÓN PORTAL
Entrevista HR
Assessment Center
Phone Screnning
Citación Prueba Técnica
Candidatos aptos


In [228]:
# building the label
pd_dataset['label'] = -1
pd_dataset.loc[pd_dataset.stage_title=='Finalista','label'] = 1 
pd_dataset.loc[(pd_dataset.status=='discarded') | (pd_dataset.status=='deleted'),'label'] = 0

# label as has_video_entrevista to check wheter the applicant passed to finalista or is merely in atracion digital
pd_dataset['has_video_entrevista'] = 0
pd_dataset.loc[pd_dataset.stage_title=='Video Entrevista','has_video_entrevista'] = 1

# if vacant does have attraccion digital, then the status has to be 'accespted' to be considered as 
# having passed
pd_dataset.loc[(pd_dataset.has_video_entrevista==1) & (pd_dataset.status=='accepted'),'label'] = 1

# if the vacant has no attraccion digital stage and has passed postulantes then it is considered as having passed
pd_dataset.loc[(pd_dataset.has_video_entrevista==0) & (pd_dataset.status=='accepted')
               & (pd_dataset.stage_title=='Postulantes'),'label'] = 1


pd_dataset.label.value_counts()

-1    1884449
 0     516731
 1     203219
Name: label, dtype: int64

In [246]:
# checking if we excluded the intermediate stages correctly
df_exclude = pd_dataset[pd_dataset['label']>=0].copy()
pd_grouped = df_exclude.groupby('application_id')[['id']].nunique()
pd_grouped[pd_grouped['id']>1]


Unnamed: 0_level_0,id
application_id,Unnamed: 1_level_1
40,2
54,2
259,2
260,2
262,2
...,...
3365682,2
3365728,2
3365730,2
3365774,2


In [259]:
df_exclude = df_exclude.sort_values(by='stage_order', ascending=False).copy()
df_try = df_exclude.drop_duplicates(subset=['application_id'], keep='first')
pd_grouped = df_try.groupby('application_id')[['id']].nunique()
pd_grouped[pd_grouped['id']>1]


Unnamed: 0_level_0,id
application_id,Unnamed: 1_level_1


# Relation with label

In [261]:
pd_exploration = pd_dataset[pd_dataset['label']>=0].copy()
pd_exploration = pd_exploration.sort_values(by='stage_order', ascending=False).copy()
pd_exploration = pd_exploration.drop_duplicates(subset=['application_id'], keep='first')
pd_exploration = pd_exploration.drop('vacant_id_y',axis=1)
pd_exploration = pd_exploration.rename(columns={'vacant_id_x':'vacant_id'})


for col in pd_exploration.columns:
    print(col)

candidate_id
email
first_name
last_name
phone
birthdate
gender
identification_type
identification_number
country_birth
city
candidate_education_level
salary
profile_description
without_experience
without_studies
title_or_profession
available_to_move
civil_status
has_video
studies
experiences
psy_tests
application_id
vacant_id
application_created_at
candidate_status
discard_type
vacant_title
description
salary_type
min_salary
max_salary
vacant_status
vacant_created_at
company
vacant_education_level
agree
requirements
publish_date
confidential
expiration_date
experience_and_positions
knowledge_and_skill
titles_and_studies
number_of_quotas
id
stage_id
created_at
status
stage_title
send_sms
send_email
send_call
stage_type
stage_order
label
has_video_entrevista


### identification columns


In [14]:
id_cols = ['candidate_id','email','first_name','last_name','phone','identification_type',
           'identification_number','application_id','vacant_id','company','id','stage_id','stage_title']



### categorical variables

In [15]:
cat_features = ['gender','country_birth','city','education_level','without_experience','without_studies',
               'title_or_profesion','available_to_move','civil_status','has_video']


### numerical variables

In [None]:
num_features = ['salary']

### date variables

### dictionary variables

### Text variables