In [1]:
import pandas as pd
import numpy as np
import datetime as DT
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

# Comenzamos con los datos de genero y edad

In [2]:
concatAge = pd.concat([pd.read_csv('data/datos_navent_fiuba/fiuba_2_postulantes_genero_y_edad.csv'),
                       pd.read_csv('data/entrega6/fiuba_2_postulantes_genero_y_edad.csv'),
                       pd.read_csv('data/Fiuba desde 15 Abril/fiuba_2_postulantes_genero_y_edad.csv')]
                    )
concatAge.head()

Unnamed: 0,idpostulante,fechanacimiento,sexo
0,NM5M,1970-12-03,FEM
1,5awk,1962-12-04,FEM
2,ZaO5,1978-08-10,FEM
3,NdJl,1969-05-09,MASC
4,eo2p,1981-02-16,MASC


In [3]:
concatAge.drop_duplicates('idpostulante',keep = 'last', inplace = True)
concatAge.reset_index(inplace = True)
concatAge.drop(columns = ['index'],inplace = True)
concatAge.rename(columns = {'fechanacimiento':'edad'}, inplace = True)


# hoy : 30/5/2018
hoy = pd.Timestamp(DT.datetime.now())
concatAge['edad'] = pd.to_datetime(concatAge['edad'],errors = 'coerce')
concatAge['edad'] = (hoy - concatAge['edad']).astype('<m8[Y]')
concatAge.head()

Unnamed: 0,idpostulante,edad,sexo
0,eo2p,37.0,MASC
1,1d2B,42.0,MASC
2,EBO0,44.0,FEM
3,a6MKW,43.0,MASC
4,6MWd4,43.0,MASC


In [4]:
def sexoAEntero(sexo):
    if (sexo == 'MASC'):
        return 0
    if (sexo == 'FEM'):
        return 1
    return np.nan

In [5]:
concatAge['sexo'] = concatAge['sexo'].apply(sexoAEntero)

In [6]:
concatAge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504407 entries, 0 to 504406
Data columns (total 3 columns):
idpostulante    504407 non-null object
edad            478311 non-null float64
sexo            479058 non-null float64
dtypes: float64(2), object(1)
memory usage: 11.5+ MB


# Pasamos a los datos sobre la educacion

In [7]:
concatEduc = pd.concat([pd.read_csv('data/datos_navent_fiuba/fiuba_1_postulantes_educacion.csv'),
                        pd.read_csv('data/entrega6/fiuba_1_postulantes_educacion.csv'),
                        pd.read_csv('data/Fiuba desde 15 Abril/fiuba_1_postulantes_educacion.csv')]
                      )
concatEduc.head()

Unnamed: 0,idpostulante,nombre,estado
0,NdJl,Posgrado,En Curso
1,8BkL,Universitario,En Curso
2,1d2B,Universitario,En Curso
3,NPBx,Universitario,En Curso
4,NPBx,Master,En Curso


In [8]:
concatEduc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1103193 entries, 0 to 397508
Data columns (total 3 columns):
idpostulante    1103193 non-null object
nombre          1103193 non-null object
estado          1103193 non-null object
dtypes: object(3)
memory usage: 33.7+ MB


In [9]:
concatEduc['nombre'].value_counts()

Universitario        404619
Secundario           382927
Terciario/Técnico    180091
Otro                  84674
Posgrado              33484
Master                16436
Doctorado               962
Name: nombre, dtype: int64

In [10]:
def estudioSuperior(nivel):
    if ((nivel=='Posgrado') or (nivel =='Doctorado') or (nivel=='Master')):
        return 1
    else:
        return 0

In [11]:
graduados = (concatEduc['estado'].astype(str)== 'Graduado').astype(int)

universitarios = (concatEduc['nombre'].astype(str) == 'Universitario').astype(int)

secundario = (concatEduc['nombre'].astype(str) == 'Secundario').astype(int)

mayor_a_universitario = concatEduc['nombre'].apply(estudioSuperior)

In [12]:
concatEduc['titulo_universitario'] = universitarios*graduados
concatEduc['titulo_secundario'] = secundario*graduados
concatEduc['titulo_superior'] = mayor_a_universitario * graduados

In [13]:
titulos = concatEduc.groupby('idpostulante').agg({'titulo_universitario':'max',
                                                   'titulo_secundario':'max',
                                                   'titulo_superior':'max'})
titulos.reset_index(inplace = True)
titulos.head()

Unnamed: 0,idpostulante,titulo_universitario,titulo_secundario,titulo_superior
0,0z5Dmrd,1,0,0
1,0z5JW1r,0,1,0
2,0z5VvGv,0,1,0
3,0zB01pE,0,0,0
4,0zB026d,0,1,0


In [14]:
usuarios = concatAge.merge(titulos, on = 'idpostulante', how = 'left')
usuarios.head()

Unnamed: 0,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior
0,eo2p,37.0,0.0,1.0,1.0,1.0
1,1d2B,42.0,0.0,0.0,0.0,0.0
2,EBO0,44.0,1.0,1.0,0.0,1.0
3,a6MKW,43.0,0.0,1.0,0.0,1.0
4,6MWd4,43.0,0.0,1.0,0.0,0.0


In [15]:
usuarios.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504407 entries, 0 to 504406
Data columns (total 6 columns):
idpostulante            504407 non-null object
edad                    478311 non-null float64
sexo                    479058 non-null float64
titulo_universitario    447909 non-null float64
titulo_secundario       447909 non-null float64
titulo_superior         447909 non-null float64
dtypes: float64(5), object(1)
memory usage: 26.9+ MB


# Pasamos a los datos sobre los avisos

In [16]:
avisos = pd.concat([pd.read_csv('data/datos_navent_fiuba/fiuba_6_avisos_detalle.csv'),
                    pd.read_csv('data/entrega6/fiuba_6_avisos_detalle.csv'),
                    pd.read_csv('data/Fiuba desde 15 Abril/fiuba_6_avisos_detalle.csv'),
                    pd.read_csv('data/fiuba_6_avisos_detalle_missing_nivel_laboral.csv')])


avisos.head()

Unnamed: 0,idaviso,idpais,titulo,descripcion,nombre_zona,ciudad,mapacalle,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa
0,8725750,1,VENDEDOR/A PROVINCIA DE SANTA FE,<p><strong><strong>Empresa:</strong></strong> ...,Gran Buenos Aires,,,Full-time,Senior / Semi-Senior,Comercial,VENTOR
1,17903700,1,Enfermeras,<p>Solicitamos para importante cadena de farma...,Gran Buenos Aires,,,Full-time,Senior / Semi-Senior,Salud,Farmacias Central Oeste
2,1000150677,1,Chofer de taxi,<p>TE GUSTA MANEJAR? QUERES GANAR PLATA HACIEN...,Capital Federal,,Empedrado 2336,Full-time,Senior / Semi-Senior,Transporte,FAMITAX SRL
3,1000610287,1,CHOFER DE CAMIONETA BAHIA BLANCA - PUNTA ALTA,<p><strong>Somos una empresa multinacional que...,Gran Buenos Aires,,,Full-time,Senior / Semi-Senior,Transporte,Wurth Argentina S.A
4,1000872556,1,Operarios de Planta - Rubro Electrodomésticos,<p><strong>OPERARIOS DE PLANTA</strong></p><p>...,Gran Buenos Aires,,,Full-time,Senior / Semi-Senior,Producción,ELECTRO OUTLET SRL


In [17]:
avisos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45969 entries, 0 to 337
Data columns (total 11 columns):
idaviso                 45969 non-null int64
idpais                  45969 non-null int64
titulo                  45969 non-null object
descripcion             45969 non-null object
nombre_zona             45969 non-null object
ciudad                  225 non-null object
mapacalle               3444 non-null object
tipo_de_trabajo         45969 non-null object
nivel_laboral           45634 non-null object
nombre_area             45969 non-null object
denominacion_empresa    45955 non-null object
dtypes: int64(2), object(9)
memory usage: 4.2+ MB


In [18]:
avisos.drop_duplicates('idaviso', keep = 'last', inplace = True)
avisos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25288 entries, 4 to 337
Data columns (total 11 columns):
idaviso                 25288 non-null int64
idpais                  25288 non-null int64
titulo                  25288 non-null object
descripcion             25288 non-null object
nombre_zona             25288 non-null object
ciudad                  160 non-null object
mapacalle               2070 non-null object
tipo_de_trabajo         25288 non-null object
nivel_laboral           24953 non-null object
nombre_area             25288 non-null object
denominacion_empresa    25281 non-null object
dtypes: int64(2), object(9)
memory usage: 2.3+ MB


In [19]:
avisos.drop(columns = ['mapacalle','ciudad'], inplace = True)
avisos['idpais'].value_counts()

1    25288
Name: idpais, dtype: int64

In [20]:
avisos.drop(columns = 'idpais',inplace = True)

In [21]:
avisos['tipo_de_trabajo'].value_counts()

Full-time          22831
Part-time           1746
Teletrabajo          248
Por Horas            125
Pasantia             119
Temporario            96
Por Contrato          88
Fines de Semana       28
Primer empleo          6
Voluntario             1
Name: tipo_de_trabajo, dtype: int64

In [22]:
avisos['tipo_de_trabajo'] = (avisos['tipo_de_trabajo'].astype(str) == 'Full-time').astype(int)

In [23]:
avisos['nombre_zona'].value_counts()

Gran Buenos Aires              23017
Capital Federal                 2183
Buenos Aires (fuera de GBA)       57
GBA Oeste                         10
La Plata                           4
Mendoza                            3
Ciudad de Mendoza                  3
Cordoba                            2
Rosario                            2
Tucuman                            1
Corrientes                         1
Santa Cruz                         1
Santa Fe                           1
Neuquen                            1
San Juan                           1
Catamarca                          1
Name: nombre_zona, dtype: int64

In [24]:
avisos.rename(columns = {'nombre_zona':'GBA', 'tipo_de_trabajo':'Full-time'},inplace = True)

In [25]:
avisos['GBA'] = (avisos['GBA'].astype(str) == 'Gran Buenos Aires').astype(int)
avisos.head(10)

Unnamed: 0,idaviso,titulo,descripcion,GBA,Full-time,nivel_laboral,nombre_area,denominacion_empresa
4,1000872556,Operarios de Planta - Rubro Electrodomésticos,<p><strong>OPERARIOS DE PLANTA</strong></p><p>...,1,1,Senior / Semi-Senior,Producción,ELECTRO OUTLET SRL
8,9240880,"Productores Asesores Independiente, para venta...",Agente\r\noficial Selecciona:</span></strong><...,0,1,Jefe / Supervisor / Responsable,Comercial,Agencia Oficial Alejandro Arizaga
19,1110185164,Administrativo de Recepción,<p>Centro Médico Accord se encuentra en la bús...,1,1,Junior,Administración,Unión Personal
21,1110513885,Distribuidor domiciliario con moto (SAN MIGUEL),<p>Importante correo privado ubicado en SAN MI...,1,1,Junior,Distribución,Asoko Tempo SA
32,1111034024,Vendedores para venta de medicina pre paga y o...,<p> </p><p><strong>Comercializadora incorpora ...,1,1,Senior / Semi-Senior,Comercial,JELS SRL
35,1111101289,PM. Zona Norte / Pacheco,<p>Acciona IT se encuentra en la búsqueda de P...,1,1,Senior / Semi-Senior,Liderazgo de Proyecto,ACCIONA IT
38,1111109704,"Supervisor de obra civil, eléctrico/ instrumen...",<p>Buscamos para importante empresa petrolera ...,1,1,Senior / Semi-Senior,Construcción,Hahn Solarz S.R.L.
45,1111172357,Empleado Administrativo para Tramites de Habil...,"<p style=""""><strong><em><span style="""">Brujula...",1,1,Senior / Semi-Senior,Administración,Brujula SA
46,1111174081,Gestor de Cobranzas - Telecobrador,<p><strong>En MAS ACTIVOS BPO te estamos esper...,0,0,Senior / Semi-Senior,Call Center,MAS ACTIVOS S.A.
47,1111235995,Analista de Recursos Humanos - Quilmes (Plazo...,<p>Cervecería y Maltería Quilmes es una de las...,1,1,Senior / Semi-Senior,Recursos Humanos,AB InBev - Cervecería y Maltería Quilmes


In [26]:
avisos['nivel_laboral'].value_counts()

Senior / Semi-Senior                    16975
Junior                                   4152
Otro                                     1977
Jefe / Supervisor / Responsable          1527
Gerencia / Alta Gerencia / Dirección      322
Name: nivel_laboral, dtype: int64

In [27]:
def nivelAEntero(nivel):
    if (nivel == 'Senior / Semi-Senior'):
        return 0
    if (nivel == 'Junior'):
        return -1
    if (nivel == 'Otro'):
        return 1
    return 3

In [28]:
avisos['nivel_laboral'] = avisos['nivel_laboral'].apply(nivelAEntero)
avisos.rename(columns = {'nivel_laboral':'nivel'}, inplace = True)


In [29]:
def esChofer(titulo):
    if ('chofer' in titulo):
        return 10
    if ('Chofer' in titulo):
        return 10
    return 0

In [30]:
avisos['chofer'] = avisos['titulo'].apply(esChofer)
avisos['chofer'].value_counts()

0     25113
10      175
Name: chofer, dtype: int64

In [31]:
avisos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25288 entries, 4 to 337
Data columns (total 9 columns):
idaviso                 25288 non-null int64
titulo                  25288 non-null object
descripcion             25288 non-null object
GBA                     25288 non-null int64
Full-time               25288 non-null int64
nivel                   25288 non-null int64
nombre_area             25288 non-null object
denominacion_empresa    25281 non-null object
chofer                  25288 non-null int64
dtypes: int64(5), object(4)
memory usage: 1.9+ MB


# Pasamos a las postulaciones

In [32]:
postulaciones = pd.concat([pd.read_csv('data/datos_navent_fiuba/fiuba_4_postulaciones.csv'),
                           pd.read_csv('data/entrega6/fiuba_4_postulaciones.csv')])
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,fechapostulacion
0,1112257047,NM5M,2018-01-15 16:22:34
1,1111920714,NM5M,2018-02-06 09:04:50
2,1112346945,NM5M,2018-02-22 09:04:47
3,1112345547,NM5M,2018-02-22 09:04:59
4,1112237522,5awk,2018-01-25 18:55:03


In [33]:
postulaciones = postulaciones.merge(avisos[['idaviso','nombre_area']], on = 'idaviso', how = 'left')
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,fechapostulacion,nombre_area
0,1112257047,NM5M,2018-01-15 16:22:34,Atención al Cliente
1,1111920714,NM5M,2018-02-06 09:04:50,Telemarketing
2,1112346945,NM5M,2018-02-22 09:04:47,Telemarketing
3,1112345547,NM5M,2018-02-22 09:04:59,Telemarketing
4,1112237522,5awk,2018-01-25 18:55:03,Contabilidad


In [34]:
postulaciones.drop(columns = 'fechapostulacion', inplace = True)

In [35]:
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8311264 entries, 0 to 8311263
Data columns (total 3 columns):
idaviso         int64
idpostulante    object
nombre_area     object
dtypes: int64(1), object(2)
memory usage: 253.6+ MB


In [36]:
areas_por_postulante = postulaciones.groupby('idpostulante')\
                .agg({'nombre_area': lambda x: list(x),
                      'idaviso':'count'})

areas_por_postulante.head()

Unnamed: 0_level_0,nombre_area,idaviso
idpostulante,Unnamed: 1_level_1,Unnamed: 2_level_1
0z5Dmrd,"[Recursos Humanos, Recursos Humanos]",2
0z5JW1r,"[Almacén / Depósito / Expedición, Comercial, D...",7
0z5VvGv,"[Ventas, Ventas, Atención al Cliente, Atención...",84
0zB01pE,"[Administracion de Seguros, Administracion de ...",3
0zB026d,"[Producción, Producción, Producción, Producció...",13


In [37]:
areas_por_postulante.reset_index(inplace = True)

areas_por_postulante.rename(columns = {'nombre_area':'areas', 'idaviso':'#postulaciones'},inplace = True)

areas_por_postulante.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348883 entries, 0 to 348882
Data columns (total 3 columns):
idpostulante      348883 non-null object
areas             348883 non-null object
#postulaciones    348883 non-null int64
dtypes: int64(1), object(2)
memory usage: 8.0+ MB


In [38]:
usuarios = usuarios.merge(areas_por_postulante, on = 'idpostulante', how = 'left')
usuarios.head()


Unnamed: 0,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones
0,eo2p,37.0,0.0,1.0,1.0,1.0,"[nan, nan, nan, E-commerce, nan, Comercial]",6.0
1,1d2B,42.0,0.0,0.0,0.0,0.0,"[Administración, nan, Desarrollo de Negocios, ...",4.0
2,EBO0,44.0,1.0,1.0,0.0,1.0,"[Tesorería, Tesorería, Comercial, Comercial, n...",8.0
3,a6MKW,43.0,0.0,1.0,0.0,1.0,"[Desarrollo de Negocios, Comercial, Desarrollo...",3.0
4,6MWd4,43.0,0.0,1.0,0.0,0.0,"[Comercial, Comercial, Gerencia / Dirección Ge...",3.0


In [39]:
postulaciones.drop(columns = 'nombre_area',inplace = True)
postulaciones = postulaciones.merge(usuarios,on='idpostulante',how='left')
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8311264 entries, 0 to 8311263
Data columns (total 9 columns):
idaviso                 int64
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
dtypes: float64(6), int64(1), object(2)
memory usage: 634.1+ MB


In [40]:
postulaciones = postulaciones.merge(avisos.drop(columns = ['titulo','descripcion','denominacion_empresa']),
                         on = 'idaviso', how = 'left')
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones,GBA,Full-time,nivel,nombre_area,chofer
0,1112257047,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,0.0,1.0,Atención al Cliente,0.0
1,1111920714,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,0.0,0.0,Telemarketing,0.0
2,1112346945,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,1.0,0.0,Telemarketing,0.0
3,1112345547,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,1.0,0.0,Telemarketing,0.0
4,1112237522,5awk,55.0,1.0,1.0,0.0,0.0,"[Contabilidad, Administración, Impuestos, Admi...",16.0,1.0,1.0,0.0,Contabilidad,0.0


In [41]:
postulaciones['postulacion'] = 1
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones,GBA,Full-time,nivel,nombre_area,chofer,postulacion
0,1112257047,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,0.0,1.0,Atención al Cliente,0.0,1
1,1111920714,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,0.0,0.0,Telemarketing,0.0,1
2,1112346945,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,1.0,0.0,Telemarketing,0.0,1
3,1112345547,NM5M,47.0,1.0,0.0,1.0,0.0,"[Atención al Cliente, Telemarketing, Telemarke...",4.0,1.0,1.0,0.0,Telemarketing,0.0,1
4,1112237522,5awk,55.0,1.0,1.0,0.0,0.0,"[Contabilidad, Administración, Impuestos, Admi...",16.0,1.0,1.0,0.0,Contabilidad,0.0,1


In [52]:
frecuencias = pd.Series()
frecuencias
len(frecuencias)
for i in range(0,0):
    print (i)

In [None]:
areas = postulaciones['areas']
area = postulaciones['nombre_area']


for i in range(0,len (areas)):
    area_i = area[i]
    areas_i = areas[i]
    frecuencia = areas_i.count(area_i)
    frecuencias.append(pd.Series(frecuencia))
frecuencias

In [44]:
no_postulaciones = usuarios.sample(10000)
for i in range(1,200):
    no_postulaciones = pd.concat([no_postulaciones, usuarios.sample(10000)])

In [45]:
avisos_no_postulaciones = avisos.sample(10000)
for i in range(1,200):
    avisos_no_postulaciones = pd.concat([avisos_no_postulaciones,avisos.sample(10000)])

In [46]:
no_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000000 entries, 174804 to 178918
Data columns (total 8 columns):
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
dtypes: float64(6), object(2)
memory usage: 137.3+ MB


In [47]:
no_postulaciones.head()

Unnamed: 0,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones
174804,JBxaYpE,23.0,0.0,0.0,1.0,0.0,"[Abastecimiento, Abastecimiento, Almacén / Dep...",3.0
208957,xkdmMRY,,,,,,,
366858,0zrAMQa,26.0,1.0,1.0,1.0,0.0,"[Legal, Legal, Legal, Legal, Legal]",5.0
175208,owaMQ94,,,,,,,
441665,96Xkq5w,22.0,1.0,,,,,


In [48]:
no_postulaciones.reset_index(inplace = True)
no_postulaciones.drop(columns = 'index', inplace = True)
no_postulaciones

Unnamed: 0,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones
0,JBxaYpE,23.0,0.0,0.0,1.0,0.0,"[Abastecimiento, Abastecimiento, Almacén / Dep...",3.0
1,xkdmMRY,,,,,,,
2,0zrAMQa,26.0,1.0,1.0,1.0,0.0,"[Legal, Legal, Legal, Legal, Legal]",5.0
3,owaMQ94,,,,,,,
4,96Xkq5w,22.0,1.0,,,,,
5,KBrMWjX,19.0,1.0,,,,[Ventas],1.0
6,5mLjKzM,24.0,1.0,0.0,1.0,0.0,"[Administración, Recepcionista, Comercial, Tes...",6.0
7,0zPjYAv,57.0,1.0,1.0,0.0,1.0,"[Impuestos, Corporate Finance / Banca Inversió...",14.0
8,JBd96oO,44.0,0.0,1.0,0.0,0.0,[Dirección de Obra],1.0
9,JBrEmAk,25.0,0.0,0.0,1.0,0.0,"[Ventas, Producción, Ventas, Distribución, Ven...",31.0


In [49]:
avisos_no_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000000 entries, 3963 to 114
Data columns (total 9 columns):
idaviso                 int64
titulo                  object
descripcion             object
GBA                     int64
Full-time               int64
nivel                   int64
nombre_area             object
denominacion_empresa    object
chofer                  int64
dtypes: int64(5), object(4)
memory usage: 152.6+ MB


In [50]:
avisos_no_postulaciones.head()

Unnamed: 0,idaviso,titulo,descripcion,GBA,Full-time,nivel,nombre_area,denominacion_empresa,chofer
3963,1112192406,Consultor de Procesos TI Sr,<p>En Xelere nos encontramos en la búsqueda de...,1,1,0,Tecnologías de la Información,Xelere,0
3713,1112289892,1120 Vendedor para visitar Industrias Textiles.,"<div style=""""><ul style=""""><li style=""""><em><s...",1,1,0,Ventas,Aptitud Estratégica S.R.L.,0
6446,1112467965,EJECUTIVOS COMERCIALES MULTINACIONAL DE SEGURO...,"<p style="""">Nuestro cliente importante Multina...",1,1,0,Comercial,Suple,0
89,1112416484,Abogado/a Jr. Compliance,<p>Nos encontramos en la busqueda de un/a Abog...,0,1,3,Legal,Nicholson & Cano Abogados,0
9069,1112365278,Jefe de Producción  Obra para Subterráneo,"<p>Importante Empresa de Servicios, Construcci...",1,1,0,Construcción,ARRIVE RRHH,0


In [51]:
avisos_no_postulaciones.reset_index(inplace = True)
avisos_no_postulaciones.drop(columns =['titulo','descripcion','denominacion_empresa','index'], inplace = True)
avisos_no_postulaciones.head()

Unnamed: 0,idaviso,GBA,Full-time,nivel,nombre_area,chofer
0,1112192406,1,1,0,Tecnologías de la Información,0
1,1112289892,1,1,0,Ventas,0
2,1112467965,1,1,0,Comercial,0
3,1112416484,0,1,3,Legal,0
4,1112365278,1,1,0,Construcción,0


In [52]:
avisos_no_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 6 columns):
idaviso        int64
GBA            int64
Full-time      int64
nivel          int64
nombre_area    object
chofer         int64
dtypes: int64(5), object(1)
memory usage: 91.6+ MB


In [53]:
no_postulaciones = no_postulaciones.join(avisos_no_postulaciones)
no_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 14 columns):
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
idaviso                 int64
GBA                     int64
Full-time               int64
nivel                   int64
nombre_area             object
chofer                  int64
dtypes: float64(6), int64(5), object(3)
memory usage: 213.6+ MB


In [54]:
no_postulaciones['postulacion'] = 0

In [55]:
postulaciones.columns

Index(['idaviso', 'idpostulante', 'edad', 'sexo', 'titulo_universitario',
       'titulo_secundario', 'titulo_superior', 'areas', '#postulaciones',
       'GBA', 'Full-time', 'nivel', 'nombre_area', 'chofer', 'postulacion'],
      dtype='object')

In [56]:
no_postulaciones = no_postulaciones[postulaciones.columns]
no_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 15 columns):
idaviso                 int64
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
GBA                     int64
Full-time               int64
nivel                   int64
nombre_area             object
chofer                  int64
postulacion             int64
dtypes: float64(6), int64(6), object(3)
memory usage: 228.9+ MB


In [57]:
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8311264 entries, 0 to 8311263
Data columns (total 15 columns):
idaviso                 int64
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
GBA                     float64
Full-time               float64
nivel                   float64
nombre_area             object
chofer                  float64
postulacion             int64
dtypes: float64(10), int64(2), object(3)
memory usage: 1014.6+ MB


In [58]:
postulaciones = pd.concat([postulaciones,no_postulaciones])

In [59]:
postulaciones.fillna(0,inplace = True)

In [60]:
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10311264 entries, 0 to 1999999
Data columns (total 15 columns):
idaviso                 int64
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
GBA                     float64
Full-time               float64
nivel                   float64
nombre_area             object
chofer                  float64
postulacion             int64
dtypes: float64(10), int64(2), object(3)
memory usage: 1.2+ GB


In [62]:
postulaciones.drop_duplicates(subset = ['idaviso','idpostulante'], keep = 'first',inplace = True)
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8602569 entries, 0 to 1999999
Data columns (total 15 columns):
idaviso                 int64
idpostulante            object
edad                    float64
sexo                    float64
titulo_universitario    float64
titulo_secundario       float64
titulo_superior         float64
areas                   object
#postulaciones          float64
GBA                     float64
Full-time               float64
nivel                   float64
nombre_area             object
chofer                  float64
postulacion             int64
dtypes: float64(10), int64(2), object(3)
memory usage: 1.0+ GB


In [63]:
postulaciones['postulacion'].value_counts()

1    6603752
0    1998817
Name: postulacion, dtype: int64

In [64]:
test = pd.read_csv('data/test_final_100k.csv')

In [65]:
test = test.merge(usuarios, on='idpostulante', how = 'left')
test

Unnamed: 0,id,idaviso,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones
0,0,739260,6M9ZQR,42.0,1.0,1.0,0.0,1.0,"[Recepcionista, Laboratorio]",2.0
1,1,739260,6v1xdL,30.0,0.0,0.0,0.0,0.0,"[Comercial, Call Center, Atención al Cliente, ...",118.0
2,2,739260,ezRKm9,36.0,1.0,1.0,0.0,0.0,[Recursos Humanos],1.0
3,3,758580,1Q35ej,68.0,0.0,1.0,0.0,1.0,,
4,4,758580,EAN4J6,32.0,1.0,0.0,0.0,0.0,[Tesorería],1.0
5,5,758580,8R6pzR,28.0,0.0,0.0,1.0,0.0,"[Ventas, Atención al Cliente, Atención al Clie...",110.0
6,6,776420,aZJ2XN,29.0,0.0,1.0,0.0,0.0,"[Tesorería, Contabilidad, Contabilidad, Comerc...",4.0
7,7,776420,Nmpo3J,54.0,0.0,0.0,0.0,0.0,"[Soporte Técnico, Soporte Técnico, Soporte Téc...",18.0
8,8,776420,eVqWar,28.0,1.0,1.0,0.0,0.0,"[Selección, Selección]",2.0
9,9,820850,6ZBD33,37.0,0.0,1.0,0.0,1.0,"[Logística, Abastecimiento, Abastecimiento, Ab...",4.0


In [66]:
test = test.merge(avisos.drop(columns = 
                                         ['titulo',
                                         'descripcion',
                                         'denominacion_empresa']),
                            on = 'idaviso',
                            how = 'left')
test

Unnamed: 0,id,idaviso,idpostulante,edad,sexo,titulo_universitario,titulo_secundario,titulo_superior,areas,#postulaciones,GBA,Full-time,nivel,nombre_area,chofer
0,0,739260,6M9ZQR,42.0,1.0,1.0,0.0,1.0,"[Recepcionista, Laboratorio]",2.0,0,1,3,Comercial,0
1,1,739260,6v1xdL,30.0,0.0,0.0,0.0,0.0,"[Comercial, Call Center, Atención al Cliente, ...",118.0,0,1,3,Comercial,0
2,2,739260,ezRKm9,36.0,1.0,1.0,0.0,0.0,[Recursos Humanos],1.0,0,1,3,Comercial,0
3,3,758580,1Q35ej,68.0,0.0,1.0,0.0,1.0,,,0,1,1,Tecnologia / Sistemas,0
4,4,758580,EAN4J6,32.0,1.0,0.0,0.0,0.0,[Tesorería],1.0,0,1,1,Tecnologia / Sistemas,0
5,5,758580,8R6pzR,28.0,0.0,0.0,1.0,0.0,"[Ventas, Atención al Cliente, Atención al Clie...",110.0,0,1,1,Tecnologia / Sistemas,0
6,6,776420,aZJ2XN,29.0,0.0,1.0,0.0,0.0,"[Tesorería, Contabilidad, Contabilidad, Comerc...",4.0,0,1,1,Consultoria,0
7,7,776420,Nmpo3J,54.0,0.0,0.0,0.0,0.0,"[Soporte Técnico, Soporte Técnico, Soporte Téc...",18.0,0,1,1,Consultoria,0
8,8,776420,eVqWar,28.0,1.0,1.0,0.0,0.0,"[Selección, Selección]",2.0,0,1,1,Consultoria,0
9,9,820850,6ZBD33,37.0,0.0,1.0,0.0,1.0,"[Logística, Abastecimiento, Abastecimiento, Ab...",4.0,0,0,1,Recursos Humanos,0


In [67]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
id                      100000 non-null int64
idaviso                 100000 non-null int64
idpostulante            100000 non-null object
edad                    97162 non-null float64
sexo                    97325 non-null float64
titulo_universitario    94388 non-null float64
titulo_secundario       94388 non-null float64
titulo_superior         94388 non-null float64
areas                   76427 non-null object
#postulaciones          76427 non-null float64
GBA                     100000 non-null int64
Full-time               100000 non-null int64
nivel                   100000 non-null int64
nombre_area             100000 non-null object
chofer                  100000 non-null int64
dtypes: float64(6), int64(6), object(3)
memory usage: 12.2+ MB


In [68]:
test.fillna(0,inplace = True)

In [69]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
id                      100000 non-null int64
idaviso                 100000 non-null int64
idpostulante            100000 non-null object
edad                    100000 non-null float64
sexo                    100000 non-null float64
titulo_universitario    100000 non-null float64
titulo_secundario       100000 non-null float64
titulo_superior         100000 non-null float64
areas                   100000 non-null object
#postulaciones          100000 non-null float64
GBA                     100000 non-null int64
Full-time               100000 non-null int64
nivel                   100000 non-null int64
nombre_area             100000 non-null object
chofer                  100000 non-null int64
dtypes: float64(6), int64(6), object(3)
memory usage: 12.2+ MB


# Empieza perceptron


In [70]:
datos = postulaciones.drop(columns=['postulacion','idaviso','idpostulante','nombre_area','areas'])
categorias = postulaciones['postulacion']

In [71]:
test_size = 0.2
random_state = 0

In [72]:
datos_train, datos_test, cat_train, cat_test = train_test_split(datos,
                                                               categorias,
                                                               test_size = test_size,
                                                               random_state = random_state)

In [73]:
sc = StandardScaler()
sc.fit(datos_train)

datos_train_std = sc.transform(datos_train)
datos_test_std = sc.transform(datos_test)

In [89]:
data_train = datos_train_std[:,0:5]
data_test = datos_test_std[:,0:5]
n_iter=40
eta0 = 0.1

In [88]:
datos_train.iloc[1,0:5]

edad                    25.0
sexo                     1.0
titulo_universitario     0.0
titulo_secundario        0.0
titulo_superior          0.0
Name: 3765531, dtype: float64

In [90]:
ppn = Perceptron(n_iter = n_iter,
                eta0 = eta0,
                random_state = random_state)

ppn.fit(data_train,cat_train)





Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      max_iter=None, n_iter=40, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

In [91]:
cat_pred = ppn.predict(data_test)

print('accuracy: {0:.2f}%'.format(accuracy_score(cat_test,cat_pred)*100))

accuracy: 55.01%


In [92]:
ppn2 = Perceptron(n_iter = n_iter,
                eta0 = eta0,
                random_state = random_state)

ppn2.fit(datos_train_std,cat_train)





Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      max_iter=None, n_iter=40, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

In [93]:
cat_pred = ppn2.predict(datos_test_std)

print('accuracy: {0:.2f}%'.format(accuracy_score(cat_test,cat_pred)*100))

accuracy: 73.54%


In [94]:
final_test_data = test.drop(columns = ['id','idaviso','idpostulante','nombre_area','areas'])

In [95]:
final_test_data = sc.transform(final_test_data)

In [96]:
prediccion = ppn2.predict(final_test_data)
prediccion

array([1, 1, 1, ..., 1, 0, 1])

In [98]:
test['sepostulo'] = prediccion
entregable = test[['id','sepostulo']]


In [99]:
entregable.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 2 columns):
id           100000 non-null int64
sepostulo    100000 non-null int64
dtypes: int64(2)
memory usage: 2.3 MB


In [100]:
entregable.index = entregable['id']
entregable.drop(columns = 'id', inplace = True)
entregable

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,sepostulo
id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [101]:
entregable.to_csv('submits/submit_1.csv')

In [104]:
entregable0 = pd.read_csv('submits/submit_0.csv')

In [106]:
print('accuracy: {0:.2f}%'.format(accuracy_score(entregable0['sepostulo'],entregable['sepostulo'])*100))

accuracy: 70.81%
