In [None]:
import nbimporter
from funciones_custom import *

# Avisos
import warnings
warnings.filterwarnings("ignore")

# Anexo

- ### Datos:

In [None]:
# Raw-data folder: the current working directory with its "Codigo/Limpieza"
# segment swapped for "Datos/datos_sin_procesar/".
# Both segments are built with os.path.join so the replacement also works on
# systems whose path separator is not a backslash; on Windows the resulting
# strings are the same as the previous hard-coded ones.
ruta_datos_sin_procesar = os.getcwd().replace(
    os.path.join("Codigo", "Limpieza"),
    os.path.join("Datos", "datos_sin_procesar", ""),
)

In [None]:
# Load the two raw CSV files (job postings first, then their locations) from
# the raw-data folder.
df_datos_trabajos, df_ubicaciones = (
    pd.read_csv(ruta_datos_sin_procesar + nombre_csv)
    for nombre_csv in ('datos_trabajos.csv', 'datos_trabajos_ubicaciones.csv')
)

## 1. Creación de la lista de ubicaciones:

In [None]:
# Distinct values of each geographic level present in the locations table.
localidades, provincias, comunidades, paises = (
    df_ubicaciones[columna].unique()
    for columna in ('localidad', 'provincia', 'comunidad', 'pais')
)

# Extra locality appended by hand.
localidades = [*localidades, "alcala de henares"]
# Locality names that must be left out of the final list.
localidades_sobrantes = ['Para', 'Vir', 'Ver', 'Les', 'Vic', 'Pol', 'Tuy', 'Amer']

# One sub-list per level (localities, provinces, regions, countries) with the
# missing values removed.
lista_ubicaciones = [
    [valor for valor in nivel if not pd.isna(valor)]
    for nivel in (localidades, provincias, comunidades, paises)
]
# Only the locality level is filtered against the exclusion list.
lista_ubicaciones[0] = [
    localidad
    for localidad in lista_ubicaciones[0]
    if localidad not in localidades_sobrantes
]

# Persist the nested list in the working directory.
with open('ubicaciones.pickle', 'wb') as archivo_pickle:
    pickle.dump(lista_ubicaciones, archivo_pickle)

## 2. Búsqueda de las herramientas más frecuentes:

In [None]:
def tool_cleaner(df, numero=1000):
    """Return the most frequent tool tokens found in ``df["herramientas"]``.

    Every non-missing entry is converted to text, stripped of any character
    that is not a word character, whitespace or ``+``, and has its accented
    vowels transliterated with ``unidecode``. The combined text is tokenized
    with NLTK's Spanish tokenizer, Spanish stopwords are dropped and the
    remaining tokens are counted.

    Args:
        df: DataFrame with a ``herramientas`` column.
        numero: Maximum number of ``(token, count)`` pairs to return.

    Returns:
        List of ``(token, count)`` tuples ordered from most to least frequent,
        as produced by ``Counter.most_common``.
    """
    textos_limpios = []
    # Missing values are skipped: str(NaN) would otherwise be counted as a
    # tool literally named "nan".
    for herramientas in df["herramientas"].dropna():
        sin_simbolos = re.sub(r'[^\w\s+]', '', str(herramientas))
        textos_limpios.append(
            re.sub(r'[áéíóúüàèìòù]', lambda m: unidecode(m.group()), sin_simbolos)
        )

    # Join once instead of growing a string with += on every row.
    herramientas_totales = " ".join(textos_limpios)

    tokens = nltk.word_tokenize(text=herramientas_totales, language="spanish")
    # A set gives constant-time membership tests for the stopword filter.
    stopwords = set(nltk.corpus.stopwords.words("spanish"))

    tool_count = Counter(token for token in tokens if token not in stopwords)

    return tool_count.most_common(numero)

In [None]:
# Rank the 1000 most frequent tool tokens of the job postings; the bare name on
# the last line makes the notebook display the result.
common_tools = tool_cleaner(df_datos_trabajos, 1000)
common_tools

### 2.1. Lista herramientas:

In [None]:
# Read the tool mapping stored as JSON in the mapping-files folder.
with open(f"{ruta_archivos_mapeo}herramientas.json", 'rb') as archivo:
    dic_herramientas = json.loads(archivo.read())

In [None]:
# Flatten the mapping values (each one either a single entry or a list of
# entries) into one flat list.
lista_herramientas = []
for herramientas in dic_herramientas.values():
    if isinstance(herramientas, list):
        lista_herramientas.extend(herramientas)
    else:
        lista_herramientas.append(herramientas)

# Drop every empty-string entry. list.remove("") raised ValueError when no
# empty entry was present and only removed the first occurrence.
lista_herramientas = [
    herramienta for herramienta in lista_herramientas if herramienta != ""
]

# Persist the flat list in the working directory.
with open('lista_herramientas.pickle', 'wb') as archivo_pickle:
    pickle.dump(lista_herramientas, archivo_pickle)

## 3. Mapping

### 3.1. Titulo

In [None]:
# Regex pattern -> normalized job title, grouped by job family.
# Every pattern is wrapped in '.*' on both sides. Six patterns used to end in a
# bare '*' (e.g. '.*programmer*'), which quantified the last letter instead of
# "any trailing text"; they now end in '.*' like all the others.
# NOTE(review): the code that consumes this mapping is not visible here. If it
# returns the first matching pattern in insertion order, several generic
# patterns (e.g. '.*analista.*', '.*dba.*', '.*devops.*', '.*data.*science.*')
# shadow more specific ones listed after them in the same group -- confirm the
# intended precedence.
mapping_titulo  = {
    # Data
    'data driven' : {
        r'.*data.*science.*senior.*'     : 'senior data scientist',
        r'.*data.*science.*junior.*'     : 'junior data scientist',
        r'.*data.*science.*lead.*'       : 'lead data scientist',
        r'.*data.*sciencist.*'           : 'data scientist',
        r'.*data.*science.*'             : 'data scientist',
        r'.*data.*sciencist_salud.*'     : 'data scientist',
        r'.*data.*scientist.*senior.*'   : 'senior data scientist',
        r'.*data.*scientist.*junior.*'   : 'junior data scientist',
        r'.*data.*scientist.*lead.*'     : 'lead data scientist',
        r'.*data.*scientist.*'           : 'data scientist',
        r'.*cientifico.*datos.*senior.*' : 'senior data scientist',
        r'.*cientifico.*datos.*junior.*' : 'junior data scientist',
        r'.*cientifico.*datos.*lead.*'   : 'lead data scientist',
        r'.*cientifico.*datos.*'         : 'data scientist',

        r'.*analista.*' : 'analyst',
        r'.*analyst.*'  : 'analyst',

        r'.*data.*engineer.*senior.*'    : 'senior data engineer',
        r'.*data.*engineer.*junior.*'    : 'junior data engineer',
        r'.*data.*engineer.*lead.*'      : 'lead data engineer',
        r'.*data.*engineer.*'            : 'data engineer',
        r'.*ingeniero.*datos.*senior.*'  : 'senior data engineer',
        r'.*ingeniero.*datos.*junior.*'  : 'junior data engineer',
        r'.*ingeniero.*datos.*lead.*'    : 'lead data engineer',
        r'.*ingeniero.*datos.*'          : 'data engineer',

        r'.*data.*analyst.*senior.*'     : 'senior data analyst',
        r'.*data.*analyst.*junior.*'     : 'junior data analyst',
        r'.*data.*analyst.*lead.*'       : 'lead data analyst',
        r'.*data.*analytics.*'           : 'data analyst',
        r'.*data.*analyst.*'             : 'data analyst',
        r'.*analista.*datos.*senior.*'   : 'senior data analyst',
        r'.*analista.*datos.*junior.*'   : 'junior data analyst',
        r'.*analista.*datos.*lead.*'     : 'lead data analyst',
        r'.*analista.*datos.*'           : 'data analyst',

        r'.*data.*architect.*'           : 'data architect',
        r'.*arquitecto.*datos.*'         : 'data architect',

        r'.*database.*administrator.*'     : 'database administrator',
        r'.*dba.*'                         : 'database administrator',
        r'.*administrador.*bases.*datos.*' : 'database administrator',

        r'.*dba.*oracle.*'                 : 'oracle database administrator',

        r'.*devops.*engineer.*senior.*'  : 'senior devops engineer',
        r'.*devops.*engineer.*junior.*'  : 'junior devops engineer',
        r'.*devops.*engineer.*lead.*'    : 'lead devops engineer',
        r'.*devops.*engineer.*'          : 'devops engineer',
        r'.*devops.*'                    : 'devops',
        r'.*ingeniero.*devops.*senior.*' : 'senior devops engineer',
        r'.*ingeniero.*devops.*junior.*' : 'junior devops engineer',
        r'.*ingeniero.*devops.*lead.*'   : 'lead devops engineer',
        r'.*ingeniero.*devops.*'         : 'devops engineer',
    },

    # Machine learning
    'machine learning engineer' : {
        r'.*machine.*learning.*engineer.*senior.*'  : 'senior machine learning engineer',
        r'.*machine.*learning.*engineer.*junior.*'  : 'junior machine learning engineer',
        r'.*machine.*learning.*engineer.*lead.*'    : 'lead machine learning engineer',
        r'.*machine.*learning.*engineer.*'          : 'machine learning engineer',

        r'.*machine.*learning.*scientist.*senior.*' : 'senior machine learning data scientist',
        r'.*machine.*learning.*scientist.*junior.*' : 'junior machine learning data scientist',
        r'.*machine.*learning.*scientist.*lead.*'   : 'lead machine learning data scientist',
        r'.*data.*scientist.*machine.*learning.*'   : 'machine learning data scientist',
        r'.*machine.*learning.*scientist.*'         : 'machine learning data scientist',
    },

    # Programming
    'programador' : {
        r'.*programmer.*senior.*'  : 'senior programmer',
        r'.*programmer.*junior.*'  : 'junior programmer',
        r'.*programmer.*lead.*'    : 'lead programmer',
        r'.*programmer.*'          : 'programmer',
        r'.*programador.*senior.*' : 'senior programmer',
        r'.*programador.*junior.*' : 'junior programmer',
        r'.*programador.*lead.*'   : 'lead programmer',
        r'.*programador.*'         : 'programmer',

        r'.*fullstack.*senior.*'  : 'senior fullstack',
        r'.*fullstack.*junior.*'  : 'junior fullstack',
        r'.*fullstack.*lead.*'    : 'lead fullstack',
        r'.*full.*stack.*'        : 'fullstack',
        r'.*fullstack.*'          : 'fullstack',

        r'.*developer.*senior.*'     : 'senior developer',
        r'.*developer.*junior.*'     : 'junior developer',
        r'.*developer.*lead.*'       : 'lead developer',
        r'.*developer.*'             : 'developer',
        r'.*desarrollador.*senior.*' : 'senior developer',
        r'.*desarrollador.*junior.*' : 'junior developer',
        r'.*desarrollador.*lead.*'   : 'lead developer',
        r'.*desarrollador.*'         : 'developer',

        r'.*engineer.*id.*'  : 'id engineer',
        r'.*ingeniero.*id.*' : 'id engineer',

        r'.*software.*engineer.*senior.*'  : 'senior software engineer',
        r'.*software.*engineer.*junior.*'  : 'junior software engineer',
        r'.*software.*engineer.*lead.*'    : 'lead software engineer',
        r'.*software.*engineer.*'          : 'software engineer',
        r'.*ingeniero.*software.*senior.*' : 'senior software engineer',
        r'.*ingeniero.*software.*junior.*' : 'junior software engineer',
        r'.*ingeniero.*software.*lead.*'   : 'lead software engineer',
        r'.*ingeniero.*software.*'         : 'software engineer',

        r'.*software.*architect.*'         : 'software architect',
        r'.*arquitecto.*software.*'        : 'software architect',
    },

    # Systems
    'sistemas' : {
        r'.*systems.*engineer.*senior.*'   : 'senior systems engineer',
        r'.*systems.*engineer.*junior.*'   : 'junior systems engineer',
        r'.*systems.*engineer.*lead.*'     : 'lead systems engineer',
        r'.*systems.*engineer.*'           : 'systems engineer',
        r'.*ingeniero.*sistemas.*senior.*' : 'senior systems engineer',
        r'.*ingeniero.*sistemas.*junior.*' : 'junior systems engineer',
        r'.*ingeniero.*sistemas.*lead.*'   : 'lead systems engineer',
        r'.*ingeniero.*sistemas.*'         : 'systems engineer',

        r'.*systems.*technician.*senior.*' : 'senior systems technician',
        r'.*systems.*technician.*junior.*' : 'junior systems technician',
        r'.*systems.*technician.*lead.*'   : 'lead systems technician',
        r'.*systems.*technician.*'         : 'systems technician',
        r'.*tecnico.*sistemas.*senior.*'   : 'senior systems technician',
        r'.*tecnico.*sistemas.*junior.*'   : 'junior systems technician',
        r'.*tecnico.*sistemas.*lead.*'     : 'lead systems technician',
        r'.*tecnico.*sistemas.*'           : 'systems technician',

        r'.*systems.*administrator.*'      : 'systems administrator',
        r'.*administrador.*sistemas.*'     : 'systems administrator',
        r'.*operador.*sistemas.*'          : 'systems administrator',

        r'.*technician.*network.*security.*senior.*' : 'senior network and security technician',
        r'.*technician.*network.*security.*junior.*' : 'junior network and security technician',
        r'.*technician.*network.*security.*lead.*'   : 'lead network and security technician',
        r'.*technician.*network.*security.*'         : 'network and security technician',
        r'.*tecnico.*redes.*seguridad.*senior.*'     : 'senior network and security technician',
        r'.*tecnico.*redes.*seguridad.*junior.*'     : 'junior network and security technician',
        r'.*tecnico.*redes.*seguridad.*lead.*'       : 'lead network and security technician',
        r'.*tecnico.*redes.*seguridad.*'             : 'network and security technician',
        r'.*tecnico.*seguridad.*comunicaciones.*'    : 'network and security technician',

        r'.*network.*technician.*'                   : 'network technician',
        r'.*tecnico.*redes.*'                        : 'network technician',

        r'.*network.*engineer.*'                     : 'network engineer',
        r'.*ingeniero.*redes.*'                      : 'network engineer',

        r'.*network.*administrator.*'                : 'network administrator',
        r'.*administrador.*redes.*'                  : 'network administrator',

        r'.*it.*engineer.*senior.*'     : 'senior it engineer',
        r'.*it.*engineer.*junior.*'     : 'junior it engineer',
        r'.*it.*engineer.*lead.*'       : 'lead it engineer',
        r'.*it.*engineer.*'             : 'it engineer',
        r'.*ingeniero.*it.*senior.*'    : 'senior it engineer',
        r'.*ingeniero.*it.*junior.*'    : 'junior it engineer',
        r'.*ingeniero.*it.*lead.*'      : 'lead it engineer',
        r'.*ingeniero.*it.*'            : 'it engineer',
        r'.*ingeniero.*ti.*senior.*'    : 'senior it engineer',
        r'.*ingeniero.*ti.*junior.*'    : 'junior it engineer',
        r'.*ingeniero.*ti.*lead.*'      : 'lead it engineer',
        r'.*ingeniero.*ti.*'            : 'it engineer',

        r'.*it.*specialist.*senior.*'   : 'senior it specialist',
        r'.*it.*specialist.*junior.*'   : 'junior it specialist',
        r'.*it.*specialist.*lead.*'     : 'lead it specialist',
        r'.*it.*specialist.*'           : 'it specialist',
        r'.*especialista.*ti.*senior.*' : 'senior it specialist',
        r'.*especialista.*ti.*junior.*' : 'junior it specialist',
        r'.*especialista.*ti.*lead.*'   : 'lead it specialist',
        r'.*especialista.*ti.*'         : 'it specialist',

        r'.*it.*support.*'              : 'it support',
        r'.*it.*technician.*'           : 'it technician',
        r'.*tecnico.*it.*'              : 'it technician',
        r'.*it.*manager.*'              : 'it manager',
        r'.*responsable.*it.*'          : 'it manager',

        r'.*computer.*technician.*'      : 'computer technician',
        r'.*tecnico.*informatico.*'      : 'computer technician',
        r'.*tecnico.*microinformatica.*' : 'computer technician',
        r'.*tecnica.*microinformatica.*' : 'computer technician',

        r'.*ingeniero.*informatico.*'    : 'computer engineer',
        r'.*computer.*engineer.*'        : 'computer engineer',

        r'.*cloud.*engineering.*' : 'cloud engineering',
        r'.*cloud.*engineer.*'    : 'cloud engineering',
        r'.*ingeniero.*cloud.*'   : 'cloud engineering',

        r'.*cloud.*architect.*'   : 'cloud architect',
        r'.*arquitecto.*cloud.*'  : 'cloud architect',

        r'.*support.*technician.*' : 'support technician',
        r'.*technical.*support.*'  : 'support technician',
        r'.*tecnico.*soporte.*'    : 'support technician',
        r'.*tecnico.*helpdesk.*'   : 'support technician',

        r'.*ingeniero.*soporte.*'  : 'support engineer',

        r'.*hardware.*engineer.*' : 'hardware engineer',
        r'.*ingeniero.*hardware.*': 'hardware engineer',

        r'.*teleoperator.*' : 'teleoperator',
        r'.*teleoperador.*' : 'teleoperator',
    },

    # Administration
    'administracion' : {
        r'.*qa.*engineer.*senior.*'       : 'senior qa engineer',
        r'.*qa.*engineer.*junior.*'       : 'junior qa engineer',
        r'.*qa.*engineer.*lead.*'         : 'lead qa engineer',
        r'.*qa.*engineer.*'               : 'qa engineer',
        r'.*ingeniero.*calidad.*senior.*' : 'senior qa engineer',
        r'.*ingeniero.*calidad.*junior.*' : 'junior qa engineer',
        r'.*ingeniero.*calidad.*lead.*'   : 'lead qa engineer',
        r'.*ingeniero.*calidad.*'         : 'qa engineer',
        r'.*ingeniero.*qa.*'              : 'qa engineer',
        r'.*ingeniero.*testing.*'         : 'qa engineer',

        r'.*qa.*automation.*'             : 'qa automation',

        r'.*qa.*tester.*'                 : 'qa tester',

        r'.*tech.*lead.*' : 'lead tech',

        r'.*project.*manager.*senior.*' : 'senior project manager',
        r'.*project.*manager.*junior.*' : 'junior project manager',
        r'.*project.*manager.*lead.*'   : 'lead project manager',
        r'.*project.*manager.*'         : 'project manager',
        r'.*manager.*'                  : 'project manager',
        r'.*jefe.*proyecto.*'           : 'project manager',
    },

    # Cybersecurity
    'ciberseguridad' : {
        r'.*cybersecurity.*senior.*'  : 'senior cybersecurity',
        r'.*cybersecurity.*junior.*'  : 'junior cybersecurity',
        r'.*cybersecurity.*lead.*'    : 'lead cybersecurity',
        r'.*cybersecurity.*'          : 'cybersecurity',
        r'.*cibersecurity.*'          : 'cybersecurity',
        r'.*ciberseguridad.*senior.*' : 'senior cybersecurity',
        r'.*ciberseguridad.*junior.*' : 'junior cybersecurity',
        r'.*ciberseguridad.*lead.*'   : 'lead cybersecurity',
        r'.*ciberseguridad.*'         : 'cybersecurity',
    },

    # Consulting
    'otros' : {
        r'.*consultant.*senior.*' : 'senior consultant',
        r'.*consultant.*junior.*' : 'junior consultant',
        r'.*consultant.*lead.*'   : 'lead consultant',
        r'.*consultant.*'         : 'consultant',
        r'.*consultor.*senior.*'  : 'senior consultant',
        r'.*consultor.*junior.*'  : 'junior consultant',
        r'.*consultor.*lead.*'    : 'lead consultant',
        r'.*consultor.*'          : 'consultant',
    },
}

### 3.2. Presencialidad

In [None]:
# Regex pattern -> normalized work-location label; the first match in
# insertion order wins.
mapping_presencialidad  = {
    r'.*remoto.*'     : 'remoto',
    r'.*hibrido.*'    : 'hibrido',
    r'.*presencial.*' : 'presencial'
}


def map_presencialidad(text):
    """Normalize a free-text work-location description.

    Args:
        text: Raw description. Anything that is not a string (``None``, NaN,
            numbers) is treated as unspecified instead of raising TypeError.

    Returns:
        The label of the first pattern in ``mapping_presencialidad`` that
        matches ``text`` (case-insensitively), or ``'no especificado'`` when
        nothing matches.
    """
    if not isinstance(text, str):
        return 'no especificado'

    # DOTALL lets '.*' cross line breaks, so a keyword that appears after the
    # first line of a multi-line description is still found.
    flags = re.IGNORECASE | re.DOTALL
    for pattern, label in mapping_presencialidad.items():
        if re.match(pattern, text, flags):
            return label
    return 'no especificado'

### 3.3. Jornada

In [None]:
# Regex pattern -> normalized working-schedule label; the first match in
# insertion order wins, so more specific patterns must stay ahead of the
# generic ones they overlap with (e.g. 'parcial.*indiferente' before
# 'indiferente').
mapping_jornada  = {
    r'.*completa.*'             : 'jornada completa',
    r'.*intensiva.*mañana.*'    : 'jornada intensiva de mañana',
    r'.*parcial.*mañana.*'      : 'jornada parcial de mañana',
    r'.*media.*'                : 'media jornada',
    r'.*parcial.*indiferente.*' : 'jornada parcial indiferente',
    r'.*practicas.*'            : 'practicas',
    r'.*indiferente.*'          : 'jornada indiferente',
    r'.*por horas.*'            : 'por horas',
    r'.*obra.*'                 : 'contrato por obra',
    r'.*temporal.*'             : 'contrato temporal',
    r'.*autonomo.*'             : 'trabajo autonomo',
    r'.*rotatorio.*'            : 'turno rotatorio',
    r'.*tarde.*'                : 'jornada de tarde',
    r'.*estudiantes.*'          : 'trabajo para estudiantes'
}


def map_jornada(text):
    """Normalize a free-text working-schedule description.

    Args:
        text: Raw description. Anything that is not a string (``None``, NaN,
            numbers) is treated as unspecified instead of raising TypeError.

    Returns:
        The label of the first pattern in ``mapping_jornada`` that matches
        ``text`` (case-insensitively), or ``'no especificado'`` when nothing
        matches.
    """
    if not isinstance(text, str):
        return 'no especificado'

    # DOTALL lets '.*' cross line breaks, so a keyword that appears after the
    # first line of a multi-line description is still found.
    flags = re.IGNORECASE | re.DOTALL
    for pattern, label in mapping_jornada.items():
        if re.match(pattern, text, flags):
            return label
    return 'no especificado'

### 3.4. Contrato

In [None]:
# Regex pattern -> normalized contract-type label; the first match in
# insertion order wins.
mapping_contrato  = {
    r'.*indefinido.*'      : 'indefinido',
    r'.*discontinuo.*'     : 'indefinido',
    r'.*temporal.*'        : 'temporal',
    r'.*determinada.*'     : 'temporal',
    r'.*determinar.*'      : 'temporal',
    r'.*practicas.*'       : 'practicas',
    r'.*formativo.*'       : 'practicas',
    r'.*autonomo.*'        : 'autonomo',
    r'.*otros contratos.*' : 'otros contratos',
    r'.*obra.*servicio.*'  : 'obra o servicio',
    r'.*parcial.*'         : 'parcial',
    r'.*relevo.*'          : 'relevo'
}


def map_contrato(text):
    """Normalize a free-text contract-type description.

    Args:
        text: Raw description. Anything that is not a string (``None``, NaN,
            numbers) is treated as unspecified instead of raising TypeError.

    Returns:
        The label of the first pattern in ``mapping_contrato`` that matches
        ``text`` (case-insensitively), or ``'no especificado'`` when nothing
        matches.
    """
    if not isinstance(text, str):
        return 'no especificado'

    # DOTALL lets '.*' cross line breaks, so a keyword that appears after the
    # first line of a multi-line description is still found.
    flags = re.IGNORECASE | re.DOTALL
    for pattern, label in mapping_contrato.items():
        if re.match(pattern, text, flags):
            return label
    return 'no especificado'

### 3.5. Beneficios

In [None]:
# Keywords (Spanish and English) whose presence signals a job benefit.
lista_beneficios = ["beneficios", "pensiones", "pension", "cheque", "transporte", "transport", "cheques", "medical", "ticket", "tickets", "dental", "medico", "guarderia", "childcare", "child", "vida", "health"]


def buscar_beneficios(texto, lista_beneficios):
    """Tell whether ``texto`` mentions at least two different benefit keywords.

    Args:
        texto: Text to inspect. Anything that is not a string (``None``, NaN,
            numbers) yields ``False`` instead of raising TypeError.
        lista_beneficios: Keywords to look for. Each one is inserted as-is
            between ``\\b`` word boundaries, so it is interpreted as a regular
            expression fragment.

    Returns:
        ``True`` as soon as a second keyword is found as a whole word
        (case-insensitively), ``False`` otherwise.
    """
    if not isinstance(texto, str):
        return False

    encontrados = 0
    for beneficio in lista_beneficios:
        # IGNORECASE keeps this helper consistent with the map_* helpers.
        if re.search(r'\b' + beneficio + r'\b', texto, re.IGNORECASE):
            encontrados += 1
            # A single hit is not enough; stop at the second one.
            if encontrados > 1:
                return True

    return False