
**DataSet**:  DataScientist.csv  
    https://www.kaggle.com/andrewmvd/data-scientist-jobs  


In [13]:
#bibliotecas
import sys
import warnings
import pathlib
from termcolor import colored
#bibliotecas para manejo de datos
import pandas as pd
import numpy as np
from scipy import stats
import re
import unicodedata
import nltk
import unicodedata
from random import sample
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk import FreqDist
from statsmodels.stats.outliers_influence import variance_inflation_factor
#bibliotecas para graficar
import plotly
import plotly.graph_objects as go
import plotly.express as px
import cufflinks as cf
import stylecloud
from PIL import Image
from plotly.offline import plot,iplot
pd.options.plotting.backend = "plotly"
cf.go_offline()
pd.set_option("display.max_columns",200)

## LIBRERIA

### Importación de módulo personal

In [27]:
sys.path.append(  pathlib.Path().parent.absolute()  )
#Importando módulo con funciones a utilizar
# from library import *

### Campos de información

**Job Title**: Nombre de la oferta de trabajo.  
**Salary Estimate**: Rango del salario estimado por el portal Glassdoor, en miles de dólares representado en la notación K.  
**Job Description**: Descripción de distintos rubros de la empresa, puesto y solicitud.  
**Rating**: Calificación por parte de usuarios de la empresa en cuestion. El valor se encuentra entre 1.0 y 5.0, redondeado con un decimal.  
**Company Name**: Nombre de la compañía que ofrece la oferta de trabajo.  
**Location**: Ciudad donde se ubica el trabajo ofertado, junto con el estado o país al que pertenece.  
**Headquarters**: Ciudad y estado/país donde se ubica la sede de la empresa.  
**Size**: Divide a las empresas según su número de empleados en rangos
específicos.  
**Founded**: Año de fundación de la empresa.  
**Type of Ownership**: Tipo de compañía según razón social o identidad.  
**Industry**: Tipo de compañía según idustria en la que trabaja.  
**Sector**: Sector en la que la compañía ofrece sus bienes o servicios.  
**Revenue**: Utilidades de la compañía en dolares al año.  
**Competitors**: Principal compañía competidora de la compañía que ofrece la oferta de trabajo.
**Easy Apply**: Etiqueta que indica si la aplicación por el puesto es sencilla.  
**Salary Minimum**: Rango inferior del estimado del salario. Extraido de Salary Estimate.  
**Salary Maximum**: Rango superior del estimado del salario. Extraido de Salary Estimate.



### Importación del dataset

* Como valores ausentes, contamos con registros de tipo -1 o 'Unknown / Non-Applicable'.  
* De la variable de Salary Estimate extraemos salary minimum y salary maximum desde la importación de los datos por la importancia de dicha información.

In [28]:
#Importando dataset
#   Valores nulos son -1 en esta tabla
df = pd.read_csv("DataScientist.csv",na_values=[-1,'-1','Unknown / Non-Applicable'])
#Eliminando columnas de indice
df = df.drop(df.columns[:2], axis=1)
#Cambiando espacios en nombres de columnas por '_'
# Pasando a minusculas
df.columns = df.columns.str.replace(' ','_').map(str.lower)
#Separando salary_estimate
df[['salary_minimum','salary_maximum']] = df['salary_estimate'].str.split('-', expand=True)
df[['salary_maximum','salary_estimate_source']] = df['salary_maximum'].str.split(' ',1,expand=True)
df.head()

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,salary_minimum,salary_maximum,salary_estimate_source
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007.0,Company - Private,Travel Agencies,Travel & Tourism,,,,$111K,$181K,(Glassdoor est.)
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008.0,Company - Private,"Health, Beauty, & Fitness",Consumer Services,,,,$111K,$181K,(Glassdoor est.)
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,,Decode_M,"New York, NY","New York, NY",1 to 50 employees,,Unknown,,,,,True,$111K,$181K,(Glassdoor est.)
3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019.0,Company - Private,Internet,Information Technology,,"Zocdoc, Healthgrades",,$111K,$181K,(Glassdoor est.)
4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007.0,Company - Private,Advertising & Marketing,Business Services,,"BBDO, Grey Group, Droga5",,$111K,$181K,(Glassdoor est.)


In [29]:
df["job_title"].value_counts().index

Index(['Data Scientist', 'Data Engineer', 'Data Analyst',
       'Senior Data Scientist', 'Senior Data Analyst',
       'Machine Learning Engineer', 'Big Data Engineer',
       'Business Intelligence Analyst', 'Senior Data Engineer',
       'Sr. Data Scientist',
       ...
       'Senior Data Engineer - Apps Systems Engineer 5',
       'Big Data Engineer II', 'Information Security Data Analyst',
       'Sr Data Engineer (memSql)', 'Data Architect - Azure',
       'Phoenix Data Science Tutor Jobs', 'Civil Engineer/GIS Data Analyst',
       'Principal Device Modeling Engineer', 'SQL/SAS Data Analyst',
       'Patient Safety Physician or Safety Scientist - UK, Europe or the US'],
      dtype='object', length=2079)

# Etiquetado variables

In [30]:
#Defininiendo generador para tener facil acceso al nombre de las 
# columnas con cada invocación.
gen_columns = (col for col in df.columns)

In [31]:
#Revisando tipo de dato y ejemplos de cada columna para decidir 
# su etiqueta.
col = next(gen_columns)
print(col)
print(type(df[col][0]))
print('HEAD\n',df[col].head())
df[col].value_counts()

job_title
<class 'str'>
HEAD
 0                Senior Data Scientist
1    Data Scientist, Product Analytics
2                 Data Science Manager
3                         Data Analyst
4               Director, Data Science
Name: job_title, dtype: object


Data Scientist                                                         274
Data Engineer                                                          260
Data Analyst                                                           246
Senior Data Scientist                                                   91
Senior Data Analyst                                                     47
                                                                      ... 
Phoenix Data Science Tutor Jobs                                          1
Civil Engineer/GIS Data Analyst                                          1
Principal Device Modeling Engineer                                       1
SQL/SAS Data Analyst                                                     1
Patient Safety Physician or Safety Scientist - UK, Europe or the US      1
Name: job_title, Length: 2079, dtype: int64

In [32]:
#Definiendo tipo de las columnas.
 #continuas
c_feats = ['salary_minimum','salary_maximum']
 #discretas
v_feats = ['job_title','rating','location','headquarters','size','founded','type_of_ownership','industry','sector','revenue','competitors','easy_apply','salary_estimate_source','salary_estimate']
 #fehcas
d_feats = []
 #texto
t_feats = ['job_description','company_name']

#Etiquetando columnas
df = label_columns(df,c_feats,"c_")
df = label_columns(df,v_feats,"v_")
df = label_columns(df,t_feats,"t_")
df = label_columns(df,d_feats,"d_")

NameError: name 'label_columns' is not defined

# Duplicados

In [33]:
df

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,salary_minimum,salary_maximum,salary_estimate_source
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007.0,Company - Private,Travel Agencies,Travel & Tourism,,,,$111K,$181K,(Glassdoor est.)
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008.0,Company - Private,"Health, Beauty, & Fitness",Consumer Services,,,,$111K,$181K,(Glassdoor est.)
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,,Decode_M,"New York, NY","New York, NY",1 to 50 employees,,Unknown,,,,,True,$111K,$181K,(Glassdoor est.)
3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019.0,Company - Private,Internet,Information Technology,,"Zocdoc, Healthgrades",,$111K,$181K,(Glassdoor est.)
4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007.0,Company - Private,Advertising & Marketing,Business Services,,"BBDO, Grey Group, Droga5",,$111K,$181K,(Glassdoor est.)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,AWS Data Engineer,$55K-$112K (Glassdoor est.),About Us\n\nTachyon Technologies is a Digital ...,4.4,Tachyon Technologies\n4.4,"Dublin, OH","Irving, TX",201 to 500 employees,2011.0,Company - Private,IT Services,Information Technology,$10 to $25 million (USD),,,$55K,$112K,(Glassdoor est.)
3905,Data Analyst â Junior,$55K-$112K (Glassdoor est.),"Job description\nInterpret data, analyze resul...",5.0,"Staffigo Technical Services, LLC\n5.0","Columbus, OH","Woodridge, IL",51 to 200 employees,2008.0,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),,,$55K,$112K,(Glassdoor est.)
3906,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),Job DescriptionThe Security Analytics Data Eng...,3.8,"PDS Tech, Inc.\n3.8","Dublin, OH","Irving, TX",5001 to 10000 employees,1977.0,Company - Private,Staffing & Outsourcing,Business Services,$100 to $500 million (USD),,,$55K,$112K,(Glassdoor est.)
3907,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),The Security Analytics Data Engineer will inte...,4.0,Data Resource Technologies\n4.0,"Dublin, OH","Omaha, NE",1 to 50 employees,,Company - Private,Accounting,Accounting & Legal,Less than $1 million (USD),,,$55K,$112K,(Glassdoor est.)


In [34]:
#Numero de duplicados
df.duplicated().sum()

0

No hay duplicados de facil identidicación en el dataframe.

# Completitud

In [40]:
df["v_competitors"].value_counts().index

KeyError: 'v_competitors'

In [36]:
df["v_revenue"].value_counts()

KeyError: 'v_revenue'

In [37]:
df["t_company_name"].value_counts()

KeyError: 't_company_name'

In [38]:
df["v_sector"].value_counts()

KeyError: 'v_sector'

In [39]:
df[df["v_revenue"]=="$10+ billion (USD)"]["t_company_name"].value_counts().index

KeyError: 'v_revenue'

In [None]:
completitud(df)

In [None]:
# eliminando columnas con menos de 80% de completitud
df=df.drop(columns=['v_easy_apply','v_competitors','v_founded','v_revenue'])

# Limpieza de texto

In [None]:
# Limpiando todas las variables que consistan en texto
df['v_job_title'] = df["v_job_title"].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z ]",replace=""))
df['v_location'] = df["v_location"].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z, ]",replace=""))
df["v_headquarters"] = df["v_headquarters"].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z, ]",replace=""),na_action='ignore')
df['v_industry'] = df['v_industry'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z ]",replace=""),na_action='ignore')
df['v_sector'] = df['v_sector'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z ]",replace=""),na_action='ignore')
df['v_type_of_ownership'] = df['v_type_of_ownership'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z-/ ]",replace=""),na_action='ignore')
df['v_size'] = df['v_size'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z0-9 ]",replace=""),na_action='ignore')
df['t_company_name'] = df['t_company_name'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z- ]",replace=""))
df['t_job_description'] = df['t_job_description'].map(lambda x:clean_text(x.lower(), pattern="[^a-zA-Z ]",replace=""))

# Consistencia

**c_salary_minimum & c_salary_maximum**

In [None]:
## Se transforma el salario a flotante
df['c_salary_minimum'] = df['c_salary_minimum'].map(transform_salary)    
df['c_salary_maximum'] = df['c_salary_maximum'].map(transform_salary)    

In [None]:
#Creando columna con media del salario mínimo y máximo por su 
#   importancia para el EDA
# Dicha columna es nuestro objetivo a modelar
df['c_salary_mean'] = df[['c_salary_minimum','c_salary_maximum']].mean(axis=1)
df[['c_salary_minimum','c_salary_maximum','c_salary_mean']].head()

In [None]:
df['c_salary_mean'].describe(percentiles=np.arange(0.1,1,.1))

In [None]:
#Asegurandonos que todos lo valores sobrepasen el salario minimo estadounidense
#Considerando salario minimo = $7.5/h
#   trabajo de medio tiempo = 4h/dia
#   calendario laboral estadounidense en 2019 = 261 dias
salario_minimo = 7.25*4*261
print(df['c_salary_minimum'][df['c_salary_minimum']<salario_minimo])
print(df['c_salary_maximum'][df['c_salary_maximum']<salario_minimo])

In [None]:
#No hay ninguna oferta por debajo del salario mínimo.

In [None]:
#Asegurandonos que salary_maximum es mayor a salary_minimum
df[df['c_salary_minimum']>df['c_salary_maximum']]

**v_rating**

In [None]:
#Transformando a numero
pd.to_numeric(df['v_rating'])
#Verificando valores entre 1 a 5 
df['v_rating'].describe()

# Normalizacion

### Normalizando v_job_title

In [None]:
df['v_job_title'].value_counts(1)

**Normalizando de tal forma que las únicas categorías sean:**  
- data scientist: cualquier titulo que contanga ambas palabras  
- data analyst: cualquier titulo que contanga ambas palabras  
- data engineer: cualquier titulo que contanga ambas palabras  
- specific discipline scientist: cualquier titulo que contanga la palabra scientist en él, sin contener la palabra data.  
- machine learning professional: cualquier titulo que contanga las palabras machine learning.  
- businesss intelligence analyst: cualquier empleo que contenga las palabras business intelligence.  
- analyst of other nature: cualquier empleo que contenga la palabra analyst sin, sin contener las palabras data o business.
- highly specific: todos los títulos con una sola ocurrencia que no fueron agrupados en las anteriores categorías.
- others: categorías restantes que en conjunto, siguen consistiendo en la categoría de menor frecuencia.

In [None]:
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*data.*scien.*$)', 'data scientist')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*data.*engin.*$)', 'data engineer')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*data.*anal.*$)', 'data analyst')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*machine.*learning.*$)', 'machine learning professional')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*business.* anal.*$)', 'business intelligence analyst')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^((?!data).)*scientist.*$)', 'specific discipline scientist')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^((?!(data)|(business)).)*anal.*$)', 'analyst of other nature')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*data.*architect.*$)', 'data architect')
df["v_job_title"] = df["v_job_title"].str.replace(r'(^.*cybersecurity.*$)', 'cybersecurity applied to data science')

#Separando a los trabajos que no pudieron ser agrupados en las categorias de arriba
# y que tinen una sola ocurrencia
counts = pd.DataFrame(df["v_job_title"].value_counts())
one_occurrence = counts[counts['v_job_title']==1].index
dictio = dict(zip(one_occurrence,["highly specific"]*len(one_occurrence)))
df["v_job_title"] = df["v_job_title"].replace(dictio)

#Separando categorias menored en otros
dictio = dict(zip(list(df["v_job_title"].value_counts().index.tolist()[-9:]),["others"]*(9)))
df["v_job_title"] = df["v_job_title"].replace(dictio)

In [None]:
df['v_job_title'].value_counts(1)

### Normalizando location

In [None]:
df['v_location'].value_counts(1)

In [None]:
#Crearemos una columna para ciudad y estado a partir de la locacion
# De esta forma, es más sencillo agrupar location por estado y hacer EDA.
df[['v_city','v_state']] = df['v_location'].str.split(', ', expand=True)
#Eliminamos v_location
df = df.drop(columns='v_location')
df[['v_city','v_state']]

### Normalizando v_industry

In [None]:
df['v_industry'].value_counts()

In [None]:
#Agrupamos los que tienen 3 o menos ocurrencias en la categoria others
counts = pd.DataFrame(df["v_industry"].value_counts())
low_occurrence = counts[counts['v_industry']<=4].index
dictio = dict(zip(low_occurrence,["others"]*len(low_occurrence)))
df["v_industry"] = df["v_industry"].replace(dictio)

In [None]:
print(f"no de categorias: {len(df['v_industry'].value_counts())}")
df["v_industry"].value_counts()

### Normalizando Headquarters

In [None]:
df["v_headquarters"].value_counts()

In [None]:
#Crearemos una columna para ciudad y estado de los headquarters
df[['v_headquarters_city','v_headquarters_state']] = df['v_headquarters'].str.split(', ',1,expand=True)
#Eliminamos v_lheadquarters
df = df.drop(columns='v_headquarters')
df[['v_headquarters_city','v_headquarters_state']]

In [None]:
#Notamos que hay registros que son de otros paises,
# por lo que creamos la categoria 'another country'
paises = list(filter(lambda text:len(text)>2,df['v_headquarters_state'].dropna()))
paises = dict(zip(paises,['foreign country']*len(paises)))
df['v_headquarters_state']=df['v_headquarters_state'].replace(paises)
df['v_headquarters_state'].value_counts()

### Normalizando v_type_ownership

In [None]:
df['v_type_of_ownership'].value_counts()

In [None]:
#Reemplazamos franchise por other organization
df['v_type_of_ownership']=df['v_type_of_ownership'].replace({'franchise':'other organization'})

In [None]:
#Reemplazamos franchise por other organization
df['v_type_of_ownership'].value_counts()

**Revisando dataset nuevamente**

In [15]:
df.head()

NameError: name 'df' is not defined

In [None]:
df.dtypes

In [None]:
df.describe()

# Analisis Exploratorio de Datos 

### Distribución de puestos ofertados

In [None]:
my_bar_count(df,'v_job_title','Puestos ofertados','','cantidad')

In [None]:
my_pie_count(df,'v_job_title','Puestos ofertados')

### Distribución de la media de la estimación de salarios.

In [None]:
fig_salary_1 = my_histogram(df,'c_salary_mean',50,x_title='salario')
fig_salary_1.show()

### Distribución de salarios de los principales 3 títulos

In [None]:
data_jobs = ['data scientist','data engineer','data analyst']
aux = df[df['v_job_title'].isin(data_jobs)]
my_box(aux,'v_job_title','c_salary_mean','Distribucion del salario de principales 3 puestos',x_title='puesto',y_title='salario')

In [None]:
data_jobs = ['data scientist','data engineer','data analyst']
aux = df[df['v_job_title'].isin(data_jobs)]
fig_salary_2 = px.histogram(aux, x="c_salary_mean", color="v_job_title",marginal='box',
                  color_discrete_sequence=['#6faa9f','#344647','#779a7c'], 
                  )
fig_salary_2.update_layout(title_text='Distribución del salario de distintas ramas <br> de la ciencia de datos',
                  title_font_size=16,
                  title_font_color="#002020",
                  xaxis_title_text='saladio medio',
                  yaxis_title='conteo',
                  plot_bgcolor="rgb(208,208,2084)",
                  font=dict(
                     family="Courier New, monospace",size=13)
    )
fig_salary_2.show()

### Principales sectores involucrados en las ofertas de trabajo

In [16]:
my_bar_count(df,'v_sector','Distribución de sectores involucrados','sectores','cantidad')

NameError: name 'my_bar_count' is not defined

In [None]:
my_pie_count(df,'v_sector','Distribución de sectores involucrados')

### Distribución del tamaño de las compañías ofertando empleo

In [None]:
my_bar_count(df,'v_size','Distribución del tamaño de las compañías','tamaño','cantidad')

In [None]:
my_pie_count(df,'v_size','Distribución del tamaño de las compañías')

### Distribución del tipo de dueños

In [None]:
my_bar_count(df,'v_type_of_ownership','Tipo de compañía','tipo','cantidad')

In [None]:
my_pie_count(df,'v_type_of_ownership','Tipo de compañía')

### Estados que ofrecen trabajo

In [None]:
my_bar_count(df,'v_state','Estados que ofrecen trabajo','estado','cantidad')

In [17]:
my_pie_count(df,'v_state','Estados que ofrecen trabajo')

NameError: name 'my_pie_count' is not defined

### Las 10 ciudades con más ofertas 

In [None]:
fig = my_bar_count(df,'v_city')
fig.update_layout(title_text='Las 10 ciudades con más ofertas de trabajo',
                  title_font_size=16,
                  xaxis_title_text='ciudad',
                  xaxis=dict(range=[0,11]),
                  yaxis_title='conteo',
                 font=dict(
                     family="Courier New, monospace",size=13)
                 )
fig.show()

### Distribución de la ubicación de la sede de las compañías

In [None]:
fig = my_bar_count(df,'v_headquarters_city')
fig.update_layout(title_text='Las 10 ciudades con más sedes de empresas ofertando',
                  title_font_size=16,
                  xaxis_title_text='ciudad',
                  xaxis=dict(range=[0,11]),
                  yaxis_title='conteo',
                 font=dict(
                     family="Courier New, monospace",size=13)
                 )
fig.show()

In [None]:
fig = my_bar_count(df,'v_headquarters_state')
fig.update_layout(title_text='Los 10 estados con más sedes de empresas ofertando',
                  title_font_size=16,
                  xaxis_title_text='Estado',
                  xaxis=dict(range=[0,11]),
                  yaxis_title='conteo',
                 font=dict(
                     family="Courier New, monospace",size=13)
                 )
fig.show()

### Distribución de la calificación de las empresas

In [None]:
fig = my_histogram(df,'v_rating',5,'Calificación de las empresas','calificación','conteo')
fig.update_traces(xbins=dict(
        start=1.0,
        end=5.0,
        size=0.5
    ))

fig.show()

### Palabras más mencionadas en la descripción del trabajo

In [None]:
def text_clean(text, pattern="[^a-zA-Z0-9 ]"):
    text_clean = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    text_clean= re.sub(pattern, " ", text_clean.decode("utf-8"), flags=re.UNICODE)
    text_clean = u' '.join(text_clean.lower().split())
    return text_clean

In [None]:
corpus = " ".join(df['t_job_description'])

In [None]:
lista_stopwords = stopwords.words("english")
texto_clean=text_clean(" ".join(sample(corpus.split(),10000)))
fdist=FreqDist(texto_clean.split())
hapaxes=fdist.hapaxes()

In [18]:
texto_clean=" ".join(list(filter(lambda x:x not in lista_stopwords,texto_clean.split())))

NameError: name 'texto_clean' is not defined

In [None]:
texto_clean=" ".join(list(filter(lambda x:x not in hapaxes,texto_clean.split())))
fdist=FreqDist(texto_clean.split())

In [None]:
 with open("texto-graf.txt","w") as f:
        f.write(texto_clean)
path_texto="texto-graf.txt"
path_imagen="texto-graf.png"

stylecloud.gen_stylecloud(file_path = path_texto,output_name=path_imagen,icon_name="fas fa-address-book",
                          palette='colorbrewer.diverging.Spectral_11',background_color='black',gradient='horizontal')

Image.open(path_imagen)

### Empresas más comunes

In [None]:
corpus = " ".join(df['t_company_name'])

In [None]:
lista_stopwords = stopwords.words("english")
texto_clean=text_clean(" ".join(corpus.split()))
fdist=FreqDist(texto_clean.split())
hapaxes=fdist.hapaxes()+['inc','corporation','group','llc']
texto_clean=" ".join(list(filter(lambda x:x not in lista_stopwords,texto_clean.split())))
texto_clean=" ".join(list(filter(lambda x:x not in hapaxes,texto_clean.split())))
fdist=FreqDist(texto_clean.split())

In [None]:
 with open("texto-graf.txt","w") as f:
        f.write(texto_clean)
path_texto="texto-graf.txt"
path_imagen="texto-graf.png"

stylecloud.gen_stylecloud(file_path = path_texto,output_name=path_imagen,icon_name="fas fa-book-reader",
                          palette='colorbrewer.diverging.Spectral_11',background_color='black',gradient='horizontal')

Image.open(path_imagen)

### Correlación entre raiting y salario medio

In [None]:
df[["v_rating","c_salary_mean"]].corr().iplot(kind="heatmap",colorscale='spectral')

# Outliers

In [None]:
#Bucando outliers
feats=list(df.filter(like="c_").columns)
outliers=OUTLIERS(df,feats)
outliers

In [None]:
#Separandolos
indices=list(outliers[outliers["features"]=="c_salary_mean"]["indices"].values)[0]
aux=df[~df.index.isin(indices)]

In [None]:
#Comparando
#Distribución con outliers
df['c_salary_mean'].iplot(kind='box')

In [None]:
#Distribución sin outliers
aux['c_salary_mean'].iplot(kind='box')

El porcentaje de outliers es tan pequeño que no se nota.relevante
Sin embargo, por ser las ofertas mejor pagadas, su inclusión en el modelo podría resultar relevante.

In [19]:
#Observamos los valores outliers
df.iloc[indices]['c_salary_mean']

NameError: name 'df' is not defined

In [None]:
#Observando la diferencia entre los valores outliers y el valor del
#   tercer cuartil más 1.5 veces el rango intercuartílico. (Q3+1.5IQR)
q1 = df['c_salary_mean'].quantile(0.25)
q3 = df['c_salary_mean'].quantile(0.75)
diff = 225000.0 - (q3+1.5*(q3-q1))
diff

In [None]:
#La diferencia es tan sólo del 1.85% de la media del salario
2000.0/df['c_salary_mean'].mean()*100

In [None]:
#visulizando a los outliers
fig = my_bar_count(df,'c_salary_mean','Distribución del salario','salario','conteo')
fig.add_vline(x=(q3+1.5*(q3-q1)),
             line_dash='dot',
             annotation_text='Q3+1.5IQR',
             annotation_position='top right')
fig.show()

Los valores outliers son de gran interés, pues representan las ofertas de empleo con mejor paga y además sobrepasan el límite permitido por una cantidad pequeña, por lo que se decide conservar dichos outliers.

# Valores Ausentes

### valores unarios

In [None]:
#Eliminando la coumna de v_salary_estimate_source porque todos los 
#   datos provinen de glassdoor
df = df.drop(columns='v_salary_estimate_source')

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test=train_test_split(df,test_size=.2,random_state=0)

NameError: name 'df' is not defined

In [None]:
miss = completitud(X_train)
miss

**v_type_of_ownership**

In [None]:
#Se imputa la moda
X_train['v_type_of_ownership'],X_test['v_type_of_ownership'] = imputar_moda(df,'v_type_of_ownership',X_train,X_test)

In [None]:
X_train['v_type_of_ownership'].value_counts()

**v_size**

In [None]:
#No se puede imputar usando la moda
X_train['v_size'],X_test['v_size'] = imputar_moda(df,'v_size',X_train,X_test)

**v_headquarters_city**

In [None]:
#Se imputa la moda
X_train['v_headquarters_city'],X_test['v_headquarters_city'] = imputar_moda(df,'v_headquarters_city',X_train,X_test)

**v_headquarters_state**

In [21]:
#Se imputa la moda
X_train['v_headquarters_state'],X_test['v_headquarters_state'] = imputar_moda(df,'v_headquarters_state',X_train,X_test)

NameError: name 'imputar_moda' is not defined

**v_rating**

In [None]:
#No se puede imputar usando la moda
X_train['v_rating'],X_test['v_rating'] = imputar_moda(df,'v_rating',X_train,X_test)

In [None]:
#Imputamos como continua
imputar_continua(df,'v_rating')

In [None]:
imp = SimpleImputer(missing_values=np.nan,strategy="median")
imp.fit(X_train[['v_rating']])
X_train[['v_rating']]=imp.transform(X_train[['v_rating']])
X_test[['v_rating']]=imp.transform(X_test[['v_rating']])

**v_industry**

In [None]:
X_train['v_industry'],X_test['v_industry'] = imputar_moda(df,'v_industry',X_train,X_test)

In [None]:
X_train['v_sector'],X_test['v_sector'] = imputar_moda(df,'v_sector',X_train,X_test)

**A pesar de que unas variables no pasaron la prueba de chi cuadrada, se les imputa la moda, pues no podemos dejar dichos valores vacíos**

In [None]:
print(X_train['v_size'].mode())
print(X_train['v_industry'].mode())
print(X_train['v_sector'].mode())

In [None]:
X_train['v_size']=X_train['v_size'].fillna('10000 employees')
X_test['v_size']=X_test['v_size'].fillna('10000 employees')
X_train['v_industry']=X_train['v_industry'].fillna('it services')
X_test['v_industry']=X_test['v_industry'].fillna('it services')
X_train['v_sector']=X_train['v_sector'].fillna('information technology')
X_test['v_sector']=X_test['v_sector'].fillna('information technology')

In [None]:
completitud(X_train)

# Ingeniería de variables

**Debido a su importancia para agrupamiento y para el EDA, se crearon las siquientes columnas en secciones anteriores**  
- c_salary_mean  
- v_city, v_state  
- v_headquarters_city, v_headquarters_state

In [22]:
#Se revisa dimensión del df antes de la ingeniería de variables
print(X_train.shape)
X_test.shape

NameError: name 'X_train' is not defined

**Creamos la variable dummy v_big_city para indicar si la ciudad de la oferta está dentro de las 20 más grandes de EUA**

In [None]:
#Creamos una variable dummy para indicar si la ciudad está dentro de 
# las 20 ciudades mas pobladas de EUA
top_20_city = list(map(lambda city:city.lower(),
                ['New York','Los Angeles','Chicago','Houston','Phoenix',
                 'Philadelphia[','San Antonio','San Diego','Dallas','San Jose',
                 'Austin','Jacksonville','Fort Worth','Columbus','Charlotte',
                 'San Francisco','Indianapolis','Seattle','Denver','Washington']))

df['v_big_city'] = df['v_city'].isin(top_20_city).astype(int)
df['v_big_city'].value_counts()

## Categóricas

### One-Hot encoding / Dummies

In [23]:
feats = ['v_job_title','v_sector','v_size','v_state','v_type_of_ownership']
for col in feats:
    X_train=pd.get_dummies(X_train,columns=[col],prefix=col)
    X_test=pd.get_dummies(X_test,columns=[col],prefix=col)
X_train.head(1)

NameError: name 'X_train' is not defined

In [None]:
#Añadimos columnas que no estan presentes entre los sets.
miss_cols_test = set(X_train.columns)-set(X_test.columns)
for col in miss_cols_test:
    X_test[col]=0
#Asegurando mismo orden de columnas
X_test =X_test[X_train.columns]

### Target encoding

In [None]:
X_train["c_salary_mean"].value_counts()

In [None]:
#Codificamos dependiendo la relación entre la categoría y el objetivo
encoder = TargetEncoder()

- Para el caso del objetivo categórico: las características se reemplazan con una combinación de probabilidad posterior del objetivo dado un valor categórico particular y la probabilidad previa del objetivo sobre todos los datos de entrenamiento.

- Para el caso del objetivo continuo: las características se reemplazan con una combinación del valor esperado del objetivo dado un valor categórico particular y el valor esperado del objetivo sobre todos los datos de entrenamiento.

In [None]:
encoder.fit(X_train[['v_industry','v_city','v_headquarters_city','v_headquarters_state']],X_train['c_salary_mean'])

In [None]:
X_train[['v_industry','v_city','v_headquarters_city','v_headquarters_state']] = encoder.transform(X_train[['v_industry','v_city','v_headquarters_city','v_headquarters_state']])
X_test[['v_industry','v_city','v_headquarters_city','v_headquarters_state']] = encoder.transform(X_test[['v_industry','v_city','v_headquarters_city','v_headquarters_state']])

In [None]:
X_train.head(2)

## Texto // Count Vectorizer

**t_job_description**

In [24]:
#Eliminando stop words
stop_words = stopwords.words("english")
X_train["t_job_description"]=X_train["t_job_description"].map(lambda text:" ".join([x for x in text.split(" ") if x not in stop_words]))
X_test["t_job_description"]=X_test["t_job_description"].map(lambda text:" ".join([x for x in text.split(" ") if x not in stop_words]))

NameError: name 'X_train' is not defined

In [None]:
corpus_jd = " ".join(df['t_job_description'].values)

In [None]:
# La eliminación de apaches es prácticamente imposible, 
#   pues la lista es enorme, el tiempo de ejecución se extiende demasiado
fdist=FreqDist(corpus_jd.split())
hapaxes=fdist.hapaxes()
len(hapaxes)

No se eliminan hapaxes, pues después de todo, la vectorización se hace sólo para palabras con más del 15% de ocurrencias.

In [None]:
#Tokenizando
X_train["t_job_description"]=X_train["t_job_description"].map(lambda x:x.split())
X_test["t_job_description"]=X_test["t_job_description"].map(lambda x:x.split())

In [None]:
#Lematizando
#nltk.download('wordnet')
lem = nltk.stem.wordnet.WordNetLemmatizer()
X_train["t_job_description"]=X_train["t_job_description"].map(lambda text:[lem.lemmatize(word) for word in text])
X_test["t_job_description"]=X_test["t_job_description"].map(lambda text:[lem.lemmatize(word) for word in text])

In [None]:
#El modelo para vectorizar sólo usa palabras con 15% o más de frecuencia
vect = CountVectorizer(analyzer= 'word', min_df=0.15)
X_train["t_job_description"]=X_train["t_job_description"].map(lambda list:' '.join(word for word in list))
X_test["t_job_description"]=X_test["t_job_description"].map(lambda list:' '.join(word for word in list))
#Entrenando al modelo
vect.fit(X_train['t_job_description'])
#Ajustando ambos sets
array_train = vect.transform(X_train['t_job_description'])
array_test = vect.transform(X_test['t_job_description'])

In [None]:
#Incorporando al dataframe
aux_train=pd.DataFrame(array_train.toarray(),columns=vect.get_feature_names())
aux_test=pd.DataFrame(array_test.toarray(),columns=vect.get_feature_names())
X_train = X_train.reset_index()
X_test = X_test.reset_index()
X_train = pd.concat([X_train,aux_train],axis=1)
X_test = pd.concat([X_test,aux_test],axis=1)
X_train = X_train.set_index('index').sort_index()
X_test = X_test.set_index('index').sort_index()

In [None]:
X_train

**t_company_name**

In [None]:
#Eliminando stop words
stop_words = stopwords.words("english")
X_train['t_company_name']=X_train["t_company_name"].map(lambda text:" ".join([x for x in text.split(" ") if x not in stop_words]))
X_test['t_company_name']=X_test["t_company_name"].map(lambda text:" ".join([x for x in text.split(" ") if x not in stop_words]))

In [None]:
corpus_jd = " ".join(df['t_company_name'].values)

In [None]:
#Eliminando hapaxes
fdist=FreqDist(corpus_jd.split())
hapaxes=fdist.hapaxes()
X_train["t_company_name"]=X_train["t_company_name"].map(lambda text:" ".join([x for x in text.split(" ") if x not in hapaxes]))
X_test["t_company_name"]=X_test["t_company_name"].map(lambda text:" ".join([x for x in text.split(" ") if x not in hapaxes]))

In [None]:
#Tokenizando
X_train["t_company_name"]=X_train["t_company_name"].map(lambda x:x.split())
X_test["t_company_name"]=X_test["t_company_name"].map(lambda x:x.split())

In [None]:
#Lematizando
#nltk.download('wordnet')
lem = nltk.stem.wordnet.WordNetLemmatizer()
X_train["t_company_name"]=X_train["t_company_name"].map(lambda text:[lem.lemmatize(word) for word in text])
X_test["t_company_name"]=X_test["t_company_name"].map(lambda text:[lem.lemmatize(word) for word in text])

In [None]:
#El modelo vectoriza sólo palabras que se repiten 5%
vect = CountVectorizer(analyzer= 'word', min_df=0.05)
X_train["t_company_name"]=X_train["t_company_name"].map(lambda list:' '.join(word for word in list))
X_test["t_company_name"]=X_test["t_company_name"].map(lambda list:' '.join(word for word in list))
#Entrenando
vect.fit(X_train["t_company_name"])
#Ajustando
array_train = vect.transform(X_train["t_company_name"])
array_test = vect.transform(X_test["t_company_name"])

In [None]:
aux_train=pd.DataFrame(array_train.toarray(),columns=vect.get_feature_names())
aux_test=pd.DataFrame(array_test.toarray(),columns=vect.get_feature_names())

In [None]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()
X_train = pd.concat([X_train,aux_train],axis=1)
X_test = pd.concat([X_test,aux_test],axis=1)
X_train = X_train.set_index('index').sort_index()
X_test = X_test.set_index('index').sort_index()

In [None]:
# Se revisan dimensiones del df
print(X_train.shape)
X_test.shape

In [None]:
vect.get_feature_names()

# Reducción de variables

Ya que nuestra variable objetivo proviene del promedio de c_salary_minimum y c_salary_maximum, eliminaremos estas mismas.
Así mismo, borramos las variables de texto que ya se vectorizaron

In [25]:
X_train = X_train.drop(columns=['c_salary_minimum','c_salary_maximum','v_salary_estimate','t_job_description','t_company_name'])
X_test = X_test.drop(columns=['c_salary_minimum','c_salary_maximum','v_salary_estimate','t_job_description','t_company_name'])

NameError: name 'X_train' is not defined

### Filtro de alta correlación

In [None]:
X_train.columns

In [None]:
X_train.describe()

In [None]:
correlacion = abs(X_train[['v_rating', 'v_industry', 'c_salary_mean', 'v_city',
       'v_headquarters_city', 'v_headquarters_state']].corr(method="spearman"))

In [None]:
correlacion.iplot(kind="heatmap",colorscale="orrd",title="Matriz de Correlación")

In [None]:
for col in correlacion.columns:
    aux = correlacion[[col]][correlacion[[col]]>0.7].dropna()
    if len(aux)>1:
        display(aux)

In [None]:
col_drop = ['v_headquarters_city','v_sector_education','v_sector_government']

### Correlación con objetivo

In [None]:
correlacion=abs(X_train[['v_rating', 'v_industry', 'c_salary_mean', 'v_city',
       'v_headquarters_city', 'v_headquarters_state']].corr(method='spearman'))
low_corr = correlacion[correlacion["c_salary_mean"]<0.1][['c_salary_mean']].sort_values(by="c_salary_mean")
low_corr

In [None]:
#Eliminando columnas con correlacion menor a 0.1 con el objetivo
X_train = X_train.drop(columns=low_corr.index)
X_test = X_test.drop(columns=low_corr.index)

### Multicolinealidad

In [26]:
high_vif = calc_vif(X_train.drop(columns='c_salary_mean'))
high_vif = high_vif[high_vif['VIF']>10]
high_vif

NameError: name 'calc_vif' is not defined

In [None]:
#Eliminamos variables con VIF mayor a 10
X_train = X_train.drop(columns=high_vif['variables'].values)
X_test = X_test.drop(columns=high_vif['variables'].values)

# Tabla Final

In [None]:
#Renombrando al objetivo
X_train = X_train.rename(columns={'c_salary_mean':'tgt_salary_mean'})
X_test = X_test.rename(columns={'c_salary_mean':'tgt_salary_mean'})

In [None]:
X_train

In [None]:
X_train.describe()

In [None]:
X_test