In [None]:
# Libraries used throughout the notebook, followed by one-time notebook setup
# (pyLDAvis notebook rendering, PyCaret version report, warning filter).
import os
import re
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix, precision_score, recall_score, f1_score

import plotly.express as px  # Interactive charts
import matplotlib.pyplot as plt
from matplotlib import style  # Switch the visual style of matplotlib figures

import seaborn as sns
import mlflow
import mlflow.sklearn

import nltk  # Natural language processing library
from nltk.util import ngrams  # Build n-grams

import pycaret
# NOTE(review): the two star imports are kept because later cells may rely on
# the names they inject. pycaret.classification and pycaret.nlp appear to
# export functions with the same names (e.g. setup, create_model); if so, the
# pycaret.nlp versions (imported last) are the ones bound afterwards. Confirm
# this is intended, or switch to explicit/aliased imports.
from pycaret.classification import *  # PyCaret module for classification problems
from pycaret.utils import version  # Reports the installed PyCaret version
from pycaret.nlp import *  # PyCaret module for text processing

import spacy  # Natural language processing
from spacy.lang.en.examples import sentences  # Example sentences in English (from spacy.lang.en), handy for quick tests
from spacy.lang.en.stop_words import STOP_WORDS  # English stop words from spaCy
from nltk.corpus import stopwords  # Stop-word lists from NLTK

import pyLDAvis  # Topic-model visualisation
#import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()  # Render pyLDAvis visualisations inline in the notebook

# version() is not the last expression of the cell, so Jupyter would not echo
# its return value; print it explicitly. The None guard covers releases where
# version() prints by itself and returns nothing.
pycaret_version = version()
if pycaret_version is not None:
    print(f"PyCaret version: {pycaret_version}")

import sys

# Silence warnings to keep the cell outputs readable, unless the interpreter
# was started with explicit warning options (-W / PYTHONWARNINGS).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


Hubo que instalar lo siguiente:
* !pip install pycaret
* !python -m spacy download en_core_web_sm
* !python -m spacy download es_core_news_sm
* !pip install pyLDAvis==3.2.2

## 1. Web scraping
This endpoint will receive web scraping as input:
- Open web page
- Read the content
- Extract content
- Save all in a data frame

Por el momento, el web scraping se realizará con un archivo previamente descargado de [Kaggle](https://www.kaggle.com/datasets/davidgauthier/glassdoor-job-reviews), por lo que lo cargaremos a continuación.

In [2]:
# Load the Glassdoor reviews CSV that the rest of the notebook works with.
# The location can be overridden with the GLASSDOOR_REVIEWS_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local path is used.
file_path = os.environ.get(
    "GLASSDOOR_REVIEWS_CSV",
    "C:/Users/palom/OneDrive/Dokumen/CUCEA/2DO_SEM/Challenges-Progra2/Challenge_2/data/glassdoor_reviews.csv",
)
df = pd.read_csv(file_path)
# Show the first rows to inspect the structure of the file
df.head()

Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,5,2.0,3.0,,2.0,2.0,3.0,x,o,r,Over promised under delivered,Nice staff to work with,No career progression and salary is poor
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."


In [3]:
# Report the dataframe dimensions as a (rows, columns) tuple
df_shape = df.shape
print(f"Dimensiones del dataframe: {df_shape}")

# Per-column summary: dtype and non-null count
df.info()

Dimensiones del dataframe: (838566, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838566 entries, 0 to 838565
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   firm                 838566 non-null  object 
 1   date_review          838566 non-null  object 
 2   job_title            838566 non-null  object 
 3   current              838566 non-null  object 
 4   location             541223 non-null  object 
 5   overall_rating       838566 non-null  int64  
 6   work_life_balance    688672 non-null  float64
 7   culture_values       647193 non-null  float64
 8   diversity_inclusion  136066 non-null  float64
 9   career_opp           691065 non-null  float64
 10  comp_benefits        688484 non-null  float64
 11  senior_mgmt          682690 non-null  float64
 12  recommend            838566 non-null  object 
 13  ceo_approv           838566 non-null  object 
 14  outlook              838566 

Se observa que, de las 18 variables, 11 son de tipo object, y 4 de ellas contienen registros vacíos (location, headline, pros, cons).
Dado esto, se creará un nuevo dataframe solo con las variables de tipo _object_, y será a este al que se le aplicará el NLP.

In [4]:
# Keep only the object-typed columns of df in a separate dataframe
df_text = df.select_dtypes(include=["object"])

# Preview the first records to check the filtering
df_text.head(5)

Unnamed: 0,firm,date_review,job_title,current,location,recommend,ceo_approv,outlook,headline,pros,cons
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,x,o,r,Over promised under delivered,Nice staff to work with,No career progression and salary is poor
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."


## 2. Model to classify and perform sentiment analysis
This part will contain 3 sections:
* Text preprocessing
* Classification proposed
* Extraction of main features

In [None]:
#### TEXT PREPROCESSING
# TODO: placeholder cell; no preprocessing code has been written here yet.


## 3. Create a pipeline for MLOps