# Pipeline

In [1]:
#Instalaciones

#Libreria para remplazar numeros por palabras
!pip install num2words
#Instalar  pandas profiler
#!pip install pandas-profiling==2.7.1
#deteccion de lenguaje para eliminar entradas que no esten en español
!pip install langdetect
#Procesamientno de lenguaje natural en español
!pip install stanza



In [2]:
#Imports para procesamiento de texto

#tokenizacion y lematizacion
import stanza
#Para integrar pasos de la limpieza adicionales
from stanza.pipeline.processor import Processor, register_processor
#paquete español
stanza.download('es')

#Para manejo de numeros, singluares, plurarles en lenguaje
from num2words import num2words
#Deteccion de lenguaje
from langdetect import detect
# librería Natural Language Toolkit, usada para trabajar con textos
import nltk
# Punkt permite separar un texto en frases.
nltk.download('stopwords')
from nltk.corpus import stopwords

#Operaciones con expresiones regulares y unicode
import re, string, unicodedata

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-18 17:42:16 INFO: Downloaded file to C:\Users\zeify\stanza_resources\resources.json
2024-04-18 17:42:16 INFO: Downloading default packages for language: es (Spanish) ...
2024-04-18 17:42:18 INFO: File exists: C:\Users\zeify\stanza_resources\es\default.zip
2024-04-18 17:42:24 INFO: Finished downloading models and saved to C:\Users\zeify\stanza_resources
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zeify\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#Imports generales para analisis de datos y ML
import pandas as pd
import numpy as np
import sys
from pandas_profiling import ProfileReport
import statistics

import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

  from pandas_profiling import ProfileReport


## 1. Lectura de los datos

In [4]:
# Lectura de los datos.
df_turi = pd.read_csv('tipo2_entrenamiento_estudiantes.csv', sep=',', encoding='utf-8')

df_turi.head()

Unnamed: 0,Review,Class
0,Muy buena atención y aclaración de dudas por p...,5
1,Buen hotel si están obligados a estar cerca de...,3
2,"Es un lugar muy lindo para fotografías, visite...",5
3,Abusados con la factura de alimentos siempre s...,3
4,Tuvimos un par de personas en el grupo que rea...,3


## 2. Construcción del pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def init(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X_processed = self.customPreprocessing(X)
        #Retornar los datos
        return X_processed
    
    #Remplaza los numeros por su representacion en palabras
    def replace_numbers(self, words):
        """Replace all interger occurrences in list of tokenized words with textual representation"""
        new_words = []
        for word in words:
            if word.isdigit():
                new_word = num2words(word, lang='es')
                new_words.append(new_word)
            else:
                new_words.append(word)
        return new_words
    #Remueve todo caracter no latino (conserva espacios y numeros)
    def remove_nonlatin(self, words):
      new_words = []
      for word in words:
        new_word = ''
        for ch in word:
          if unicodedata.name(ch).startswith(('LATIN', 'DIGIT', 'SPACE')):
            new_word += ch
        new_words.append(new_word)
      return new_words

    #Remueve palabras comunes que no aportan informacion
    def remove_stopwords(self, words):
        """Remove stop words from list of tokenized words"""
        new_words = []
        s = set(stopwords.words('spanish'))
        for word in words:
            if word not in s:
                new_words.append(word)
        return new_words

    #Remueve puntuacion
    def remove_punctuation(self, words):
        """Remove punctuation from list of tokenized words"""
        new_words = ''
        for word in words:
                new_words += re.sub(r'[^\w\s]', ' ', word)
        return new_words

     #Procesamiento de cada review usando stanza
    def tokenLemma(self, data):
      data['words'] = data['Review'].apply(self.remove_punctuation)
      #Creamos un pipeline para tokenizacion y lematizacion
      nlp = stanza.Pipeline('es', processors = 'tokenize,mwt,pos,lemma', use_gpu=True)
      in_docs = [stanza.Document([], text=d) for d in data.words]
      return nlp(in_docs)

    #Funcion secundaria para procesar cada token
    def procesamientoPalabras(self, words):
        words = self.remove_nonlatin(words)
        words = self.replace_numbers(words)
        words = self.remove_stopwords(words)
        return words

    #Funcion principal para el pre-procesamiento
    def customPreprocessing(self, data):
        out_docs = self.tokenLemma(data)
        palabras = []

        for doc in out_docs:
            reviewAct = []
            for sentence in doc.sentences:
              for word in sentence.words:
                if(word.pos != 'PUNCT' and word.pos != 'SYM'):
                  reviewAct.append(word.lemma.lower())
            palabras.append(reviewAct)
        
        data['words'] = palabras
        data['words'] = data['words'].apply(self.procesamientoPalabras)
        return data


In [7]:
from matplotlib import style
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import RepeatedKFold

In [8]:
class CustomRegression(BaseEstimator, TransformerMixin):
    def init(self):
        self.model = None
        self.params = None
        self.accuracy = None
        self.vec = None
        
    def fit(self, X, y=None):
                
        X['words'] = X['words'].apply(lambda x: ' '.join(map(str, x)))
        
        #Separación de los datos en conjunto de test y train
        X = X.drop('Review', axis = 1)
        df_train, df_test = sklearn.model_selection.train_test_split(X, test_size=0.2, random_state=0)

        X_train = df_train['words']
        y_train = df_train['Class']

        X_test = df_test['words']
        y_test = df_test['Class']
        
        #Vectorizar los datos con Tfid
        vectorizer = TfidfVectorizer()
        train_vectors = vectorizer.fit_transform(X_train)
        test_vectors = vectorizer.transform(X_test)
        
        self.vec = vectorizer
        
        parameters = {
            'penalty' : ['l1','l2', 'elasticnet', None], 
            'C'       : np.logspace(-10,10,3),
            'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
        }
        
        metrica = RepeatedKFold(n_splits=20, n_repeats= 10, random_state=0)

        logreg = LogisticRegression()
        modelo = GridSearchCV(logreg, param_grid = parameters, scoring='accuracy', cv=metrica, n_jobs=-1)  
        
        modelo.fit(train_vectors,y_train)
        self.params = modelo.best_params_
        self.accuracy = modelo.best_score_
        modelo_optimo = modelo.best_estimator_
                
        self.model = modelo_optimo
        
        return self

    def transform(self, X):        

        return X

In [9]:
custom_preprocessor = CustomPreprocessor()
custom_regression = CustomRegression()
pipeline = Pipeline(
    [
        ("processing", custom_preprocessor),
        ("model", custom_regression)
    ]
)

In [10]:
# Ajusta el modelo en tus datos transformados
pipeline.fit(df_turi)

2024-04-18 18:18:43 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-18 18:18:43 INFO: Downloaded file to C:\Users\zeify\stanza_resources\resources.json
2024-04-18 18:18:44 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-18 18:18:44 INFO: Using device: cpu
2024-04-18 18:18:44 INFO: Loading: tokenize
2024-04-18 18:18:50 INFO: Loading: mwt
2024-04-18 18:18:50 INFO: Loading: pos
2024-04-18 18:18:51 INFO: Loading: lemma
2024-04-18 18:18:51 INFO: Done loading processors!
3600 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error

In [11]:
reg = pipeline["model"].model
vectorizer = pipeline["model"].vec

## 3. Testear el funcionamiento del pipeline

In [12]:
#Se cargan los datos no etiquetados
df_test = pd.read_csv("df_procesado_predecir.csv", sep = ";", encoding='utf-8')
import ast
df_test['words'] = df_test['words'].apply(ast.literal_eval)

In [13]:
#Convertimos los tokens nuevamente en strings
df_test['words'] = df_test['words'].apply(lambda x: ' '.join(map(str, x)))

In [14]:
#Separar datos para predecir
df_test = df_test.drop('Review', axis = 1)

X_predict = df_test['words']

In [15]:
X_predict

0       primero noche encontrar habitación nido cinco ...
1       calle catedral platillo tradicional tipo gourm...
2       porción miserable agua sabor cloro distraído m...
3       cartagena encantar ciudad colonial haber visit...
4       ir ilusion disfrutar espectaculo luz sonido ve...
                              ...                        
1745    subir funicular bajar teleferico ser buen expe...
1746    gente esperar lugar central habana fiesta naci...
1747    excelente hotel alberca niño mejor atención me...
1748    detener bocado puesta sol haber nadie restaura...
1749    tener agua mantenimiento desastre mochila baño...
Name: words, Length: 1750, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
#Vectorizar los datos con Tfid
vectors = vectorizer.transform(X_predict)

In [18]:
#Predecir los datos y guardarlos en el archivo

predict = reg.predict(vectors)
predict

array([1, 5, 2, ..., 5, 3, 1], dtype=int64)

## 4. Persistencia del modelo

In [19]:
from joblib import dump, load

In [20]:
filename = "ModeloReview.joblib"
dump(pipeline, filename)

['ModeloReview.joblib']