# **0. Importación de librerías y carga de datos**

> ## ***Librerías***

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer # Modelado
from sklearn.model_selection import train_test_split # Modelado
from sklearn.model_selection import GridSearchCV # Modelado, hiperparámetros
from sklearn.linear_model import LogisticRegression # Modelo
from sklearn.ensemble import RandomForestClassifier # Modelo
from sklearn.metrics import accuracy_score, classification_report # Reporte

from joblib import dump # Para descarga del modelo
from scipy import sparse # Para descarga de datos 'spliteados'

> ## ***Carga del corpus***

In [2]:
corpus_reviews = pd.read_csv('./preprocessed_data_file.csv') # Archivo descargado del notebook 2
corpus_reviews.sample(5)

Unnamed: 0,overall,reviewText,preprocessed_review
54734,4.0,"Its a pretty nice case, looks really good but ...",pretty nice case look really good make phone m...
40398,4.0,no problems,problem
14677,1.0,lost for words.,lost word
63248,5.0,Best cord I've ever bought and I definitely ne...,best cord ever bought definitely needed extra ...
32801,2.0,The nano towel was damp. Like it was dried out...,nano towel damp like dried installed per instr...


# **1. Bag of Words**

> ## ***Conversión a un problema binario***

In [3]:
corpus_reviews['overall'] = corpus_reviews['overall'].apply(lambda x: 1 if x > 3 else 0)

corpus_reviews['overall'].value_counts()

0    40000
1    40000
Name: overall, dtype: int64

In [4]:
corpus_reviews.isnull().sum()

overall                 0
reviewText              0
preprocessed_review    89
dtype: int64

In [5]:
corpus_reviews.dropna(inplace = True)

> ## ***Vectorización***

In [6]:
vectorizer = CountVectorizer(max_features = 2000, # Para reducir en gran medida la dimensionalidad del problema, enfocándome en las palabras más repetidas
                             ngram_range = (1,3), # Para incluir unigramas, bigramas y trigramas ya que vimos que todos estos son importantes
                             max_df = 0.8,        # Para tener un máximo de una frecuencia del 80% de apariciones
                             min_df = 0.015       # Si está en menos del 1.5% de veces en el corpus (), tampoco se tiene en cuenta
                             )

In [7]:
X = vectorizer.fit_transform(corpus_reviews['preprocessed_review'])
y = corpus_reviews['overall']

# **2. Modelado**

> ## ***División train-test***

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True)

## **2.1 Primer modelo**

In [9]:
# Definición de hiperparámetros para explorar
pg_logistic = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'penalty': ['l1', 'l2', 'elasticnet'],
                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
                       }

# Configuración del GridSearch
gs_logistic = GridSearchCV(LogisticRegression(max_iter = 200),
                           param_grid = pg_logistic,
                           cv = 3,
                           scoring = 'accuracy',
                           verbose = 2)

# Compilación del modelo
gs_logistic.fit(X_train, y_train)

# Mejores hiperparámetros
print("Mejores hiperparámetros para regresor logístico:", gs_logistic.best_params_)

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..................C=0.001, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..............C=0.001, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ....................C=0.001, penalty=l1, solver=sag; total time=   0.0s
[CV] END ....................C=0.001, penalty=l1, solver=sag; total time=   0.0s
[CV] END ....................C=0.001, penalty=l



[CV] END .....................C=0.1, penalty=l1, solver=saga; total time=  17.0s




[CV] END .....................C=0.1, penalty=l1, solver=saga; total time=  16.2s




[CV] END .....................C=0.1, penalty=l1, solver=saga; total time=  16.7s
[CV] END ................C=0.1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ................C=0.1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ................C=0.1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ....................C=0.1, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.4s




[CV] END ......................C=0.1, penalty=l2, solver=sag; total time=   6.1s
[CV] END ......................C=0.1, penalty=l2, solver=sag; total time=   5.2s




[CV] END ......................C=0.1, penalty=l2, solver=sag; total time=   5.5s




[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   5.6s




[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   4.5s




[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   6.2s
[CV] END ........C=0.1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ........C=0.1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ........C=0.1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ............C=0.1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ............C=0.1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ............C=0.1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ........C=0.1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ........C=0.1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ........C=0.1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ..............C=0.1, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ..............C=0.1, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ..............C=0.1



[CV] END .......................C=1, penalty=l1, solver=saga; total time=  18.7s




[CV] END .......................C=1, penalty=l1, solver=saga; total time=  19.9s




[CV] END .......................C=1, penalty=l1, solver=saga; total time=  18.2s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.6s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.6s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   0.5s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   0.6s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   0.4s




[CV] END ........................C=1, penalty=l2, solver=sag; total time=   4.7s




[CV] END ........................C=1, penalty=l2, solver=sag; total time=   4.9s




[CV] END ........................C=1, penalty=l2, solver=sag; total time=   5.7s




[CV] END .......................C=1, penalty=l2, solver=saga; total time=   5.4s




[CV] END .......................C=1, penalty=l2, solver=saga; total time=   6.0s




[CV] END .......................C=1, penalty=l2, solver=saga; total time=   5.0s
[CV] END ..........C=1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ..........C=1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ..........C=1, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ..............C=1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ..............C=1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ..............C=1, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ..........C=1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ..........C=1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ..........C=1, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ................C=1, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ................C=1, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ................C=1



[CV] END ......................C=10, penalty=l1, solver=saga; total time=  20.2s




[CV] END ......................C=10, penalty=l1, solver=saga; total time=  19.6s




[CV] END ......................C=10, penalty=l1, solver=saga; total time=  19.3s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   0.8s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.3s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.4s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=   5.6s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=   5.7s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=   5.3s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=   5.0s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=   5.0s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=   5.8s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ...............C=10, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ...............C=10, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ...............C=10



[CV] END .....................C=100, penalty=l1, solver=saga; total time=  20.2s




[CV] END .....................C=100, penalty=l1, solver=saga; total time=  22.5s




[CV] END .....................C=100, penalty=l1, solver=saga; total time=  20.7s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.8s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.4s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   0.4s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   0.3s




[CV] END ......................C=100, penalty=l2, solver=sag; total time=   6.0s




[CV] END ......................C=100, penalty=l2, solver=sag; total time=   5.0s




[CV] END ......................C=100, penalty=l2, solver=sag; total time=   4.7s




[CV] END .....................C=100, penalty=l2, solver=saga; total time=   5.8s




[CV] END .....................C=100, penalty=l2, solver=saga; total time=   5.0s


144 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------

[CV] END .....................C=100, penalty=l2, solver=saga; total time=   5.9s
[CV] END ........C=100, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ........C=100, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ........C=100, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ............C=100, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ............C=100, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ............C=100, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END ........C=100, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ........C=100, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ........C=100, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV] END ..............C=100, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ..............C=100, penalty=elasticnet, solver=sag; total time=   0.0s
[CV] END ..............C=100



In [10]:
logistic_model = LogisticRegression(C = gs_logistic.best_params_['C'],
                                    solver = gs_logistic.best_params_['solver'],
                                    penalty = gs_logistic.best_params_['penalty'],
                                    max_iter = 300
                                    )

logistic_model.fit(X_train, y_train)

test_predict = logistic_model.predict(X_test)

print('Precisión del modelo Logistic Regression en Test: {}'.format(accuracy_score(y_test, test_predict)))

Precisión del modelo Logistic Regression en Test: 0.7945942563974223




## **2.2 Segundo modelo**

In [11]:
# Definición de hiperparámetros para explorar
pg_rf = {'n_estimators': [50, 100, 200],
         'max_depth': [10, 15, 20]
         }

# Configuración del GridSearch
gs_rf = GridSearchCV(RandomForestClassifier(),
                     param_grid = pg_rf,
                     cv = 3,
                     scoring = 'accuracy',
                     verbose = 2
                     )

# Compilación del modelo
gs_rf.fit(X_train, y_train)

# Mejores hiperparámetros
print("Mejores hiperparámetros para Random Forest:", gs_rf.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.6s
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.6s
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.6s
[CV] END .....................max_depth=10, n_estimators=100; total time=   3.1s
[CV] END .....................max_depth=10, n_estimators=100; total time=   4.0s
[CV] END .....................max_depth=10, n_estimators=100; total time=   3.1s
[CV] END .....................max_depth=10, n_estimators=200; total time=   5.9s
[CV] END .....................max_depth=10, n_estimators=200; total time=   7.0s
[CV] END .....................max_depth=10, n_estimators=200; total time=   6.0s
[CV] END ......................max_depth=15, n_estimators=50; total time=   4.0s
[CV] END ......................max_depth=15, n_estimators=50; total time=   3.1s
[CV] END ......................max_depth=15, n_es

In [12]:
forest_model = RandomForestClassifier(n_estimators = gs_rf.best_params_['n_estimators'],
                                      max_depth = gs_rf.best_params_['max_depth']
                                      )

forest_model.fit(X_train, y_train)

test_predict_f = forest_model.predict(X_test)

print('Precisión del modelo Random Forest Classifier en Test: {}'.format(accuracy_score(y_test, test_predict_f)))

Precisión del modelo Random Forest Classifier en Test: 0.7812050303447413


# **3. Comparación y elección de modelos**

In [13]:
# Regresión Logística
print("Regresión Logística:")
print(classification_report(y_test, test_predict))

# Random Forest
print("Random Forest:")
print(classification_report(y_test, test_predict_f))

Regresión Logística:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      7983
           1       0.81      0.77      0.79      8000

    accuracy                           0.79     15983
   macro avg       0.80      0.79      0.79     15983
weighted avg       0.80      0.79      0.79     15983

Random Forest:
              precision    recall  f1-score   support

           0       0.75      0.85      0.79      7983
           1       0.83      0.71      0.77      8000

    accuracy                           0.78     15983
   macro avg       0.79      0.78      0.78     15983
weighted avg       0.79      0.78      0.78     15983



Luego de analizar las métricas, concluyo que la regresión logística es superior en la clasificación general comparado con el Random Forest. A pesar que el RF no se queda atrás en rendimiento tampoco, muestra ciertas limitaciones para clasificar las muestras negativas.

Además hay un elemento que no se puede perder de vista y es el costo computacional. La regresión logística es más eficiente en término de recursos que el RF, esto es también un determinante sabiendo que no dieron resultados muy diferentes en la precisión, pero sí en tiempo de ejecución y gasto de recursos. El modelo de regresión logística compila entre 8 y 10 segundos; El modelo de Random Forest compila entre 45 y 50 segundos.

Ayudas usadas:

[joblib](https://www.datasmarts.net/como-guardar-y-cargar-modelos-de-machine-learning-en-scikit-learn/)

[scipy](https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format)

In [14]:
# Se descargan los datos y el modelo para su uso posterior

sparse.save_npz("X_train.npz", X_train)
sparse.save_npz("X_test.npz", X_test)

y_train.to_csv('y_train.csv', index = False)
y_test.to_csv('y_test.csv', index = False)

dump(logistic_model, 'model.joblib')

['model.joblib']