# NLP model

## Importing main libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/usuario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data preprocessing

In [2]:
# Read the reviews workbook into a DataFrame; the path is relative to the
# current working directory.
df_feelings = pd.read_excel('reviews_claro.xlsx')

In [3]:
# Keep only the three columns used in this notebook, drop rows with missing
# values, renumber the index and rename to 'date' / 'review' / 'score'.
# Selecting columns with a list already yields a new frame, so df_feelings
# itself is left untouched.
df = (
    df_feelings[['at', 'content', 'score']]
    .dropna(axis=0)
    .reset_index(drop=True)
    .rename(columns={'at': 'date', 'content': 'review'})
)
# Store dates as 'YYYY-MM-DD' strings.
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True).dt.strftime('%Y-%m-%d')

### Data tokenization

In [4]:
def tokenization(df):
    """Turn the ``review`` column of ``df`` into a list of cleaned, stemmed strings.

    Each review has every character that is not a letter (ASCII or in the
    ``À-ÿ`` range) replaced by a space, is lower-cased, split on whitespace,
    filtered against NLTK's Spanish stop-word list and stemmed with
    ``PorterStemmer``. The words 'sí', 'no' and 'ni' are deliberately not
    treated as stop words (presumably because they carry polarity).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column. Rows are processed in positional
        order, whatever the index labels are. Non-string cells are converted
        with ``str()`` before cleaning.

    Returns
    -------
    list of str
        One space-joined string of stems per row of ``df`` (an empty list
        for an empty frame).
    """
    # Loop-invariant objects are built once per call instead of once per
    # review (and, for the stop-word set, once per word).
    # NOTE(review): PorterStemmer implements the English Porter algorithm;
    # NLTK's SnowballStemmer('spanish') may suit this corpus better, but
    # switching changes the features and requires retraining — confirm.
    ps = PorterStemmer()
    kept_words = {'sí', 'no', 'ni'}
    all_stopwords = set(stopwords.words('spanish')) - kept_words

    corpus = []
    # Iterate over the values rather than indexing by label, so a frame
    # whose index is not 0..n-1 (e.g. after filtering) does not raise KeyError.
    for raw_review in df['review']:
        # str() guards against non-text cells such as numbers read from a spreadsheet.
        letters_only = re.sub('[^a-zA-ZÀ-ÿ]', ' ', str(raw_review))
        words = letters_only.lower().split()
        stems = [ps.stem(word) for word in words if word not in all_stopwords]
        corpus.append(' '.join(stems))
    return corpus

## Model training

In [13]:
# Build the cleaned corpus (one string per review) from the preprocessed frame.
dataset = tokenization(df)
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts with the vocabulary capped at 4000 terms.
cv = CountVectorizer(max_features=4000)
# NOTE(review): .toarray() turns the sparse count matrix into a dense array,
# which can be very memory-hungry for a large corpus — confirm it is needed.
X = cv.fit_transform(dataset).toarray()
# Target vector: the review scores.
y = df['score'].values

In [71]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=0 keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [72]:
# A random forest classifier is used because it is simple to apply.
from sklearn.ensemble import RandomForestClassifier
# 10 trees with a fixed seed so repeated fits give the same model.
RF = RandomForestClassifier(n_estimators = 10, random_state=0)
RF.fit(X_train, y_train)
# Predicted scores for the held-out split.
y_pred = RF.predict(X_test)

In [73]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the true labels (y_test) against the predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the accuracy is displayed as the cell output.
accuracy_score(y_test, y_pred) # 0.751 on the recorded run

[[ 5411   100   131    45   370]
 [  832    53    79    23   147]
 [  557    54   116    54   427]
 [  247    30    91   123  1533]
 [  338    25    73   196 10449]]


0.7511160714285714

In [74]:
import pickle

# Persist the fitted vectorizer and classifier for reuse outside the notebook.
# The `with` blocks guarantee each file is flushed and closed even if
# pickle.dump raises; a bare open(...) passed inline is never closed explicitly.
with open("count_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open("RF_nlp.pickle", "wb") as model_file:
    pickle.dump(RF, model_file)

In [59]:
# Inspect the reviews that were given a score of 2.
df[df['score']==2]

Unnamed: 0,date,review,score
58,2022-05-17,"Según mi plan es de 150 mb y no llega ni a 60,...",2
70,2022-05-17,NO TENGO RESPUESTA CLARA DE MIS FACTURAS,2
77,2022-05-17,Me pide que libere espació y no puedo quitar l...,2
91,2022-05-16,Jxmmm en u Kik uocnm mi oficina k. Kun nn el,2
93,2022-05-16,Un poco malisima,2
...,...,...,...
107451,2017-11-17,No me puedo ni registrar hahahahaha que chafa 😂,2
107493,2017-11-16,No me gusto porque en la app anterior gestiona...,2
107495,2017-11-16,"Por que son abusivos , en la anterior invadía ...",2
107500,2017-11-16,"Se ve bien, pero : solo da información de un s...",2


In [87]:
from pysentimiento import create_analyzer

In [89]:
# Create a pysentimiento analyzer for the sentiment task on Spanish text.
analyzer = create_analyzer(task="sentiment", lang="es")

loading file https://huggingface.co/pysentimiento/robertuito-sentiment-analysis/resolve/main/tokenizer.json from cache at /Users/usuario/.cache/huggingface/transformers/47dd2d3180a6186d30715516321375322e3a84d5e4656762e083091bbb5d5dc4.0843b07596b388e054bae078721182b4846b9e28a7bbf04d7079b274f8613ae3
loading file https://huggingface.co/pysentimiento/robertuito-sentiment-analysis/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/pysentimiento/robertuito-sentiment-analysis/resolve/main/special_tokens_map.json from cache at /Users/usuario/.cache/huggingface/transformers/25e0e805456d2786a12b70b86278c6e839d19958cb4f541ee1f78621140098f7.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8
loading file https://huggingface.co/pysentimiento/robertuito-sentiment-analysis/resolve/main/tokenizer_config.json from cache at /Users/usuario/.cache/huggingface/transformers/4fdf9a5a8e0a6023e1a9cdef62921158c3db9545e73f8dc0f46340a21bbb64d5.50a2bcf7668df2ff5a82b7

In [86]:
# Quick sanity check of the analyzer on a single sentence.
analyzer.predict('Nutritivo y gordo')

AnalyzerOutput(output=NEU, probas={NEU: 0.977, POS: 0.020, NEG: 0.004})

In [None]:
# Analyzer labels mapped to the Spanish names stored in the dataset; any label
# other than POS/NEG falls back to 'neutro'.
SENTIMENT_LABELS = {'POS': 'positivo', 'NEG': 'negativo'}

# `corpus` only exists as a local inside tokenization(); at notebook level the
# tokenized reviews are bound to `dataset`, so use that name to avoid a
# NameError on a fresh kernel.
# NOTE(review): the analyzer may give better results on the raw text
# (df['review'].tolist()) than on stemmed tokens — confirm which was intended.
pred_sentimientos = analyzer.predict(dataset)
output_sent = [pred.output for pred in pred_sentimientos]
vec_sent = [SENTIMENT_LABELS.get(label, 'neutro') for label in output_sent]

In [188]:
# Attach the mapped sentiment labels as a new column; vec_sent needs one
# entry per row of df for the assignment to succeed.
df['feeling'] = vec_sent

In [205]:
# Export the labelled reviews to CSV without writing the index column.
df.to_csv('db.csv', index=False)

In [67]:
import sys
# Install a pinned scikit-learn using the interpreter that runs this kernel
# (sys.executable), rather than whichever pip happens to be first on PATH.
# NOTE(review): a kernel restart is presumably needed before the new version
# is picked up — confirm the model above was trained and pickled with 1.1.1.
!{sys.executable} -m pip install scikit-learn==1.1.1

Collecting scikit-learn==1.1.1
  Using cached scikit_learn-1.1.1-cp38-cp38-macosx_10_13_x86_64.whl (8.5 MB)
Collecting joblib>=1.0.0
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 0.16.0
    Uninstalling joblib-0.16.0:
      Successfully uninstalled joblib-0.16.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.1
    Uninstalling scikit-learn-0.23.1:
      Successfully uninstalled scikit-learn-0.23.1
Successfully installed joblib-1.1.0 scikit-learn-1.1.1


In [54]:
from typing_extensions import ParamSpec


In [56]:
from platform import python_version
# Record the interpreter version used for this run.
print(python_version())

3.8.3
