# NLP model

## Importing libraries

In [207]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

from pysentimiento import create_analyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/usuario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data preprocessing

In [208]:
# Load the raw reviews export; df_feelings is kept untouched as the original.
df_feelings = pd.read_excel('db_reviews_claro.xlsx')

# Working frame: keep only the three columns used below, drop rows with any
# missing value, renumber the index from 0, and give the columns clearer names.
df = (
    df_feelings[['at', 'content', 'score']]
    .dropna(axis=0)
    .reset_index(drop=True)
    .rename(columns={'at': 'date', 'content': 'review'})
)

# Store dates as 'YYYY-MM-DD' strings. `infer_datetime_format` is not passed:
# it is deprecated since pandas 2.0 (format inference is now the default) and
# only triggers a warning.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

In [209]:
def data_preprocessing(df):
    """Clean the reviews, vectorize them and persist the fitted vectorizer.

    Each entry of ``df['review']`` has every character outside
    ``[a-zA-ZÀ-ÿ]`` replaced by a space, is lower-cased, split on whitespace,
    filtered against the NLTK Spanish stop-word list (keeping 'sí', 'no' and
    'ni') and stemmed with ``PorterStemmer``. The cleaned texts are then
    turned into a bag-of-words matrix limited to 4000 features.

    Args:
        df: DataFrame with a 'review' column (text) and a 'score' column.

    Returns:
        A tuple ``(corpus, X, y)``: the list of cleaned review strings, the
        dense count matrix produced by ``CountVectorizer`` and the values of
        ``df['score']``.

    Side effects:
        Writes the fitted ``CountVectorizer`` to "count_vectorizer.pickle"
        in the current working directory.
    """
    # Loop invariants, built once instead of per review / per word.
    # NOTE(review): PorterStemmer implements the English Porter algorithm;
    # confirm whether a Spanish stemmer was intended for these reviews.
    stemmer = PorterStemmer()
    # 'sí', 'no' and 'ni' are kept out of the stop-word set so that they
    # survive the filtering step below.
    stop_words = set(stopwords.words('spanish')) - {'sí', 'no', 'ni'}
    non_letters = re.compile('[^a-zA-ZÀ-ÿ]')

    corpus = []
    # Iterate over the values directly so the function does not depend on the
    # frame having a 0..n-1 index; str() guards against non-string cells.
    for text in df['review']:
        tokens = non_letters.sub(' ', str(text)).lower().split()
        stems = [stemmer.stem(tok) for tok in tokens if tok not in stop_words]
        corpus.append(' '.join(stems))

    cv = CountVectorizer(max_features=4000)
    X = cv.fit_transform(corpus).toarray()
    y = df['score'].values

    # Saving transformer; the context manager guarantees the file is closed.
    with open("count_vectorizer.pickle", "wb") as fh:
        pickle.dump(cv, fh)
    return corpus, X, y

In [210]:
# Build the cleaned corpus, the bag-of-words feature matrix and the score
# labels (this call also writes count_vectorizer.pickle to disk).
corpus, X, y = data_preprocessing(df)

## Model training

In [211]:
# Hold out 20% of the samples for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [212]:
def model_trainer(X_train, y_train):
    """Fit a random forest classifier and persist it to disk.

    Args:
        X_train: Feature matrix used for fitting.
        y_train: Target labels aligned with ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier`` (10 trees, ``random_state=0``).

    Side effects:
        Writes the fitted model to "RF_nlp.pickle" in the current working
        directory.
    """
    # Initialization of model.
    forest = RandomForestClassifier(n_estimators=10, random_state=0)

    # Train model.
    forest.fit(X_train, y_train)

    # Save model; the context manager guarantees the file is closed.
    with open("RF_nlp.pickle", "wb") as fh:
        pickle.dump(forest, fh)
    return forest

In [213]:
# Fit the classifier on the training split (this call also writes
# RF_nlp.pickle to disk).
trained_model = model_trainer(X_train, y_train)

In [214]:
def accuracy(X_test, y_test, trained_model):
    """Score a fitted model on held-out data.

    Args:
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with ``X_test``.
        trained_model: Fitted estimator exposing ``predict``.

    Returns:
        A tuple ``(accuracy, confusion_matrix)`` computed from ``y_test`` and
        the model's predictions on ``X_test``.
    """
    predictions = trained_model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    return score, matrix

In [215]:
# Evaluate the fitted model on the held-out split. The original passed the
# undefined name `train`; the fitted model is bound to `trained_model`.
acc, cm = accuracy(X_test, y_test, trained_model)

## Feeling deduction

In [None]:
# Create the Spanish sentiment analyzer. `create_analyzer` is imported at the
# top of the file but was never called, so `analyzer` was undefined here.
analyzer = create_analyzer(task="sentiment", lang="es")

# NOTE(review): `corpus` holds the stemmed, stop-word-filtered reviews built
# for the bag-of-words model; confirm whether the raw df['review'] text should
# be passed to the analyzer instead.
pred_sentimientos = analyzer.predict(corpus)
output_sent = [pred.output for pred in pred_sentimientos]

# Translate the analyzer labels; anything other than 'POS' / 'NEG' is
# reported as 'neutro'.
label_names = {'POS': 'positivo', 'NEG': 'negativo'}
vec_sent = [label_names.get(elem, 'neutro') for elem in output_sent]
df['feeling'] = vec_sent

# Export final dataframe.
df.to_csv('db_reviews_claro_feelings.csv', index=False)