# SentimentAnalysis
## Random Forest
### [ Opiniones VS Atracciones ]
#### Ing. Luis Felipe Narvaez Gomez. E-mail: luis.narvaez@usantoto.edu.co. Cod: 2312660. Facultad de Ingenieria de Sistemas. USTA.


### Importar Librerias

Vamos ahora a importar las librerías que necesitamos para preparar los datos, entrenar el modelo y evaluar la predicción.

In [1]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import Image

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import pickle

[nltk_data] Downloading package stopwords to C:\Users\Ruiso Local
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Leer un archivo XLSX

In [3]:
# Load the training spreadsheet (XLSX) into a DataFrame.
# NOTE(review): the path is relative, so the workbook presumably sits in the
# notebook's working directory — confirm.
TRAIN_XLSX = "Rest_Mex_2022_Sentiment_Analysis_Track_Train.xlsx"
dfData = pd.read_excel(TRAIN_XLSX)

In [5]:
# Display the column labels of the loaded DataFrame.
dfData.columns

Index(['Title', 'Opinion', 'Polarity', 'Attraction'], dtype='object')

### Establecer FEATURES y LABELS

In [4]:
# Column positions in dfData:
#   0 -> 'Title', 1 -> 'Opinion', 2 -> 'Polarity', 3 -> 'Attraction'
OPINION_COL = 1
ATTRACTION_COL = 3

# Model input: the 'Opinion' column of every row.
features = dfData.iloc[:, OPINION_COL].values
# Prediction target: the 'Attraction' column of every row.
labels = dfData.iloc[:, ATTRACTION_COL].values

### Limpiar la Data

In [6]:
# Regular expressions used by the cleaner, compiled once instead of per row.
_NON_WORD = re.compile(r'\W')
_INNER_SINGLE_LETTER = re.compile(r'\s+[a-zA-Z]\s+')
_LEADING_SINGLE_LETTER = re.compile(r'^[a-zA-Z]\s+')
_WHITESPACE_RUN = re.compile(r'\s+')


def _clean_opinion(raw):
    """Normalize one opinion before it is vectorized.

    Args:
        raw: One value taken from ``features``. It is converted with
            ``str()`` first, so non-string cells are cleaned as their
            string representation.

    Returns:
        str: Lowercase text in which every non-word character has been
        replaced by a space, single ASCII letters surrounded by whitespace
        (or sitting at the very start) have been removed, and runs of
        whitespace have been collapsed to one space. Adjacent single
        letters that share one separator may survive, as before.
    """
    # Replace every special (non-word) character with a space.
    text = _NON_WORD.sub(' ', str(raw))
    # Drop single letters that stand alone in the middle of the text.
    text = _INNER_SINGLE_LETTER.sub(' ', text)
    # Drop a single letter at the very start of the text. The previous
    # pattern was r'\^[a-zA-Z]\s+': the escaped caret looked for a literal
    # '^', which the non-word substitution above has already removed, so the
    # rule never fired. This rule also covers the old "prefixed 'b'" step
    # (r'^b\s+'), which is why that step is no longer needed.
    text = _LEADING_SINGLE_LETTER.sub('', text)
    # Collapse runs of whitespace into one space.
    text = _WHITESPACE_RUN.sub(' ', text)
    return text.lower()


# One cleaned string per opinion, in the same order as `features`.
processed_features = [_clean_opinion(opinion) for opinion in features]

### StopWords y vectorización TF-IDF

In [7]:
# Turn the cleaned opinions into TF-IDF vectors, ignoring Spanish stop words.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the held-out rows also shape the vocabulary and IDF weights —
# confirm this is intended.
spanish_stop_words = stopwords.words('spanish')
vectorizer = TfidfVectorizer(
    max_features=2500,  # cap on the vocabulary size
    min_df=7,           # minimum document frequency (absolute count)
    max_df=0.8,         # maximum document frequency (proportion)
    stop_words=spanish_stop_words,
)
tfidf_matrix = vectorizer.fit_transform(processed_features)
# Rebinds processed_features from the list of cleaned strings to a dense array.
processed_features = tfidf_matrix.toarray()

### Vectores de entrenamiento

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(
    processed_features, labels, test_size=0.2, random_state=0
)
X_train, X_test, y_train, y_test = split_parts

### Entrenamiento

In [9]:
# Fit a 200-tree random forest on the training vectors (seeded so that
# repeated runs build the same forest).
forest_params = {"n_estimators": 200, "random_state": 0}
text_classifier = RandomForestClassifier(**forest_params)
# Left as a bare expression so the notebook echoes the fitted estimator.
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

### Ahora a predecir

In [10]:
# Predict a label for every held-out row.
predictions = text_classifier.predict(X_test)

In [11]:
# Confusion matrix on the held-out rows: rows are true labels, columns are
# predicted labels.
test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

[[1024    6   25]
 [  13 3170   65]
 [  14  100 1626]]


In [12]:
# Per-class precision, recall and F1 on the held-out rows.
test_report = classification_report(y_test, predictions)
print(test_report)

              precision    recall  f1-score   support

  Attractive       0.97      0.97      0.97      1055
       Hotel       0.97      0.98      0.97      3248
  Restaurant       0.95      0.93      0.94      1740

    accuracy                           0.96      6043
   macro avg       0.96      0.96      0.96      6043
weighted avg       0.96      0.96      0.96      6043



In [13]:
# Overall fraction of held-out rows labelled correctly.
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)

0.9630977991064041


### Exportar Modelo de entrenamiento

In [14]:
# Persist the trained classifier to disk so it can be reloaded later.
# NOTE(review): only the classifier is written; the fitted `vectorizer` is
# not, and it is presumably needed to transform new text before calling
# predict — confirm.
with open('SentimentAnalysis_RandomForest_OvA', 'wb') as model_out:
    pickle.dump(text_classifier, model_out)

### Importar Modelo de entrenamiento

In [17]:
# Reload the pickled classifier from disk for later use.
# pickle can run arbitrary code while loading, so only open files that come
# from a trusted source.
with open('SentimentAnalysis_RandomForest_OvA', 'rb') as model_in:
    model = pickle.load(model_in)

In [18]:
# Score the held-out rows again with the reloaded model.
y_pred2 = model.predict(X_test)

# Print the confusion matrix, the per-class report and the accuracy, in that
# order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred2))

[[1024    6   25]
 [  13 3170   65]
 [  14  100 1626]]
              precision    recall  f1-score   support

  Attractive       0.97      0.97      0.97      1055
       Hotel       0.97      0.98      0.97      3248
  Restaurant       0.95      0.93      0.94      1740

    accuracy                           0.96      6043
   macro avg       0.96      0.96      0.96      6043
weighted avg       0.96      0.96      0.96      6043

0.9630977991064041
