In [2]:
# --> Librerias básicas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Lectura de dataset

In [3]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Limpieza de texto

In [4]:
import re
import nltk
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()                                            # Crea un objeto para reducir las palabras a su raíz

nltk.download('stopwords')                                      # Descarga palabras clave innecesarias
stop_words = stopwords.words('english')                         # Carga las palabras clave en español

corpus = []                                                     # Crea una lista vacía para almacenar las palabras clave

for i in range(0, 1000):                                            # Recorre todas las filas del dataset
    review = dataset["Review"][i]                                   # Selecciona la columna "Review" de la fila i
    review = re.sub('[^a-zA-Z]', ' ', review)                       # Elimina todos los caracteres que no sean letras
    review = review.lower()                                         # Convierte todas las letras a minúsculas
    review = review.split()                                         # Separa las palabras en una lista

    review = [ps.stem(word) for word in review if not word in set(stop_words)]    # Elimina las palabras clave
    review = ' '.join(review)                                       # Une las palabras en una cadena
    corpus.append(review)                                           # Añade la cadena a la lista



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\uriel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Bolsa de palabras

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer instance
vectorizer = CountVectorizer(max_features=1500)
x = vectorizer.fit_transform(corpus).toarray()


## Obtener variable dependiente

In [6]:
y = dataset.iloc[:, 1].values

## Implementar algoritmo de clasificación

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

In [13]:
# --> Teorema de bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)


In [14]:
y_predict = classifier.predict(x_test)


In [15]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predict)
cm


array([[55, 42],
       [12, 91]], dtype=int64)

## Evaluar rendimiento

In [17]:
tp = 55
tn = 91
fp = 12
fn = 42

accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy: ", accuracy)

precision = tp / (tp + fp)
print("Precision: ", precision)

recall = tp / (tp + fn)
print("Recall: ", recall)

f1 = 2 * (precision * recall) / (precision + recall)
print("F1: ", f1)

Accuracy:  0.73
Precision:  0.8208955223880597
Recall:  0.5670103092783505
F1:  0.6707317073170731


## Predicción

In [35]:
# Texto de entrada para hacer la predicción
input_text = "This restaurant i like."

# Limpieza y procesamiento del texto
cleaned_input = re.sub('[^a-zA-Z]', ' ', input_text)
cleaned_input = cleaned_input.lower()
cleaned_input = cleaned_input.split()
cleaned_input = [ps.stem(word) for word in cleaned_input if not word in set(stopwords.words('english'))]
cleaned_input = ' '.join(cleaned_input)

# Transforma el texto procesado
input_vector = vectorizer.transform([cleaned_input]).toarray()

# Realiza la predicción
prediction = classifier.predict(input_vector)
print(prediction)

# Imprime el resultado de la predicción
if prediction[0] == 1:
    print("La predicción es: Positiva")
else:
    print("La predicción es: Negativa")


[1]
La predicción es: Positiva
