In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#from google.cloud import bigquery
#from google.cloud import storage
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from textblob import TextBlob

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from collections import Counter

import nltk
import os
import pandas as pd
import numpy as np


ModuleNotFoundError: No module named 'textblob'

In [2]:
# Importamos el archivo de los datos para trabajar
df = pd.read_parquet(r'business_reviews.parquet',engine='pyarrow')

# Eliminamos las columnas que no sean necesarias
df.drop(columns=['business_id','address', 'postal_code',
       'latitude', 'longitude', 'stars_mean', 'review_count', 'is_open',
       'review_id', 'user_id', 'useful', 'funny',
       'cool', 'date'],inplace=True)

# Tomamos como referencia x cantidad de valores para trabajar
df = df.sample(10000)

In [3]:
# Toquenizamos la columna text
df['tokens'] = df['text'].apply(lambda x: word_tokenize(x.lower())) 

# Lematizamos y hacimos stopwords
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# Función de lematización
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stopwords]
df['lemmatized_tokens'] = df['tokens'].apply(lemmatize_tokens)
df['reconstructed_text'] = df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))


In [100]:
# Aplicamos un modelo de Analisis de Sentimientos
df['polaridad'] = df['reconstructed_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['sentimiento'] = df['polaridad'].apply(lambda x: 'positivo' if x > 0.3 else 'negativo' if x < 0 else 'negativo')

In [101]:
# Visualizamos 
df.head(3)

Unnamed: 0,name,city,state,categories,stars_review,text,tokens,lemmatized_tokens,reconstructed_text,polaridad,sentimiento,target
872460,Ye Olde College Inn,New Orleans,Louisiana,"Restaurants, Southern, Cajun/Creole, American ...",4.0,"This place was incredibly tasty. I got the ""c...","[this, place, was, incredibly, tasty, ., i, go...","[place, incredibly, tasty, got, crawfish, dela...",place incredibly tasty got crawfish delacroix ...,0.322429,positivo,0
735319,St Charles Bar & Tavern,New Orleans,Louisiana,"Bars, Nightlife",2.0,"The waitresses her are always drunk, unkempt, ...","[the, waitresses, her, are, always, drunk, ,, ...","[waitress, always, drunk, unkempt, otherwise, ...",waitress always drunk unkempt otherwise icky g...,-0.05,negativo,0
818606,Cafe Negril,New Orleans,Louisiana,"Nightlife, Arts & Entertainment, Mexican, Jazz...",4.0,"The night I went, there was a funk band playin...","[the, night, i, went, ,, there, was, a, funk, ...","[night, went, funk, band, playing, whole, plac...",night went funk band playing whole place danci...,0.242857,negativo,0


In [102]:
# Creamos una columna Targuet que contiene valores de igualdad para los positivos y negativos
df['target'] = (df['sentimiento'] == 'positivo').astype(int)
df['target']

872460     1
735319     0
818606     0
742806     0
428867     0
          ..
873832     0
1210791    1
576029     0
918296     0
967621     0
Name: target, Length: 10000, dtype: int64

In [103]:
reviews = df['reconstructed_text']

In [104]:
# Creamos una matriz TF-IDF para comenzar a con el modelo de Regresion Logistica
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english') 
X = vectorizer.fit_transform(reviews)

In [105]:
# Observamos los valores de X
print(X.shape)

(10000, 20463)


In [106]:
# Creamos la variable Y para luego entrenar el modelo
y = df["target"]

In [115]:
# Dividimos el dataset en conjuntos de entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [116]:
# Importamos el modelo de regresión logística de Scikit-learn

model = LogisticRegression()
model.fit(X_train, y_train)

In [117]:
# Hacemos la prediccion
y_pred = model.predict(X_test)
# Imprimimos el reporte de clasificación
print("Reporte de Clasificación:")
print("--------------------------")
print(classification_report(y_test, y_pred))

# Imprimimos la matriz de confusión
print("Matriz de Confusión:")
print("--------------------")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Reporte de Clasificación:
--------------------------
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1095
           1       0.88      0.80      0.84       905

    accuracy                           0.86      2000
   macro avg       0.86      0.85      0.86      2000
weighted avg       0.86      0.86      0.86      2000

Matriz de Confusión:
--------------------
[[998  97]
 [185 720]]
