In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import string
import pyarrow.parquet as pq

In [None]:
# Download every NLTK resource this notebook uses so it runs on a fresh runtime:
#   - 'stopwords'      : stopword lists used by preprocess_text
#   - 'vader_lexicon'  : lexicon used by SentimentIntensityAnalyzer
#   - 'punkt' / 'punkt_tab' : tokenizer models required by word_tokenize
#     (newer NLTK releases look for 'punkt_tab', older ones for 'punkt')
for resource in ('stopwords', 'vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive (Colab-specific module).
# NOTE(review): the dataset below is read from /content/..., not from a path
# under /content/drive — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the restaurant reviews Parquet file into a DataFrame.
reviews_path = '/content/reviews_restaurants_v1.parquet'
df = pd.read_parquet(reviews_path)

In [None]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Stopword sets keyed by language name, filled lazily on first use so the
# corpus file is read once per language rather than once per review.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language):
    """Return the NLTK stopword set for `language`, loading it only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


# Text preprocessing function
def preprocess_text(text, language='english'):
    """Normalize a piece of text for analysis.

    The text is lower-cased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize`` and filtered against the NLTK stopword list.

    Parameters
    ----------
    text : str or None
        Raw text. Any non-string value (``None``, ``NaN``, numbers, ...) is
        treated as missing.
    language : str, default 'english'
        Language name passed to both the NLTK stopword corpus and
        ``word_tokenize``. The corresponding NLTK resources must be installed.

    Returns
    -------
    str or None
        The remaining tokens joined by single spaces, or ``None`` when `text`
        is not a string.
    """
    # Missing values can arrive as None or as float NaN depending on how the
    # column was loaded; neither has string methods, so bail out early.
    if not isinstance(text, str):
        return None

    # Lower-case and remove punctuation
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize
    tokens = word_tokenize(cleaned, language=language)

    # Drop stopwords
    stop_words = _get_stop_words(language)
    kept = [word for word in tokens if word not in stop_words]

    # Rebuild the text from the surviving tokens
    return ' '.join(kept)

In [None]:
# Apply the preprocessing function to the 'text' column. The cleaned text is
# kept as its own column; it is NOT what gets scored below.
df['processed_text'] = df['text'].apply(preprocess_text)

# Instantiate the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Compound-score cut-offs: >= +0.05 is positive, <= -0.05 is negative,
# anything in between is neutral.
COMPOUND_THRESHOLD = 0.05


def _sentiment_label(text):
    """Map a raw review to 1 (positive), -1 (negative) or 0 (neutral).

    Non-string input (missing review text) is labelled 0.
    """
    if not isinstance(text, str):
        return 0  # missing text is assigned 0

    compound = sia.polarity_scores(text)['compound']
    if compound >= COMPOUND_THRESHOLD:
        return 1   # Positive
    if compound <= -COMPOUND_THRESHOLD:
        return -1  # Negative
    return 0       # Neutral


# Score the ORIGINAL text, not 'processed_text'. VADER's rules use negations
# ("not", "no"), intensifiers ("very", "so"), "but", capitalization and
# punctuation such as "!". Stopword removal, lower-casing and punctuation
# stripping delete exactly those cues (e.g. "not good" becomes "good"),
# which skews the labels.
df['sentiment_analysis'] = df['text'].map(_sentiment_label)

In [None]:
  # Display the DataFrame, now including 'processed_text' and 'sentiment_analysis'.
  # NOTE(review): the rendered output contains reviewer names — consider
  # clearing or redacting outputs before sharing this notebook.
  df

Unnamed: 0,user_id,name,rating,text,gmap_id,state_name,year,month,processed_text,sentiment_analysis
0,1.179759e+20,Anthony Roberts,4,"On the higher end of price for pizza, but they...",0x8889221157fb3455:0x5c125c40c3eccc2a,Alabama,2016,5,higher end price pizza many different types ca...,1
1,1.143165e+20,Jonathan Robert,3,"Food was ok, felt like the atmosphere as well ...",0x8889221157fb3455:0x5c125c40c3eccc2a,Alabama,2015,11,food ok felt like atmosphere well service litt...,1
2,1.136326e+20,Brian Harvey,4,"Good food, service so so",0x8889221157fb3455:0x5c125c40c3eccc2a,Alabama,2016,7,good food service,1
3,1.076840e+20,Ashley Maddox,5,Love it,0x8889221157fb3455:0x5c125c40c3eccc2a,Alabama,2015,9,love,1
4,1.126036e+20,West Martin,5,Yum,0x8889221157fb3455:0x5c125c40c3eccc2a,Alabama,2013,10,yum,0
...,...,...,...,...,...,...,...,...,...,...
11900477,1.066896e+20,Nancy Schaffer,5,,0x5335fac0b65b4243:0x78f735c40c344b01,Wyoming,2017,11,,0
11900478,1.019377e+20,Mackenzie Mayer,2,,0x5335fac0b65b4243:0x78f735c40c344b01,Wyoming,2018,6,,0
11900479,1.087467e+20,Walter Orum,5,,0x5335fac0b65b4243:0x78f735c40c344b01,Wyoming,2017,4,,0
11900480,1.124218e+20,Charissa Carver,5,,0x5335fac0b65b4243:0x78f735c40c344b01,Wyoming,2019,4,,0


In [None]:
# Count how many reviews received each sentiment label (1, 0, -1).
sentiment_counts = df['sentiment_analysis'].value_counts()
sentiment_counts

sentiment_analysis
 1    5561289
 0    5363368
-1     498918
Name: count, dtype: int64

In [None]:
# Write the enriched DataFrame to a Parquet file, without the index.
output_path = 'nombre_del_archivo.parquet'
df.to_parquet(output_path, index=False)