In [1]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

### Cargar archivo review.parquet, realizar analisis sentimental, variables dummy, agrupación por business_id

In [44]:
# Carga el archivo .parquet en un DataFrame de Pandas
df_review_sent = pd.read_parquet('..//data//review.parquet')

In [45]:
# Eliminar las columnas 'review_id' y 'user_id'
df_review_sent.drop(columns=['review_id', 'user_id'], inplace=True)

In [46]:
# Descargar el lexicón de VADER
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [47]:
# Inicializar el analizador de sentimientos
sia = SentimentIntensityAnalyzer()

In [48]:
# Función para clasificar el sentimiento
def classify_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 2  # Positivo
    elif scores['compound'] <= -0.05:
        return 0  # Negativo
    else:
        return 1  # Neutral

In [49]:
# Aplicar la función a la columna 'text'
df_review_sent['sentiment'] = df_review_sent['text'].apply(classify_sentiment)

In [50]:
df_review_sent.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date,sentiment
0,aJvxWyQIG5OLfBw3qAe8xA,2.0,0,0,0,"Ordered Caramel Frappe At Drive Thru, Big Mist...",2017-12-29 19:38:31,0
1,MjZQqZAmJeMco_Vq-Y9h-g,4.0,0,0,0,Drum-roll Please! Review #100 Coming Right Up!...,2014-02-05 19:38:24,2
2,u7MJKcNdZXYyTeb67vD5jw,4.0,1,0,0,We Stopped Here For My Chai And Hubby's Coffee...,2017-02-09 04:35:39,2
3,saJFbz12EnzanelpD8_xXQ,2.0,0,0,0,There's Been Three Times That I've Ordered A G...,2016-08-25 14:08:18,1
4,KiE0h68HGOO7ZXAqkMBdiw,1.0,0,1,0,"I Went In When They Had 4 People Working, Wait...",2016-01-30 01:10:42,0


In [51]:
# Guardar el DataFrame como archivo Parquet
df_review_sent.to_parquet('..//data//review_sent.parquet')

In [52]:
df_review_sent_total = df_review_sent

In [53]:
# Crear variables dummy
df_review_sent_total['positive'] = df_review_sent['sentiment'].apply(lambda x: 1 if x == 2 else 0)
df_review_sent_total['neutral'] = df_review_sent['sentiment'].apply(lambda x: 1 if x == 1 else 0)
df_review_sent_total['negative'] = df_review_sent['sentiment'].apply(lambda x: 1 if x == 0 else 0)

In [54]:
# Eliminar la columna 'sentiment' si no la necesitas
df_review_sent_total.drop(columns=['sentiment'], inplace=True)

In [55]:
# Eliminar las columnas 'text' y 'date'
df_review_sent_total.drop(columns=['text', 'date'], inplace=True)

In [56]:
# Funciones personalizadas para sumar los valores > 0
def sum_positive(x):
    return x[x > 0].sum()

def sum_neutral(x):
    return x[x > 0].sum()

def sum_negative(x):
    return x[x > 0].sum()

In [58]:
# Crear columnas total para positive, neutral, y negative antes de agrupar
df_review_sent_total['positive_total'] = df_review_sent_total.groupby('business_id')['positive'].transform(lambda x: sum_positive(x))
df_review_sent_total['neutral_total'] = df_review_sent_total.groupby('business_id')['neutral'].transform(lambda x: sum_neutral(x))
df_review_sent_total['negative_total'] = df_review_sent_total.groupby('business_id')['negative'].transform(lambda x: sum_negative(x))


In [59]:
# Agrupar por la columna 'business_id' y calcular el valor promedio de las columnas relevantes
df_review_sent_total = df_review_sent_total.groupby('business_id').agg({
    'stars': 'mean',
    'useful': 'mean',
    'funny': 'mean',
    'cool': 'mean',
    'positive': 'mean',
    'neutral': 'mean',
    'negative': 'mean',
    'positive_total': 'sum',  # Sumar los valores totales
    'neutral_total': 'sum',   # Sumar los valores totales
    'negative_total': 'sum'   # Sumar los valores totales
}).reset_index()

In [60]:
df_review_sent_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   business_id     730 non-null    object 
 1   stars           730 non-null    float64
 2   useful          730 non-null    float64
 3   funny           730 non-null    float64
 4   cool            730 non-null    float64
 5   positive        730 non-null    float64
 6   neutral         730 non-null    float64
 7   negative        730 non-null    float64
 8   positive_total  730 non-null    int64  
 9   neutral_total   730 non-null    int64  
 10  negative_total  730 non-null    int64  
dtypes: float64(7), int64(3), object(1)
memory usage: 62.9+ KB


In [61]:
df_review_sent_total.head()

Unnamed: 0,business_id,stars,useful,funny,cool,positive,neutral,negative,positive_total,neutral_total,negative_total
0,-6kIZWnXPuDC6JiQJ-A1fg,3.317073,0.829268,0.439024,0.463415,0.707317,0.0,0.292683,1189,0,492
1,-85kJMtb9wqNWDT8yLbitw,3.666667,0.861111,0.555556,0.583333,0.777778,0.055556,0.166667,1008,72,216
2,-FSNRWP_3twzsH-qliHcZQ,2.333333,0.333333,0.111111,0.111111,0.555556,0.0,0.444444,45,0,36
3,-GJN01qCjGgnNBTbZexNpQ,3.375,0.625,0.375,0.375,0.625,0.0,0.375,40,0,24
4,-KdeX92-JV2K8GWbAxVj2w,2.73913,0.608696,0.195652,0.108696,0.630435,0.065217,0.304348,1334,138,644


In [62]:
# Guardar el DataFrame como archivo Parquet
df_review_sent_total.to_parquet('..//data//review_sent_total.parquet')