In [1]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

### Cargar el archivo tip.parquet, realizar el análisis de sentimiento, crear variables dummy y agrupar por business_id

In [48]:
# Load the tips dataset from the Parquet file into a pandas DataFrame.
df_tip_sent = pd.read_parquet('..//data//tip.parquet')

In [49]:
# Discard the 'date' and 'user_id' columns; the later cells only work with
# 'business_id', 'text' and 'compliment_count'.
df_tip_sent = df_tip_sent.drop(columns=['date', 'user_id'])

In [50]:
# Make sure the VADER lexicon is available locally. The network is only
# contacted when the resource is missing, so the notebook can be re-run
# offline once the lexicon has been downloaded.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [51]:
# Instantiate the sentiment analyzer; `classify_sentiment` reads it as a global.
sia = SentimentIntensityAnalyzer()

In [52]:
# Map a piece of text to a sentiment class based on its compound polarity score.
def classify_sentiment(text, threshold=0.05):
    """Classify ``text`` as negative (0), neutral (1) or positive (2).

    The decision is based on the ``'compound'`` entry returned by the
    module-level analyzer ``sia.polarity_scores``.

    Parameters
    ----------
    text : str or missing value
        Text to score. Anything that is not a non-blank string (``None``,
        ``NaN``, numbers, empty or whitespace-only strings) is treated as
        neutral instead of being passed to the analyzer.
    threshold : float, default 0.05
        Symmetric cut-off on the compound score: values ``>= threshold`` are
        positive, values ``<= -threshold`` are negative.

    Returns
    -------
    int
        ``2`` for positive, ``0`` for negative, ``1`` for neutral.
    """
    # Missing or non-string cells cannot be scored by the analyzer (it would
    # raise), and blank text carries no sentiment: report both as neutral.
    if not isinstance(text, str) or not text.strip():
        return 1

    compound = sia.polarity_scores(text)['compound']
    if compound >= threshold:
        return 2  # Positive
    if compound <= -threshold:
        return 0  # Negative
    return 1  # Neutral

In [53]:
# Label every tip with the sentiment class computed from its 'text' value.
df_tip_sent['sentiment'] = df_tip_sent['text'].map(classify_sentiment)

In [54]:
# Build one 0/1 indicator column per sentiment class. Vectorized comparisons
# replace the per-row Python lambdas; the explicit 'int64' keeps the dtype the
# same on every platform.
df_tip_sent['positive'] = df_tip_sent['sentiment'].eq(2).astype('int64')
df_tip_sent['neutral'] = df_tip_sent['sentiment'].eq(1).astype('int64')
df_tip_sent['negative'] = df_tip_sent['sentiment'].eq(0).astype('int64')

In [55]:
# The raw 'text' and the intermediate 'sentiment' code are no longer needed
# once the indicator columns exist, so leave them out of the frame.
df_tip_sent = df_tip_sent.drop(columns=['text', 'sentiment'])

In [56]:
# Collapse the frame to one row per 'business_id', taking the mean of the
# compliment count and of each sentiment indicator (the mean of a 0/1
# indicator is the share of tips in that class).
df_tip_sent = (
    df_tip_sent
    .groupby('business_id')
    .agg(dict.fromkeys(
        ['compliment_count', 'positive', 'neutral', 'negative'],
        'mean',
    ))
    .reset_index()
)

In [57]:
# Preview the first ten rows of the aggregated frame.
df_tip_sent.head(10)

Unnamed: 0,business_id,compliment_count,positive,neutral,negative
0,-6kIZWnXPuDC6JiQJ-A1fg,0.0,0.611111,0.333333,0.055556
1,-85kJMtb9wqNWDT8yLbitw,0.0,0.5,0.5,0.0
2,-FSNRWP_3twzsH-qliHcZQ,0.0,0.0,0.0,1.0
3,-GJN01qCjGgnNBTbZexNpQ,0.0,0.5,0.5,0.0
4,-KdeX92-JV2K8GWbAxVj2w,0.045455,0.636364,0.227273,0.136364
5,-QG6KSRQKTQ80--wqrnLTg,0.0,0.0,1.0,0.0
6,-cDVG1zBZPBYU3TAoshgxw,0.166667,0.5,0.333333,0.166667
7,-ilTnXu41RrxQITuolQhmQ,0.0,0.333333,0.333333,0.333333
8,-vSkeoIujNpKhITwvcYVLw,0.0,1.0,0.0,0.0
9,-xgWMTF_F8E85I_XImNEgg,0.0,0.1,0.4,0.5


In [58]:
# Show the column dtypes, non-null counts and memory usage of the aggregated frame.
df_tip_sent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651 entries, 0 to 650
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   business_id       651 non-null    object 
 1   compliment_count  651 non-null    float64
 2   positive          651 non-null    float64
 3   neutral           651 non-null    float64
 4   negative          651 non-null    float64
dtypes: float64(4), object(1)
memory usage: 25.6+ KB


In [59]:
# Persist the aggregated DataFrame as a Parquet file.
df_tip_sent.to_parquet('..//data//tip_sent.parquet')