In [1]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

### Cargar archivo review.parquet, realizar analisis sentimental, variables dummy, agrupación por business_id

In [2]:
# Carga el archivo .parquet en un DataFrame de Pandas
df_review_dunkin_sent = pd.read_parquet('..//data//review_dunkin.parquet')

In [3]:
# Eliminar las columnas 'review_id' y 'user_id'
df_review_dunkin_sent.drop(columns=['review_id', 'user_id'], inplace=True)

In [4]:
# Descargar el lexicón de VADER
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jhcat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
# Inicializar el analizador de sentimientos
sia = SentimentIntensityAnalyzer()

In [6]:
# Función para clasificar el sentimiento
def classify_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 2  # Positivo
    elif scores['compound'] <= -0.05:
        return 0  # Negativo
    else:
        return 1  # Neutral

In [7]:
# Aplicar la función a la columna 'text'
df_review_dunkin_sent['sentiment'] = df_review_dunkin_sent['text'].apply(classify_sentiment)

In [8]:
df_review_dunkin_sent.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date,sentiment
0,GUAF7ybULhg68asLfFZYbA,4.0,0,0,0,I Was Greated By A Nice Friendly Staff. There ...,2016-09-18 14:59:21,2
1,H1bbYNKgk6JF9pKcBXyDXw,1.0,2,0,1,Get Off The Phone And Serve Your Customers. Al...,2012-03-01 01:48:48,0
2,vMp55ea__Pk0fYbBDCn6jg,1.0,0,1,0,This Place Is A Disaster In Slow Motion. I Wou...,2010-05-15 15:42:08,0
3,-3dkEoYgH8AlUtBMZvzUfg,1.0,0,0,0,Had The Worst Experience At Dunkin Doughnuts [...,2017-04-30 15:33:53,2
4,nXJ1dAI-UGbXfeaI_b1abA,1.0,0,0,0,Gave Them Another Try Today ... Smdh .... How ...,2017-02-19 13:05:34,0


In [9]:
# Guardar el DataFrame como archivo Parquet
df_review_dunkin_sent.to_parquet('..//data//review_dunkin_sent.parquet')

In [10]:
df_review_dunkin_sent_total = df_review_dunkin_sent

In [11]:
# Crear variables dummy
df_review_dunkin_sent_total['positive'] = df_review_dunkin_sent['sentiment'].apply(lambda x: 1 if x == 2 else 0)
df_review_dunkin_sent_total['neutral'] = df_review_dunkin_sent['sentiment'].apply(lambda x: 1 if x == 1 else 0)
df_review_dunkin_sent_total['negative'] = df_review_dunkin_sent['sentiment'].apply(lambda x: 1 if x == 0 else 0)

In [12]:
# Eliminar la columna 'sentiment' si no la necesitas
df_review_dunkin_sent_total.drop(columns=['sentiment'], inplace=True)

In [13]:
# Eliminar las columnas 'text' y 'date'
df_review_dunkin_sent_total.drop(columns=['text', 'date'], inplace=True)

In [14]:
# Funciones personalizadas para sumar los valores > 0
def sum_positive(x):
    return x[x > 0].sum()

def sum_neutral(x):
    return x[x > 0].sum()

def sum_negative(x):
    return x[x > 0].sum()

In [15]:
# Crear columnas total para positive, neutral, y negative antes de agrupar
df_review_dunkin_sent_total['positive_total'] = df_review_dunkin_sent_total.groupby('business_id')['positive'].transform(lambda x: sum_positive(x))
df_review_dunkin_sent_total['neutral_total'] = df_review_dunkin_sent_total.groupby('business_id')['neutral'].transform(lambda x: sum_neutral(x))
df_review_dunkin_sent_total['negative_total'] = df_review_dunkin_sent_total.groupby('business_id')['negative'].transform(lambda x: sum_negative(x))


In [16]:
# Agrupar por la columna 'business_id' y calcular el valor promedio de las columnas relevantes
df_review_dunkin_sent_total = df_review_dunkin_sent_total.groupby('business_id').agg({
    'stars': 'mean',
    'useful': 'mean',
    'funny': 'mean',
    'cool': 'mean',
    'positive': 'mean',
    'neutral': 'mean',
    'negative': 'mean',
    'positive_total': 'sum',  # Sumar los valores totales
    'neutral_total': 'sum',   # Sumar los valores totales
    'negative_total': 'sum'   # Sumar los valores totales
}).reset_index()

In [17]:
df_review_dunkin_sent_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   business_id     550 non-null    object 
 1   stars           550 non-null    float64
 2   useful          550 non-null    float64
 3   funny           550 non-null    float64
 4   cool            550 non-null    float64
 5   positive        550 non-null    float64
 6   neutral         550 non-null    float64
 7   negative        550 non-null    float64
 8   positive_total  550 non-null    int64  
 9   neutral_total   550 non-null    int64  
 10  negative_total  550 non-null    int64  
dtypes: float64(7), int64(3), object(1)
memory usage: 47.4+ KB


In [18]:
df_review_dunkin_sent_total.head()

Unnamed: 0,business_id,stars,useful,funny,cool,positive,neutral,negative,positive_total,neutral_total,negative_total
0,-2BDt9OdGiBONysWCdKgNg,3.0,0.588235,0.235294,0.235294,0.764706,0.058824,0.176471,221,17,51
1,-3dkEoYgH8AlUtBMZvzUfg,2.619048,0.428571,0.380952,0.238095,0.714286,0.047619,0.238095,315,21,105
2,-R2w_cB1_nA9ZrgvP2PkUw,3.076923,0.307692,0.307692,0.384615,0.692308,0.0,0.307692,117,0,52
3,-RLJlk68dmL--ff0lsP0JQ,3.0,0.75,0.125,0.125,0.75,0.0,0.25,48,0,16
4,-RkGrQUxlwgaR5By4p2tAQ,2.461538,1.076923,0.461538,0.538462,0.692308,0.0,0.307692,117,0,52


In [19]:
# Guardar el DataFrame como archivo Parquet
df_review_dunkin_sent_total.to_parquet('..//data//review_dunkin_sent_total.parquet')