In [6]:
import json
import pandas as pd
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def analyze_reviews_with_vader(input_path, output_path, text_field='text', batch_size=10000):
    """
    Score reviews from a JSON Lines file with VADER and save the result as CSV.

    The input is streamed one line at a time; every non-blank line must be a
    JSON object. Only reviews whose 'location_classification' value is present
    and truthy are kept. Kept reviews are scored in batches, and every original
    field (including review_id, when present) is carried through to the output.

    Args:
        input_path: Path to a JSON Lines file (one review object per line)
        output_path: Path to save CSV with sentiment scores
        text_field: Name of field containing review text
        batch_size: Number of kept reviews to score at once

    Returns:
        DataFrame with the kept reviews plus the vader_neg, vader_neu,
        vader_pos and vader_compound columns. When no review qualifies, an
        empty frame holding only text_field and the score columns is returned
        (and written) instead of failing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    analyzer = SentimentIntensityAnalyzer()
    chunks = []

    def _score_batch(records):
        # Single place that turns buffered records into a scored chunk.
        chunks.append(_add_vader_scores(pd.DataFrame(records), analyzer, text_field))

    # Stream the file line-by-line so the raw JSON text is never held in memory
    # all at once. Explicit UTF-8 avoids depending on the platform default encoding.
    with open(input_path, 'r', encoding='utf-8') as f:
        batch = []
        for line in tqdm(f, desc="Processing reviews"):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            review = json.loads(line)

            # Only reviews with a location classification are processed;
            # .get() treats a missing key the same as an empty value.
            if not review.get('location_classification'):
                continue

            batch.append(review)

            # Checked only after an append, so an empty batch is never flushed.
            if len(batch) >= batch_size:
                _score_batch(batch)
                batch = []

        # Process remaining reviews
        if batch:
            _score_batch(batch)

    # Combine and save; pd.concat raises on an empty list, so handle that case.
    if chunks:
        result = pd.concat(chunks, ignore_index=True)
    else:
        result = pd.DataFrame(
            columns=[text_field, 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
        )

    result.to_csv(output_path, index=False)
    print(f"Saved {len(result)} reviews with sentiment scores to {output_path}")
    return result

def _add_vader_scores(df, analyzer, text_field):
    """Attach VADER neg/neu/pos/compound scores to ``df`` as ``vader_*`` columns.

    Rows whose text is missing (NaN/None) receive all-zero scores. ``df`` is
    modified in place and also returned.
    """
    def _score_text(text):
        # Missing text cannot be analysed; fall back to a neutral all-zero result.
        if pd.notna(text):
            return analyzer.polarity_scores(str(text))
        return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

    score_dicts = df[text_field].apply(_score_text)

    column_to_key = (
        ('vader_neg', 'neg'),
        ('vader_neu', 'neu'),
        ('vader_pos', 'pos'),
        ('vader_compound', 'compound'),
    )
    for column, key in column_to_key:
        # Bind key as a default so each lambda reads its own score component.
        df[column] = score_dicts.apply(lambda scores, key=key: scores[key])

    return df

def get_sentiment_summary(df, group_by='review_id'):
    """Summarise the VADER scores per group.

    For each value of ``group_by`` the compound score is reduced to its mean,
    min, max and count, and the pos/neg/neu scores to their mean. All figures
    are rounded to three decimals.
    """
    aggregations = {'vader_compound': ['mean', 'min', 'max', 'count']}
    for score_column in ('vader_pos', 'vader_neg', 'vader_neu'):
        aggregations[score_column] = 'mean'

    grouped = df.groupby(group_by)
    per_group = grouped.agg(aggregations)
    return per_group.round(3)

In [7]:
# Score the classified restaurant reviews and persist the result as CSV.
# NOTE: this is a long-running cell on the full dataset.
classified_reviews_path = '../yelp_dataset/yelp_restaurant_reviews_classified.json'
vader_scores_path = '../yelp_dataset/yelp_restaurant_reviews_with_vader.csv'

review_sentiment_df = analyze_reviews_with_vader(
    input_path=classified_reviews_path,
    output_path=vader_scores_path,
    text_field='text',
    batch_size=5000,
)

Processing reviews: 5126287it [1:06:31, 1284.34it/s]


Saved 5126287 reviews with sentiment scores to ../yelp_dataset/yelp_restaurant_reviews_with_vader.csv


In [8]:
# Sanity check: list the columns of the scored frame (original review fields
# plus the four vader_* score columns).
review_sentiment_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date', 'location_classification', 'vader_neg',
       'vader_neu', 'vader_pos', 'vader_compound'],
      dtype='object')

In [None]:
# TODO: inspect the score distribution, e.g. review_sentiment_df.vader_compound.describe()