In [None]:
import json
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch
import numpy as np

# Choose where inference runs: device index 0 when torch reports CUDA as
# available, otherwise -1 (the pipeline's CPU setting).
_device_index = -1
if torch.cuda.is_available():
    _device_index = 0

# Shared sentiment classifier used by get_sentiment_score below.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=_device_index,
)

def get_sentiment_score(text):
    """Return a signed sentiment score for ``text`` (between -1 and 1).

    The classifier's confidence is returned unchanged when the predicted
    label is ``'POSITIVE'`` and negated for any other label.

    This is best-effort: any exception raised while scoring is printed and
    the function returns ``0.0`` instead of propagating the error.
    """
    try:
        # Inputs are truncated to at most 512 tokens; only the first
        # prediction in the returned list is used.
        prediction = sentiment_pipeline(text, truncation=True, max_length=512)[0]
        confidence = prediction['score']
        if prediction['label'] == 'POSITIVE':
            return confidence
        return -confidence
    except Exception as e:
        print(f"Error processing text: {e}")
        return 0.0

# Load the timeline database. The loops below expect the shape
# {celebrity: {year: [article, ...]}} where each article is a dict.
# JSON is decoded explicitly as UTF-8 so non-ASCII names and titles load the
# same way regardless of the platform's default text encoding.
with open('celebrity_timeline_db.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One dict per scored article; turned into a DataFrame further down.
rows = []

for celebrity, years in tqdm(data.items(), desc="Processing celebrities"):
    for year, articles in tqdm(years.items(), desc=f"Processing {celebrity}'s years", leave=False):
        for article in articles:
            # Treat a missing or null title/snippet as empty text (the same
            # leniency already applied to 'link') so a single incomplete
            # record does not abort the whole run.
            title = article.get('title') or ''
            snippet = article.get('snippet') or ''

            # Score the headline and snippet together as one passage.
            text = title + " " + snippet
            sentiment = get_sentiment_score(text)

            rows.append({
                'name': celebrity,
                'year': year,
                'sentiment score': sentiment,
                'title': title,
                'link': article.get('link', ''),
            })

# Article-level table: one row per scored article.
df = pd.DataFrame(rows)

# Mean sentiment for each (name, year) pair, exposed as a flat frame with
# the columns 'name', 'year' and 'average_sentiment'.
df_avg = (
    df.groupby(['name', 'year'])['sentiment score']
    .mean()
    .rename('average_sentiment')
    .reset_index()
)

def get_extreme_article(group):
    """Summarise the article with the largest absolute sentiment in ``group``.

    ``group`` is a DataFrame with 'sentiment score', 'link' and 'title'
    columns. The row whose score has the greatest magnitude is selected (the
    first such row when several tie) and returned as a Series with the keys
    'most_extreme_sentiment', 'most_extreme_link' and 'most_extreme_title'.
    The sentiment keeps its original sign.
    """
    magnitudes = group['sentiment score'].abs()
    peak_label = magnitudes.idxmax()

    # (source column, output key) pairs, in the order the output is built.
    field_map = (
        ('sentiment score', 'most_extreme_sentiment'),
        ('link', 'most_extreme_link'),
        ('title', 'most_extreme_title'),
    )
    return pd.Series({
        out_key: group.loc[peak_label, column]
        for column, out_key in field_map
    })

# Find the most extreme article for each (name, year) pair.
# Only the columns get_extreme_article reads are selected before apply, so
# the grouping keys are never passed into the applied function. That avoids
# the pandas warning about apply operating on the grouping columns and gives
# the same result whichever default a given pandas version uses.
df_extreme = (
    df.groupby(['name', 'year'])[['sentiment score', 'link', 'title']]
    .apply(get_extreme_article)
    .reset_index()
)

# Attach the extreme-article details to the per-year averages.
df_avg = pd.merge(df_avg, df_extreme, on=['name', 'year'])

# Fix the column order of the exported report.
df_avg = df_avg[['name', 'year', 'average_sentiment',
                 'most_extreme_sentiment', 'most_extreme_title', 'most_extreme_link']]

df_avg.to_csv('celebrity_sentiment_analysis.csv', index=False)

Device set to use cpu
Processing celebrities: 100%|██████████| 50/50 [01:47<00:00,  2.16s/it]
  df_extreme = df.groupby(['name', 'year']).apply(get_extreme_article).reset_index()
