# In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
from datetime import datetime, timedelta
from collections import defaultdict

# Checkpoint shared by the sentiment classifier and its tokenizer; both are
# loaded once here and reused by every scoring call below.
modelName = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)

# Function to calculate sentiment score
def calculate_sentiment(texts):
    """Return the average class-probability distribution over ``texts``.

    Each text is tokenized (truncated to 512 tokens and padded to a common
    length), scored by the module-level ``model``, and converted to class
    probabilities with a softmax. The per-text distributions are then
    averaged into a single distribution.

    Args:
        texts: List of strings to score.

    Returns:
        A list of floats, one per model class, averaged over all texts.
        For empty input the one-hot neutral distribution ``[0, 0, 1, 0, 0]``
        is returned without running the model.
    """
    if not texts:
        # Per the model card, the classes run from "1 star" (index 0) to
        # "5 stars" (index 4), so neutral is the middle class. A one-hot on
        # the last index would read as maximally positive sentiment.
        return [0, 0, 1, 0, 0]

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
        return_attention_mask=True,
    )

    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = logits.softmax(dim=1)  # one row of class probabilities per text
    return probabilities.mean(dim=0).tolist()  # average over texts, as a plain list

# Function to group news articles by date
def group_articles_by_date(articles):
    """Group articles by the date prefix of their ``publishedAt`` field.

    Args:
        articles: Iterable of article dicts. ``publishedAt`` is expected to
            be a timestamp string that starts with ``YYYY-MM-DD``.

    Returns:
        A plain dict mapping each date string to the list of articles
        published on that date, in their original order. Articles whose
        ``publishedAt`` is missing, null, or empty are left out.
    """
    grouped_data = defaultdict(list)
    for article in articles:
        # `or ""` covers both a missing key and an explicit null value;
        # slicing None would otherwise raise TypeError.
        date_str = (article.get("publishedAt") or "")[:10]
        if date_str:
            grouped_data[date_str].append(article)
    return dict(grouped_data)

# Function to process news and generate sentiment scores
def analyze_and_save_sentiment(input_file, output_file, start_date, end_date):
    with open(input_file, 'r') as file:
        raw_data = json.load(file)

    # If data is a list, convert it to a dictionary grouped by date
    if isinstance(raw_data, list):
        data = group_articles_by_date(raw_data)
    elif isinstance(raw_data, dict):
        data = raw_data
    else:
        raise ValueError("Invalid JSON format")

    result = {}
    existing_dates = set(data.keys())

    # Convert date strings to datetime objects
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    # Iterate through all dates in the range
    current_date = start
    while current_date <= end:
        date_str = current_date.strftime("%Y-%m-%d")
        
        if date_str in existing_dates:
            articles = data[date_str]
            texts = []

            for article in articles:
                if 'full_content' in article and article['full_content']:  
                    texts.append(article['full_content'])  # Use full_content if available
                else:
                    texts.append(article['title'] + " " + article['description'])  # Use title + description if no full_content
            
            result[date_str] = calculate_sentiment(texts)
        else:
            result[date_str] = [0, 0, 0, 0, 1]  # Neutral score for missing days
        
        current_date += timedelta(days=1)  # Move to the next day

    # Save the results in sorted order
    sorted_result = dict(sorted(result.items()))
    
    with open(output_file, 'w') as f:
        json.dump(sorted_result, f, indent=2)

# Example run: score every day in the range below for the Infosys dataset
start_date = "2023-04-01"
end_date = "2024-04-01"

input_json_file = 'combined/infosys_combined.json'  # Point this at your own JSON file
output_json_file = 'daily_scores_infosys.json'

analyze_and_save_sentiment(
    input_file=input_json_file,
    output_file=output_json_file,
    start_date=start_date,
    end_date=end_date,
)



