In [None]:
# Mount Google Drive at /content/drive. The data and output paths defined later in this
# notebook live under /content/drive/MyDrive, so this must run first.
# The google.colab import is only available inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import getpass
import glob
import json
import os
import time

import pandas as pd
from openai import OpenAI


In [2]:
# Resolve the OpenAI API key without hardcoding it in the notebook: prefer the
# OPENAI_API_KEY environment variable, otherwise prompt for it. getpass does not echo
# the input, so the key never ends up in the saved notebook or its cell outputs.
api_key = os.environ.get("OPENAI_API_KEY", "").strip() or getpass.getpass("OpenAI API key: ").strip()
if not api_key:
    # Fail fast here rather than letting every request in the labeling loop fail
    # with an authentication error that is only printed and skipped.
    raise ValueError("An OpenAI API key is required: set OPENAI_API_KEY or enter it at the prompt.")
client = OpenAI(api_key=api_key)


In [3]:
# Define paths
data_path = '/content/drive/MyDrive/IS450 Project/Historical Reddit Data/FinBERT_Data/'
output_csv_path = "/content/drive/MyDrive/IS450 Project/Historical Reddit Data/Processed Posts/golden_dataset_sentiment.csv"

# Load or initialize the output CSV. `processed_ids` holds the ids that already have a
# row in the CSV so the labeling loop can skip them when the notebook is re-run.
# A zero-byte file (e.g. left behind by an interrupted first run) is treated as missing.
if os.path.exists(output_csv_path) and os.path.getsize(output_csv_path) > 0:
    # Read ids as strings. With pandas' dtype inference an all-digit id column comes
    # back as integers, which never compare equal to string ids from the JSON input,
    # so already-labeled posts would be sent to the API again.
    # NOTE(review): assumes the "id" values in the input JSON are strings - confirm.
    df_existing = pd.read_csv(output_csv_path, dtype={"id": str})
    processed_ids = set(df_existing["id"].dropna())
else:
    # Make sure the output folder exists before writing the header-only CSV.
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
    pd.DataFrame(columns=["id", "subreddit", "sentiment"]).to_csv(output_csv_path, index=False)
    processed_ids = set()


In [None]:
# Discover the input JSON files (one finbert_r_<name>.json per subreddit).
# glob returns matches in an arbitrary, filesystem-dependent order; sorting makes the
# processing order (and therefore the log and the CSV row order) the same on every run.
file_paths = sorted(glob.glob(os.path.join(data_path, "finbert_r_*.json")))
print(f"Found {len(file_paths)} files.")


In [None]:
# Label every not-yet-processed post with a 1-5 sentiment score from the OpenAI chat API.
# Each accepted result is appended to the output CSV immediately, so an interrupted run
# keeps what it has already written and can be resumed by re-running this cell.

_FILE_PREFIX = "finbert_r_"            # input files are named finbert_r_<subreddit>.json
_VALID_SCORES = {"1", "2", "3", "4", "5"}
_REQUEST_DELAY_SECONDS = 1             # pause after every API attempt


def _subreddit_from_path(path):
    """Return the subreddit encoded in a ``finbert_r_<subreddit>.json`` file name.

    Everything after the ``finbert_r_`` prefix is kept, so a name that itself contains
    underscores (e.g. ``personal_finance``) is not truncated to its last segment.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.startswith(_FILE_PREFIX):
        return stem[len(_FILE_PREFIX):]
    return stem


def _parse_sentiment(raw):
    """Return the model reply as an int in 1..5, or None if it is not a single valid score.

    Surrounding whitespace, punctuation or words are tolerated ("4.", "Score: 4"), but
    the reply must contain exactly one digit and that digit must be 1-5.
    """
    if not raw:
        return None
    digits = [ch for ch in raw if ch.isdigit()]
    if len(digits) == 1 and digits[0] in _VALID_SCORES:
        return int(digits[0])
    return None


total_processed = 0

for file in file_paths:
    subreddit = _subreddit_from_path(file)
    # Explicit encoding: do not depend on the platform's default text encoding
    # when reading the JSON input.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for index, row in enumerate(data):
        raw_id = row.get("id")
        # Posts without an id get a positional fallback that includes the subreddit.
        # A bare "post_<index>" would collide between files, and every id-less post in
        # a later file would be skipped as "already processed".
        post_id = str(raw_id) if raw_id not in (None, "") else f"{subreddit}_post_{index}"
        if post_id in processed_ids:
            continue

        text = row.get("processed_text_finbert") or ""
        if not str(text).strip():
            # Nothing to analyze: do not spend an API call on an empty prompt.
            print(f"Skipping {post_id}: no text to analyze")
            continue

        prompt = (
            "Analyze the sentiment of the following financial post. "
            "Respond with a single number from 1 to 5, where: "
            "1 means very negative, 2 means negative, 3 means neutral, "
            "4 means positive, and 5 means very positive. "
            "Only output the number. Do not include any extra text, punctuation, or explanation.\n\n"
            f"{text}\n\n"
        )

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                # Deterministic decoding so re-labeling the same post gives the same score.
                temperature=0,
                messages=[
                    # The system message now asks for a number, matching the user prompt
                    # (it previously asked for a "single word", contradicting it).
                    {"role": "system", "content": "You are a helpful assistant. Only respond with a single number as specified."},
                    {"role": "user", "content": prompt}
                ]
            )
            raw_reply = response.choices[0].message.content  # may be None
            sentiment = _parse_sentiment(raw_reply)
            if sentiment is None:
                # Keep invalid labels out of the dataset. The post is not marked as
                # processed, so it will be retried on the next run.
                print(f"Skipping {post_id}: unexpected model reply {raw_reply!r}")
                continue

            pd.DataFrame([{
                "id": post_id,
                "subreddit": subreddit,
                "sentiment": sentiment
            }]).to_csv(output_csv_path, mode='a', header=False, index=False)

            processed_ids.add(post_id)
            total_processed += 1
            print(f"Processed {post_id}: {sentiment}")

        except Exception as e:
            # Best effort: report the failure and move on; the post stays unprocessed
            # and is picked up again on the next run.
            print(f"Error processing {post_id}: {e}")

        finally:
            # Throttle after every API attempt, including failed ones, so that errors
            # such as rate limiting do not turn into a tight retry loop.
            time.sleep(_REQUEST_DELAY_SECONDS)

print(f"\nAll done! Total posts processed and saved: {total_processed}")
