In [None]:
!pip install "protobuf<4.25.0" --force-reinstall --quiet

In [None]:
import glob
import json
import os
import re
import urllib.request

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- CONFIGURATION ---
# UPDATE THIS PATH to match where your zip file was unpacked in Kaggle
# usually: /kaggle/input/your-dataset-name/
INPUT_ROOT_DIR = '/kaggle/input/eth-reddit-posts-hourly/eth_reddit_daily_batches'

# Directory that receives one results CSV per processed date batch.
OUTPUT_DIR = '/kaggle/working/batch_results'
# ---------------------

# Create the output directory. exist_ok=True avoids the check-then-create
# race and makes re-running this cell a no-op when the directory is present.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Load Model (Standard)
def load_model_from_hgf(model_name: str):
    """Load a sequence-classification model, its tokenizer and its labels.

    The label names are taken from the configuration that
    ``from_pretrained`` has already loaded, so no second download of
    ``config.json`` is needed and the function is not tied to a particular
    repository branch.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model, labels)`` tuple where ``labels`` is the list
        of label names ordered by their numeric class id.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Order by numeric id; int() makes this work whether the mapping keys
    # are ints or their string form.
    id2label = model.config.id2label
    labels = [
        label
        for _, label in sorted(id2label.items(), key=lambda item: int(item[0]))
    ]
    return tokenizer, model, labels

# 2. Preprocessing & Sentiment Functions (Standard)
def preprocess(text):
    """Return ``text`` with user mentions and links replaced by placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts
    with ``http`` becomes ``http``. Tokens are re-joined with single spaces,
    so the original spacing is preserved.
    """
    def _mask(token):
        # Mention check first, link check second (same order as applied
        # to every token).
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def get_sentiment_scores(text, tokenizer, model, labels) -> dict:
    """Score a single text and return a label -> probability mapping.

    The text is normalised with ``preprocess``, tokenised with truncation
    enabled, passed through ``model``, and the first row of the first model
    output is turned into probabilities with a softmax.

    Args:
        text: The raw text to score.
        tokenizer: Callable that encodes the text into model inputs
            (called with ``return_tensors='pt'``).
        model: Callable that accepts the encoded inputs as keyword
            arguments.
        labels: Label names indexed by class id.

    Returns:
        A dict mapping each label to its probability rounded to 4 decimals,
        inserted in order of decreasing probability.
    """
    encoded_input = tokenizer(preprocess(text), return_tensors='pt', truncation=True)

    # Inference only: disabling gradient tracking avoids building an
    # autograd graph for every scored text.
    with torch.no_grad():
        output = model(**encoded_input)

    scores = softmax(output[0][0].detach().numpy())
    ranking = np.argsort(scores)[::-1]  # class ids, most probable first
    return {labels[idx]: np.round(float(scores[idx]), 4) for idx in ranking}

def _positive_metrics(positive: pd.Series, weights: pd.Series, threshold: float):
    """Return (mean, weighted mean, share above ``threshold``) of ``positive``."""
    mean_positive = positive.sum() / positive.size
    weighted_positive = (weights * positive).sum() / weights.sum()
    share_above = int((positive > threshold).sum()) / positive.size
    return mean_positive, weighted_positive, share_above


def process_data_and_aggregate_sentiment(
    table: pd.DataFrame,
    tokenizer,
    model,
    labels,
    prefix: str = "BTC price outlook: ",
    positive_threshold: float = 0.8,
):
    """Score every post in ``table`` and aggregate the positive sentiment.

    Each ``selftext`` value is scored twice with ``get_sentiment_scores``:
    once as-is and once with ``prefix`` prepended. ``table`` is not modified.

    Args:
        table: Posts to score; must contain ``selftext`` and ``score``
            columns.
        tokenizer: Forwarded to ``get_sentiment_scores``.
        model: Forwarded to ``get_sentiment_scores``.
        labels: Forwarded to ``get_sentiment_scores``; the resulting scores
            must include a ``'positive'`` entry.
        prefix: Text prepended to each post for the "prefix" variant.
        positive_threshold: A post counts as strongly positive when its
            positive probability is strictly above this value.

    Returns:
        ``None`` when ``table`` is empty, otherwise the tuple
        ``(b_b1, b_b1_prefix, b_b2, b_b2_prefix, b_a4, b_a4_prefix)`` where
        ``b_b1`` is the mean positive probability, ``b_b2`` the mean weighted
        by ``score`` and ``b_a4`` the share of posts above
        ``positive_threshold``; the ``_prefix`` values are the same metrics
        for the prefixed texts.
    """
    # Check if empty (some files might be empty after cleaning)
    if table.empty:
        return None

    # NOTE(review): astype(str) turns a missing selftext into the literal
    # string "nan", which is then scored like any other text - confirm this
    # is intended.
    texts = table['selftext'].astype(str)

    # NOTE(review): the default prefix mentions BTC while the input path of
    # this notebook points at ETH posts - confirm the prefix is intentional.
    plain_scores = [
        get_sentiment_scores(text, tokenizer, model, labels) for text in texts
    ]
    prefixed_scores = [
        get_sentiment_scores(prefix + text, tokenizer, model, labels) for text in texts
    ]

    # Share the table's index so the element-wise products line up row by row.
    positive = pd.Series(
        [scores['positive'] for scores in plain_scores], index=table.index, dtype=float
    )
    positive_prefix = pd.Series(
        [scores['positive'] for scores in prefixed_scores], index=table.index, dtype=float
    )

    # Posts whose score is not greater than zero get a weight of 1, so every
    # post contributes to the weighted mean and the weight sum is non-zero.
    weights = table['score'].where(table['score'] > 0, 1)

    b_b1, b_b2, b_a4 = _positive_metrics(positive, weights, positive_threshold)
    b_b1_prefix, b_b2_prefix, b_a4_prefix = _positive_metrics(
        positive_prefix, weights, positive_threshold
    )

    return b_b1, b_b1_prefix, b_b2, b_b2_prefix, b_a4, b_a4_prefix

# --- MAIN EXECUTION ---

# The model is loaded a single time and reused for every batch.
print("Loading Model...")
tokenizer, model, labels = load_model_from_hgf(
    "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
)

# Collect the DATE FOLDERS (batches), in sorted path order.
# Expected layout: INPUT_ROOT_DIR / YYYY-MM-DD / files...
date_folders = sorted(
    entry.path for entry in os.scandir(INPUT_ROOT_DIR) if entry.is_dir()
)

print(f"Found {len(date_folders)} date batches to process.")

# Compiled once up front: every file name is matched against this pattern
# to extract the 4-digit time part (e.g. ..._1129.csv).
time_pattern = re.compile(r'_(\d{4})\.csv')

for folder_path in date_folders:
    date_name = os.path.basename(folder_path) # e.g., "2025-12-07"

    # 1. DEFINE BATCH OUTPUT FILENAME
    batch_output_file = os.path.join(OUTPUT_DIR, f"results_{date_name}.csv")

    # 2. CHECK IF ALREADY DONE (Resume Capability)
    if os.path.exists(batch_output_file):
        print(f"Batch {date_name} already exists. Skipping...")
        continue

    print(f"Processing Batch: {date_name}")

    batch_results = [] # Use list for speed

    # Get all csv files in this specific date folder; sorted so the
    # processing order is the same on every run.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

    for filepath in csv_files:
        filename = os.path.basename(filepath)

        # Extract Time from filename (e.g. ..._1129.csv)
        # We already know the Date from the folder name
        time_match = time_pattern.search(filename)
        time_part = time_match.group(1) if time_match else "Unknown"

        # Best effort per file: a file that cannot be read or scored is
        # reported and skipped so the rest of the batch still completes.
        try:
            df = pd.read_csv(filepath)
            metrics = process_data_and_aggregate_sentiment(df, tokenizer, model, labels)
        except Exception as e:
            print(f"Error in {filename}: {e}")
            continue

        if metrics is None: # Empty input file, nothing to record
            continue

        b_b1, b_b1_prefix, b_b2, b_b2_prefix, b_a4, b_a4_prefix = metrics

        batch_results.append({
            'date': date_name,
            'time': time_part,
            'file': filename,
            'b_b1': b_b1,
            'b_b1_prefix': b_b1_prefix,
            'b_b2': b_b2,
            'b_b2_prefix': b_b2_prefix,
            'b_a4': b_a4,
            'b_a4_prefix': b_a4_prefix
        })

    # 3. SAVE THIS BATCH IMMEDIATELY
    if batch_results:
        batch_df = pd.DataFrame(batch_results)
        batch_df = batch_df.sort_values(by='time')
        # Write to a temporary name and move it into place afterwards: the
        # resume check above only ever sees complete files, even if the run
        # is interrupted in the middle of a write.
        tmp_output_file = batch_output_file + ".tmp"
        batch_df.to_csv(tmp_output_file, index=False)
        os.replace(tmp_output_file, batch_output_file)
        print(f"Saved {len(batch_results)} records to {batch_output_file}")
    else:
        print(f"No valid results for {date_name}")

print("All batches processed.")

# --- FINAL MERGE ---
# Combine all the batch files into one final CSV for convenience
print("Merging all batches...")
all_files = sorted(glob.glob(os.path.join(OUTPUT_DIR, "results_*.csv")))
if all_files:
    # Read 'time' as text: type inference would turn "0905" into 905, and a
    # batch containing "Unknown" would stay text, leaving a mixed int/str
    # column that cannot be sorted.
    combined_df = pd.concat(
        (pd.read_csv(f, dtype={'time': str}) for f in all_files),
        ignore_index=True,
    )
    combined_df = combined_df.sort_values(by=['date', 'time'])
    combined_df.to_csv("eth_reddit_sentiments_FINAL.csv", index=False)
    print("Final merged file saved: eth_reddit_sentiments_FINAL.csv")
else:
    print("No output files found to merge.")