# Extracting the raw sentiment scores

In [None]:
import pandas as pd
import torch
import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import matplotlib.pyplot as plt
import os

In [None]:
# Pick the fastest available torch backend: a CUDA GPU first, then Apple's
# MPS backend, and finally the CPU as the universal fallback.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
DEVICE  # bare expression so the notebook cell displays the chosen device

In [None]:
# Fetch the CryptoBERT tokenizer and sequence-classification model
_CHECKPOINT = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)

# Wrap both in a sentiment-analysis pipeline that runs on DEVICE
cryptobert_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
)

# Smoke test: score three hand-written posts and print every label's score
post_1 = " see y'all tomorrow and can't wait to see ada in the morning, i wonder what price it is going to be at. 😎🐂🤠💯😴, bitcoin is looking good go for it and flash by that 45k. "
post_2 = "  alright racers, it’s a race to the bottom! good luck today and remember there are no losers (minus those who invested in currency nobody really uses) take your marks... are you ready? go!!"
post_3 = " i'm never selling. the whole market can bottom out. i'll continue to hold this dumpster fire until the day i die if i need to."
df_posts = [post_1, post_2, post_3]
preds = cryptobert_pipeline(df_posts, return_all_scores=True)
print(preds)

In [None]:
# Functions for truncating if the message is too long for CryptoBERT

# Upper bound on the encoded sequence length fed to the model.
# NOTE(review): assumed to be the model's hard limit including special
# tokens — confirm against the CryptoBERT model card/config.
MAX_TOKENS = 512


def truncate_text(text):
    """Shorten ``text`` so that its encoded form fits within ``MAX_TOKENS``.

    The text is tokenized with the module-level ``tokenizer``. If it is
    already short enough it is returned unchanged; otherwise the leading
    tokens that fit are converted back into a string.

    Args:
        text: The raw message to check.

    Returns:
        ``text`` itself when it fits, else a string rebuilt from the first
        tokens that fit in the budget.
    """
    # ``tokenize`` returns content tokens only; the special tokens the
    # tokenizer adds when encoding a single sequence also count against the
    # limit, so reserve room for them.
    budget = MAX_TOKENS - tokenizer.num_special_tokens_to_add(pair=False)

    tokens = tokenizer.tokenize(text)
    if len(tokens) <= budget:
        return text
    return tokenizer.convert_tokens_to_string(tokens[:budget])

def batched_sentiment_pipeline(texts, batch_size=16):
    """Score ``texts`` with the CryptoBERT pipeline in fixed-size batches.

    Each batch is truncated with ``truncate_text`` and passed to
    ``cryptobert_pipeline``. A batch that raises is reported on stdout and
    replaced by zero-score placeholders so the output stays aligned with
    the input.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts sent to the pipeline per call.

    Returns:
        A list with one entry per input text; each entry is a list of
        ``{'label': ..., 'score': ...}`` dicts (three placeholder dicts with
        score 0 for texts in a failed batch).
    """
    results = []
    # ``tqdm`` is imported as a module in this file, so the progress-bar
    # class must be reached as ``tqdm.tqdm``.
    for i in tqdm.tqdm(range(0, len(texts), batch_size)):
        batch = [truncate_text(x) for x in texts[i:i + batch_size]]

        try:
            # ``return_all_scores=True`` is kept deliberately: callers index
            # the scores by position, which relies on label order.
            # ``truncation``/``max_length`` are forwarded to the tokenizer as
            # a safety net, because re-tokenizing a manually truncated string
            # is not guaranteed to give the same token count.
            output = cryptobert_pipeline(
                batch,
                return_all_scores=True,
                truncation=True,
                max_length=MAX_TOKENS,
            )
        except Exception as e:
            print(f"Error at batch {i}: {e}")
            # Build a fresh placeholder per text; list multiplication would
            # make every row share the same list object.
            output = [
                [{'label': f'LABEL_{k}', 'score': 0} for k in range(3)]
                for _ in batch
            ]
        results.extend(output)
    return results


In [None]:
# Extracting the sentiment scores for each comment

# Only CSV files are processed (consistent with the post-processing cell);
# stray directory entries such as .DS_Store would make read_csv fail.
comment_df_paths = [f for f in os.listdir('./comments/') if f.endswith('.csv')]

# Create the output directory up front. Previously the first file's results
# were dropped because the "directory missing" branch only created the
# directory and never wrote the CSV.
os.makedirs('./comments_with_sentiment', exist_ok=True)

# ``tqdm`` is imported as a module in this file, hence ``tqdm.tqdm``.
for comment_df_path in tqdm.tqdm(comment_df_paths, desc="Files"):
    comment_df = pd.read_csv(f'./comments/{comment_df_path}')
    comment_df.dropna(subset=['text'], inplace=True)

    texts = comment_df['text'].tolist()
    sentiments = batched_sentiment_pipeline(texts)

    # Store the raw pipeline output plus one column per class score.
    # Scores are read by position: index 0 -> negative, 1 -> neutral,
    # 2 -> positive.
    comment_df['sentiment'] = sentiments
    comment_df['positive_score'] = [s[2]['score'] for s in sentiments]
    comment_df['negative_score'] = [s[0]['score'] for s in sentiments]
    comment_df['neutral_score'] = [s[1]['score'] for s in sentiments]

    comment_df.to_csv(f'./comments_with_sentiment/{comment_df_path}', index=False)


In [None]:
# Extracting the sentiment for each post

post_df_paths = [f for f in os.listdir('./filtered_dfs/') if f.endswith('.csv')]
post_dfs = []

# ``tqdm`` is imported as a module in this file, hence ``tqdm.tqdm``.
for post_df_path in tqdm.tqdm(post_df_paths, desc="Processing post files"):
    post_df = pd.read_csv(f'./filtered_dfs/{post_df_path}')
    post_df.dropna(subset=['title'], inplace=True)
    post_df.drop(columns=['contains_keyword'], inplace=True, errors='ignore')

    # The subreddit is taken from the second "_"-separated part of the file
    # name, except for wallstreetbets files, which are matched by substring.
    # NOTE(review): assumes every other file name contains an underscore —
    # confirm against the files in ./filtered_dfs/.
    if 'wallstreetbets' in post_df_path:
        subreddit = 'wallstreetbets'
    else:
        subreddit = post_df_path.split("_")[1]
    post_df['subreddit'] = subreddit

    titles = post_df['title'].tolist()
    sentiments = batched_sentiment_pipeline(titles)

    # Store the raw pipeline output plus one column per class score.
    # Scores are read by position: index 0 -> negative, 1 -> neutral,
    # 2 -> positive.
    post_df['sentiment'] = sentiments
    post_df['positive_score'] = [s[2]['score'] for s in sentiments]
    post_df['negative_score'] = [s[0]['score'] for s in sentiments]
    post_df['neutral_score'] = [s[1]['score'] for s in sentiments]

    post_dfs.append(post_df)

# Merge all post DataFrames and write them out as a single CSV
if post_dfs:
    # ignore_index=True already yields a fresh 0..n-1 index.
    merged_post_df = pd.concat(post_dfs, ignore_index=True)
    os.makedirs('./posts_with_sentiment', exist_ok=True)
    merged_post_df.to_csv('./posts_with_sentiment/merged_post_df.csv', index=False)
else:
    # pd.concat raises on an empty list; report the situation instead.
    merged_post_df = pd.DataFrame()
    print("No post CSV files found in ./filtered_dfs/; nothing to merge.")