# RoBERTa Sentiment Analysis

This notebook adds RoBERTa sentiment analysis to the existing VADER and FinBERT results.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the posts file (it already carries the VADER and FinBERT scores)
# and convert the 'date' column to datetime in the same expression.
df_truth = (
    pd.read_csv('data/truth_social_with_sentiment.csv')
    .assign(date=lambda posts: pd.to_datetime(posts['date']))
)

print(f"{len(df_truth)} posts")

18778 posts


In [3]:
# Checkpoint shared by the classifier and its tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Sequence-classification model plus the tokenizer that matches it
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Put the model in evaluation mode; as the last expression this also
# displays the module structure as the cell output.
model.eval()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [4]:
# Function to get RoBERTa sentiment for one text
def get_roberta_sentiment(text):
    """Score a single text with the RoBERTa sentiment model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, so the
    model-loading cell must have been run first.

    Parameters
    ----------
    text : str or missing value
        The text to score. Input longer than 512 tokens is truncated.

    Returns
    -------
    float
        Positive probability minus negative probability, so the score lies
        between -1 and +1. Missing (None/NaN), empty, and whitespace-only
        text returns 0.0 without running the model.
    """
    # Nothing to score: missing values, empty strings, and strings made up
    # only of whitespace (which would otherwise reach the model and get an
    # arbitrary score for a contentless input).
    if pd.isna(text) or (isinstance(text, str) and not text.strip()):
        return 0.0

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        outputs = model(**inputs)
        # Softmax over the class dimension; [0] selects the single input row
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

    # RoBERTa labels: negative=0, neutral=1, positive=2
    negative_prob = probabilities[0].item()
    positive_prob = probabilities[2].item()

    # Return score: positive - negative (range: -1 to +1)
    return positive_prob - negative_prob

# Score every post with RoBERTa, one text at a time
print("Calculating RoBERTa sentiment scores")
total_posts = len(df_truth)
roberta_scores = []

for posts_done, post_text in enumerate(df_truth['cleaned_content'], start=1):
    roberta_scores.append(get_roberta_sentiment(post_text))

    # Progress message after each block of 1000 posts
    if posts_done % 1000 == 0:
        print(f"Processed {posts_done} posts...")

# Store the scores next to the existing VADER and FinBERT columns
df_truth['roberta_sentiment'] = roberta_scores
roberta_column = df_truth['roberta_sentiment']

print(f"Sentiment score range: {roberta_column.min():.3f} to {roberta_column.max():.3f}")
print(f"Average sentiment: {roberta_column.mean():.3f}")

# Preview the three models side by side for the first ten posts
df_truth[['cleaned_content', 'vader_sentiment', 'finbert_sentiment', 'roberta_sentiment']].head(10)

Calculating RoBERTa sentiment scores
Processed 1000 posts...
Processed 2000 posts...
Processed 3000 posts...
Processed 4000 posts...
Processed 5000 posts...
Processed 6000 posts...
Processed 7000 posts...
Processed 8000 posts...
Processed 9000 posts...
Processed 10000 posts...
Processed 11000 posts...
Processed 12000 posts...
Processed 13000 posts...
Processed 14000 posts...
Processed 15000 posts...
Processed 16000 posts...
Processed 17000 posts...
Processed 18000 posts...
Sentiment score range: -0.959 to 0.989
Average sentiment: 0.020


Unnamed: 0,cleaned_content,vader_sentiment,finbert_sentiment,roberta_sentiment
0,"I am on my way to Malaysia, where I will sign ...",0.9682,0.080414,-0.053873
1,"RT @realDonaldTrumpCanada was caught, red hand...",-0.8329,-0.353714,-0.709411
2,"Canada was caught, red handed, putting up a fr...",-0.8329,-0.300146,-0.710382
3,We have a very strong PEACE in the Middle East...,0.9074,0.027445,0.668038
4,Congressman Jimmy Patronis is a MAGA Warrior w...,0.9643,0.15483,0.968299
5,"Richard Hudson is a Great Man, and TREMENDOUS ...",0.9905,0.394976,0.965736
6,Congressman David Rouzer is a terrific Represe...,0.98,0.239225,0.958171
7,Congressman Addison McDowell is an America Fir...,0.9907,0.200672,0.967453
8,Congresswoman Nicole Malliotakis is a Tremendo...,0.9922,0.254401,0.965822
9,Congressman Jack Bergman is a Tremendous Champ...,0.9826,0.238671,0.96903


In [5]:
# Read the combined daily table (market data plus existing sentiment columns)
combined_df = pd.read_csv('data/combined_data_with_sentiment.csv')

# The join key must be datetime in both frames
for frame in (combined_df, df_truth):
    frame['date'] = pd.to_datetime(frame['date'])

# Mean RoBERTa score for each date that has posts
daily_roberta = (
    df_truth.groupby('date')['roberta_sentiment']
    .mean()
    .rename('avg_roberta_sentiment')
    .reset_index()
)

# Left join keeps every row of the combined table; dates with no posts
# come back as NaN and are set to 0.
combined_df = combined_df.merge(daily_roberta, on='date', how='left')
filled_roberta = combined_df['avg_roberta_sentiment'].fillna(0)
combined_df['avg_roberta_sentiment'] = filled_roberta

daily_values = daily_roberta['avg_roberta_sentiment']
print(f"Daily RoBERTa sentiment calculated for {len(daily_roberta)} days")
print(f"Average daily RoBERTa sentiment range: {daily_values.min():.3f} to {daily_values.max():.3f}")
combined_df[['date', 'post_count', 'avg_sentiment', 'avg_finbert_sentiment', 'avg_roberta_sentiment', 'Returns']].head(10)


Daily RoBERTa sentiment calculated for 1269 days
Average daily RoBERTa sentiment range: -0.878 to 0.981


Unnamed: 0,date,post_count,avg_sentiment,avg_finbert_sentiment,avg_roberta_sentiment,Returns
0,2022-01-03,0.0,0.0,0.0,0.0,
1,2022-01-04,0.0,0.0,0.0,0.0,-1.29705
2,2022-01-05,0.0,0.0,0.0,0.0,-3.072096
3,2022-01-06,0.0,0.0,0.0,0.0,-0.070266
4,2022-01-07,0.0,0.0,0.0,0.0,-1.083299
5,2022-01-10,0.0,0.0,0.0,0.0,0.065836
6,2022-01-11,0.0,0.0,0.0,0.0,1.502188
7,2022-01-12,0.0,0.0,0.0,0.0,0.396576
8,2022-01-13,0.0,0.0,0.0,0.0,-2.501625
9,2022-01-14,0.0,0.0,0.0,0.0,0.622266


In [6]:
# Daily series used in every comparison below
daily_returns = combined_df['Returns']
daily_vader = combined_df['avg_sentiment']
daily_finbert = combined_df['avg_finbert_sentiment']
daily_roberta_avg = combined_df['avg_roberta_sentiment']

# Correlation of each model's daily average with QQQ returns
vader_corr = daily_vader.corr(daily_returns)
finbert_corr = daily_finbert.corr(daily_returns)
roberta_corr = daily_roberta_avg.corr(daily_returns)

print("Correlation with QQQ Returns:")
for label, value in (
    ("VADER", vader_corr),
    ("FinBERT", finbert_corr),
    ("RoBERTa", roberta_corr),
):
    print(f"{label}: {round(value, 4)}")

# Pairwise correlation between the models' daily averages
vader_finbert = daily_vader.corr(daily_finbert)
vader_roberta = daily_vader.corr(daily_roberta_avg)
finbert_roberta = daily_finbert.corr(daily_roberta_avg)

print("\nCorrelation between models: **Higher = models agree more**")
for label, value in (
    ("VADER vs FinBERT", vader_finbert),
    ("VADER vs RoBERTa", vader_roberta),
    ("FinBERT vs RoBERTa", finbert_roberta),
):
    print(f"{label}: {round(value, 4)}")

Correlation with QQQ Returns:
VADER: 0.0329
FinBERT: -0.0162
RoBERTa: -0.0097

Correlation between models: **Higher = models agree more**
VADER vs FinBERT: 0.6467
VADER vs RoBERTa: 0.7563
FinBERT vs RoBERTa: 0.8149


In [7]:
# Write both updated tables to new CSV files, without the index column
for frame, output_path in (
    (df_truth, 'data/truth_social_after_roberta.csv'),
    (combined_df, 'data/combined_data_after_roberta.csv'),
):
    frame.to_csv(output_path, index=False)