In [None]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# Register tqdm's pandas integration so `progress_apply` is available below.
tqdm.pandas()

# Prefer the GPU when torch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using:", device)

# Hugging Face checkpoints backing the two transformer-based scorers.
FINBERT_CHECKPOINT = "ProsusAI/finbert"
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment"

# Each scorer needs its own tokenizer/model pair loaded from the same checkpoint.
finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)

# Single VADER analyzer instance, reused by `vader_scaled` for every row.
vader = SentimentIntensityAnalyzer()

# Scoring function
def run_model_batched(texts, tokenizer, model, label_map, batch_size=32):
    """Classify `texts` in fixed-size batches and return one mapped score per text.

    Args:
        texts: Sliceable sequence of inputs; each slice is passed straight to
            `tokenizer` (callers in this notebook pass a list of strings).
        tokenizer: Callable producing a dict of tensors for a batch
            (invoked with truncation, padding and max_length=512).
        model: Sequence-classification model whose output exposes `.logits`.
        label_map: Mapping from predicted class index to the value to return.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A list with one `label_map` value per input text, in input order.
    """
    # Inference only: switch off training-mode layers and place the model on
    # the module-level `device` before the first batch.
    model.eval()
    model.to(device)

    scores = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Batches"):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # The input tensors must live on the same device as the model.
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # No gradients are needed for prediction.
        with torch.no_grad():
            class_probs = F.softmax(model(**encoded).logits, dim=1)

        predicted_ids = class_probs.argmax(dim=1).tolist()
        scores += [label_map[class_id] for class_id in predicted_ids]

    return scores

def vader_scaled(text):
    """Bucket VADER's compound score for `text` into 0.0, 2.5 or 5.0.

    Returns 0.0 when the compound score is <= -0.05, 5.0 when it is >= 0.05,
    and 2.5 for anything strictly in between.
    """
    compound = vader.polarity_scores(text)['compound']
    if compound <= -0.05:
        return 0.0
    if compound >= 0.05:
        return 5.0
    return 2.5


Using: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Setup for model outputs: every model's predicted class index is mapped onto
# a shared 0-5 scale (0.0 = negative, 2.5 = neutral, 5.0 = positive).
LABEL_SCORES = {"negative": 0.0, "neutral": 2.5, "positive": 5.0}

# ProsusAI/finbert orders its classes 0=positive, 1=negative, 2=neutral, so a
# hard-coded negative/neutral/positive map would mislabel every row. Build the
# map from the checkpoint's own id2label; an unexpected label name raises a
# KeyError instead of silently producing wrong scores.
finbert_map = {
    int(class_id): LABEL_SCORES[label.lower()]
    for class_id, label in finbert_model.config.id2label.items()
}

# cardiffnlp/twitter-roberta-base-sentiment only exposes generic LABEL_0..2
# names; per its model card the order is 0=negative, 1=neutral, 2=positive.
roberta_map = {0: 0.0, 1: 2.5, 2: 5.0}

stocks = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA"]
for stock in stocks:
    print(f"\n 🚀 Processing {stock}...")

    # Load
    df = pd.read_csv(f"{stock}_finbert_input.csv")

    # Materialize the text column once; both transformer scorers reuse it.
    texts = df["string_input"].tolist()

    # VADER
    print("→ Running VADER...")
    df["sentiment_vader"] = df["string_input"].progress_apply(vader_scaled)

    # FinBERT
    print("→ Running FinBERT...")
    df["sentiment_finbert"] = run_model_batched(texts, finbert_tokenizer, finbert_model, finbert_map)

    # RoBERTa
    print("→ Running RoBERTa...")
    df["sentiment_roberta"] = run_model_batched(texts, roberta_tokenizer, roberta_model, roberta_map)

    # Save per stock: only the row identifiers and the three sentiment columns.
    df_out = df[["index", "Date", "sentiment_vader", "sentiment_finbert", "sentiment_roberta"]]
    df_out.to_csv(f"{stock}_sentiment_output.csv", index=False)
    print(f"💾 Saved {stock}_sentiment_output.csv")



🚀 Processing AAPL...
→ Running VADER...


100%|██████████| 104345/104345 [09:16<00:00, 187.36it/s]


→ Running FinBERT...


Batches: 100%|██████████| 3261/3261 [04:45<00:00, 11.43it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 3261/3261 [04:45<00:00, 11.43it/s]


💾 Saved AAPL_sentiment_output.csv

🚀 Processing MSFT...
→ Running VADER...


100%|██████████| 45110/45110 [04:33<00:00, 164.78it/s]  


→ Running FinBERT...


Batches: 100%|██████████| 1410/1410 [02:18<00:00, 10.16it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 1410/1410 [02:18<00:00, 10.22it/s]


💾 Saved MSFT_sentiment_output.csv

🚀 Processing GOOGL...
→ Running VADER...


100%|██████████| 60794/60794 [05:48<00:00, 174.58it/s]


→ Running FinBERT...


Batches: 100%|██████████| 1900/1900 [02:49<00:00, 11.19it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 1900/1900 [02:49<00:00, 11.23it/s]


💾 Saved GOOGL_sentiment_output.csv

🚀 Processing AMZN...
→ Running VADER...


100%|██████████| 48957/48957 [09:44<00:00, 83.73it/s]   


→ Running FinBERT...


Batches: 100%|██████████| 1530/1530 [02:52<00:00,  8.86it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 1530/1530 [02:51<00:00,  8.94it/s]


💾 Saved AMZN_sentiment_output.csv

🚀 Processing NVDA...
→ Running VADER...


100%|██████████| 19223/19223 [03:38<00:00, 88.06it/s]  


→ Running FinBERT...


Batches: 100%|██████████| 601/601 [01:38<00:00,  6.08it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 601/601 [01:37<00:00,  6.18it/s]


💾 Saved NVDA_sentiment_output.csv

🚀 Processing META...
→ Running VADER...


100%|██████████| 37873/37873 [05:33<00:00, 113.44it/s]


→ Running FinBERT...


Batches: 100%|██████████| 1184/1184 [01:48<00:00, 10.90it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 1184/1184 [01:48<00:00, 10.91it/s]


💾 Saved META_sentiment_output.csv

🚀 Processing TSLA...
→ Running VADER...


100%|██████████| 41017/41017 [07:07<00:00, 96.02it/s]   


→ Running FinBERT...


Batches: 100%|██████████| 1282/1282 [02:53<00:00,  7.39it/s]


→ Running RoBERTa...


Batches: 100%|██████████| 1282/1282 [02:51<00:00,  7.48it/s]

💾 Saved TSLA_sentiment_output.csv





In [None]:
from google.colab import files
# Trigger a browser download for each per-stock result file, one call per file.
for output_csv in (
    "AAPL_sentiment_output.csv",
    "MSFT_sentiment_output.csv",
    "AMZN_sentiment_output.csv",
    "GOOGL_sentiment_output.csv",
    "META_sentiment_output.csv",
    "NVDA_sentiment_output.csv",
    "TSLA_sentiment_output.csv",
):
    files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>