In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import pandas as pd


# Week 2: Financial News Sentiment Analysis using FinBERT

**Project:** Market Mood and Moves (WiDS 5.0)  
**Student:** Pranit Anand  
**Focus:** Domain-Specific Sentiment Modeling for Financial Text  

---

## Notebook Objective

This notebook explores **financial sentiment analysis** using **FinBERT**, a
Transformer-based language model adapted specifically for financial text.

The goal is to understand how **domain-specific contextual embeddings** can be
used to extract sentiment from financial news headlines and how these sentiment
signals can later be integrated into time-series market models.

Rather than training models from scratch, the emphasis is on:
- Understanding **why** FinBERT is used in finance
- Observing how sentiment outputs are generated
- Interpreting probabilistic sentiment scores for downstream analysis


In [2]:
# Hugging Face Hub identifier of the FinBERT checkpoint used in this notebook.
MODEL_NAME = "ProsusAI/finbert"

# Tokenizer and classifier are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# use_safetensors=True is CRITICAL: it avoids the torch.load vulnerability error.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, use_safetensors=True)

# Put the network in evaluation mode (no dropout at inference time). Being the
# cell's last expression, this also displays the module tree.
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Hand-picked sample of 30 headlines, five per theme, used to probe FinBERT.
headlines = [
    # --- Earnings & financial performance ---
    "Apple reports record quarterly earnings driven by strong iPhone sales",
    "Tesla posts weaker-than-expected revenue amid rising production costs",
    "Amazon beats profit expectations despite slowing cloud growth",
    "Netflix shares fall after subscriber growth misses estimates",
    "Microsoft delivers strong guidance for next quarter",
    # --- Corporate actions ---
    "Google announces $70 billion share buyback program",
    "Meta Platforms to cut workforce by 10 percent amid restructuring",
    "IBM acquires AI startup to strengthen enterprise offerings",
    "Disney delays multiple film releases due to production issues",
    "Uber sells autonomous driving unit to focus on core business",
    # --- Regulation & legal ---
    "Apple faces antitrust investigation by European Union regulators",
    "SEC files lawsuit against crypto exchange over compliance violations",
    "Facebook fined for data privacy breaches under EU law",
    "Boeing under investigation following safety concerns",
    "JP Morgan reaches settlement in long-running legal dispute",
    # --- Macroeconomic & market news ---
    "Federal Reserve signals potential rate cuts later this year",
    "Inflation rises slower than expected easing market concerns",
    "Global markets rally after positive US jobs data",
    "Recession fears grow as manufacturing activity contracts",
    "Oil prices surge following supply disruptions",
    # --- Neutral / informational ---
    "Company announces new office location in Singapore",
    "Board approves appointment of new chief financial officer",
    "Annual shareholder meeting scheduled for next month",
    "Firm releases sustainability and ESG performance report",
    "Technology conference highlights future innovation trends",
    # --- Clearly negative financial language ---
    "Company warns of declining margins and higher operating losses",
    "Bank reports sharp increase in loan defaults",
    "Retailer struggles with excess inventory and falling demand",
    "Automaker issues profit warning due to supply chain disruptions",
    "Airline reports heavy losses amid rising fuel costs",
]

# Single-column frame; downstream cells iterate over df["headline"].
df = pd.DataFrame(headlines, columns=["headline"])
df


Unnamed: 0,headline
0,Apple reports record quarterly earnings driven...
1,Tesla posts weaker-than-expected revenue amid ...
2,Amazon beats profit expectations despite slowi...
3,Netflix shares fall after subscriber growth mi...
4,Microsoft delivers strong guidance for next qu...
5,Google announces $70 billion share buyback pro...
6,Meta Platforms to cut workforce by 10 percent ...
7,IBM acquires AI startup to strengthen enterpri...
8,Disney delays multiple film releases due to pr...
9,Uber sells autonomous driving unit to focus on...


In [9]:
def tokenize_text(text, max_length=512):
    """Encode raw text into PyTorch tensors for the FinBERT model.

    Parameters
    ----------
    text : str
        The text to encode. It is handed to the notebook-level ``tokenizer``
        unchanged.
    max_length : int, default 512
        Maximum number of tokens kept per sequence; longer inputs are
        truncated. The default matches the model's 512-token limit.

    Returns
    -------
    The tokenizer's encoded output with ``"pt"`` tensors, suitable for
    unpacking into the model call as ``model(**inputs)``.
    """
    return tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,   # Important: inputs beyond max_length must be cut off
        max_length=max_length,
    )


In [10]:
def finbert_sentiment(text):
    """Score one piece of financial text with the loaded FinBERT model.

    Parameters
    ----------
    text : str
        A single headline or sentence.

    Returns
    -------
    tuple
        ``(predicted_label, result)`` where ``predicted_label`` is the label
        with the highest probability and ``result`` maps every label name
        (lower-cased) to its softmax probability as a float.

    Raises
    ------
    ValueError
        If the input encodes to anything other than exactly one sequence
        (for example a list of several headlines). This function scores one
        text per call.
    """
    inputs = tokenize_text(text)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # A blanket squeeze() would leave a 2-D tensor for multi-text input and
    # pair labels with whole rows; fail loudly instead.
    if logits.shape[0] != 1:
        raise ValueError(
            f"finbert_sentiment expects a single text, got a batch of {logits.shape[0]}"
        )

    probs = F.softmax(logits, dim=1).squeeze(0)

    # Take the class order from the checkpoint's own config instead of a
    # hard-coded list, so swapping MODEL_NAME for a checkpoint with a different
    # class order cannot silently mislabel the scores. Names are lower-cased so
    # the keys read by callers ("positive", "negative", "neutral") do not
    # depend on the checkpoint's capitalisation.
    id2label = model.config.id2label
    labels = [str(id2label[i]).lower() for i in range(probs.shape[0])]

    result = dict(zip(labels, probs.tolist()))
    predicted_label = labels[torch.argmax(probs).item()]

    return predicted_label, result


In [11]:
# Run FinBERT on every headline and collect one flat record per headline.
results = []

for headline in df["headline"]:
    predicted, probabilities = finbert_sentiment(headline)

    # Key insertion order fixes the column order of the resulting frame.
    record = {"headline": headline, "finbert_label": predicted}
    record["positive_prob"] = probabilities["positive"]
    record["negative_prob"] = probabilities["negative"]
    record["neutral_prob"] = probabilities["neutral"]
    results.append(record)

# Build the frame once from the list of records.
finbert_df = pd.DataFrame(results)
finbert_df


Unnamed: 0,headline,finbert_label,positive_prob,negative_prob,neutral_prob
0,Apple reports record quarterly earnings driven...,positive,0.941283,0.032401,0.026317
1,Tesla posts weaker-than-expected revenue amid ...,negative,0.011393,0.971129,0.017478
2,Amazon beats profit expectations despite slowi...,positive,0.946483,0.021871,0.031646
3,Netflix shares fall after subscriber growth mi...,negative,0.008874,0.970031,0.021096
4,Microsoft delivers strong guidance for next qu...,positive,0.94815,0.016135,0.035715
5,Google announces $70 billion share buyback pro...,neutral,0.097096,0.029541,0.873363
6,Meta Platforms to cut workforce by 10 percent ...,negative,0.012092,0.96845,0.019458
7,IBM acquires AI startup to strengthen enterpri...,positive,0.616189,0.01383,0.369981
8,Disney delays multiple film releases due to pr...,negative,0.007659,0.943367,0.048974
9,Uber sells autonomous driving unit to focus on...,neutral,0.166539,0.008722,0.824739


## Interpretation
FinBERT assigns sentiment based on financial context rather than general
English usage. For example, words like “cost”, “decline”, or “liability”
are interpreted using their financial semantics because the model was further
pre-trained on the TRC2-Financial corpus and then fine-tuned for sentiment
classification on the Financial PhraseBank dataset. This domain adaptation
allows FinBERT to outperform lexicon-based models in finance-specific
sentiment tasks.