# HuggingFace Transformers Implementation

In [1]:
# Import required libraries for FinBERT-based sentiment analysis
# transformers: for loading the pretrained FinBERT model and tokenizer
# torch: for tensor operations and model inference
# torch.nn.functional: provides the softmax function to convert logits into probabilities

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F


# 1. Load the pretrained FinBERT tokenizer and model
# FinBERT is a BERT model fine-tuned specifically on financial text
# use_safetensors=True loads the weights from the safetensors format
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained(
    'ProsusAI/finbert',
    use_safetensors=True
)


# 2. Define the financial news text to be analyzed
# This text reflects a negative financial event (decline in revenue)
text = "The company reported a 20% decrease in revenue, missing expectations."


# 3. Tokenize the input text
# return_tensors='pt' returns PyTorch tensors with a leading batch dimension
# padding=True pads to the longest sequence in the batch (no effect for a single text)
# truncation=True cuts inputs that exceed the model's maximum length
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True
)


# 4. Perform model inference
# torch.no_grad() disables gradient tracking, which is not needed for inference
with torch.no_grad():
    outputs = model(**inputs)

# Extract raw prediction scores (logits) from the model
logits = outputs.logits


# 5. Apply softmax to convert logits into probabilities
# dim=1 applies softmax across the sentiment classes of each batch row
# The class order is defined by the checkpoint (model.config.id2label).
# For this notebook it is:
# [Positive, Negative, Neutral]
probabilities = F.softmax(logits, dim=1)

# Print the raw probability tensor
print(probabilities)

# Print the same values keyed by the label names stored in the model config,
# so the class order never has to be remembered or guessed.
labeled_probabilities = {
    model.config.id2label[class_idx]: round(prob, 4)
    for class_idx, prob in enumerate(probabilities[0].tolist())
}
print(labeled_probabilities)


tensor([[0.0088, 0.9760, 0.0152]])


In [2]:
# These probabilities correspond, in index order, to the classes positive, negative and neutral

# Challenge 2: The 512-Token Limit

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

# Load the FinBERT tokenizer and sequence-classification model
# from the same Hugging Face checkpoint.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT, use_safetensors=True
)

# Put the model in evaluation mode. eval() returns the model itself, so as the
# last expression of the cell its module summary is displayed.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
def document_sentiment(text, max_tokens=512):
    """
    Score a document of arbitrary length with FinBERT.

    The text is tokenized once, the token sequence is cut into consecutive
    windows of at most ``max_tokens`` tokens ([CLS] and [SEP] included),
    each window is classified on its own, and the per-window class
    probabilities are averaged with equal weight per window.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Document to score.
    max_tokens : int, default 512
        Window length including the two special tokens. Must be at least 3.
        Values above the model's position limit are not clamped here and
        will fail inside the model.

    Returns
    -------
    dict
        ``{"Positive": float, "Negative": float, "Neutral": float}`` taken
        from class indices 0, 1 and 2 of the averaged probabilities.

    Raises
    ------
    ValueError
        If ``max_tokens`` is smaller than 3, or if ``text`` produces no
        tokens (empty or whitespace-only input).
    """
    if max_tokens < 3:
        raise ValueError(
            "max_tokens must be at least 3 ([CLS] + one token + [SEP])"
        )

    # Step 1: Tokenize WITHOUT truncation
    tokens = tokenizer.tokenize(text)
    if not tokens:
        raise ValueError("text contains no tokens to score")

    # Step 2: Window size in word-pieces (reserve space for [CLS] and [SEP])
    chunk_size = max_tokens - 2
    device = model.device

    scores = []
    with torch.no_grad():
        for start in range(0, len(tokens), chunk_size):
            chunk = tokens[start:start + chunk_size]

            # Step 3: Build the model input straight from the word-pieces.
            # Going back through a string and re-tokenizing is lossy: a chunk
            # that starts on a "##" continuation piece re-tokenizes into more
            # tokens, and truncation would then silently drop its tail.
            ids = (
                [tokenizer.cls_token_id]
                + tokenizer.convert_tokens_to_ids(chunk)
                + [tokenizer.sep_token_id]
            )
            input_ids = torch.tensor([ids], dtype=torch.long, device=device)
            # Single unpadded sequence, so every position is attended to.
            attention_mask = torch.ones_like(input_ids)

            # Step 4: Classify the window
            logits = model(
                input_ids=input_ids, attention_mask=attention_mask
            ).logits
            probs = torch.softmax(logits, dim=1)
            scores.append(probs[0].cpu().numpy())

    # Step 5: Average sentiment across chunks
    avg_scores = np.mean(scores, axis=0)

    return {
        "Positive": float(avg_scores[0]),
        "Negative": float(avg_scores[1]),
        "Neutral": float(avg_scores[2])
    }

# TEST
# Repeat a two-sentence snippet 200 times to get a document that spans
# several 512-token windows, then score it window by window.
long_text = "".join(
    [
        "The company reported strong revenue growth. "
        "However, macroeconomic risks remain. "
    ] * 200
)

result = document_sentiment(long_text)
print(result)

{'Positive': 0.2918495833873749, 'Negative': 0.07937126606702805, 'Neutral': 0.6287791132926941}
