In [1]:
import pandas as pd

# Load the "AllAgree" file (best quality labels)
file_path = r"D:\Data Science\Minor-Project\FinancialPhraseBank-v1.0\Sentences_AllAgree.txt"

records = []
with open(file_path, "r", encoding="latin-1") as f:
    for line in f:
        if "@" in line:
            sentence, label = line.rsplit("@", 1)
            records.append({"sentence": sentence.strip(), "label": label.strip()})

df = pd.DataFrame(records)

# Map labels to integers
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["label"].map(label_mapping)

print(df.head())
print(df["label"].value_counts())


                                            sentence  label
0  According to Gran , the company has no plans t...      1
1  For the last quarter of 2010 , Componenta 's n...      2
2  In the third quarter of 2010 , net sales incre...      2
3  Operating profit rose to EUR 13.1 mn from EUR ...      2
4  Operating profit totalled EUR 21.1 mn , up fro...      2
label
1    1391
2     570
0     303
Name: count, dtype: int64


In [2]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["sentence"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

In [3]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    return encodings, torch.tensor(labels)

import torch
train_encodings, train_labels = tokenize_data(train_texts, train_labels)
val_encodings, val_labels = tokenize_data(val_texts, val_labels)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class FinancialPhraseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

from torch.utils.data import DataLoader

train_dataset = FinancialPhraseDataset(train_encodings, train_labels)
val_dataset = FinancialPhraseDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [5]:
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
).to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute weights based on dataset imbalance
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]),
    y=df["label"]
)

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class Weights:", class_weights)

# Define weighted loss function
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

Class Weights: tensor([2.4906, 0.5425, 1.3240], device='cuda:0')


In [10]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

epochs = 4
for epoch in range(epochs):
    # Training
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds, val_labels_list = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            val_preds.extend(preds.cpu().numpy())
            val_labels_list.extend(labels.cpu().numpy())

    acc = accuracy_score(val_labels_list, val_preds)
    f1 = f1_score(val_labels_list, val_preds, average="weighted")

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Acc: {acc:.4f} | Val F1: {f1:.4f}")

Epoch 1 Training: 100%|██████████| 114/114 [00:39<00:00,  2.86it/s]


Epoch 1 | Train Loss: 0.0361 | Val Acc: 0.9669 | Val F1: 0.9671


Epoch 2 Training: 100%|██████████| 114/114 [00:40<00:00,  2.81it/s]


Epoch 2 | Train Loss: 0.0632 | Val Acc: 0.9669 | Val F1: 0.9671


Epoch 3 Training: 100%|██████████| 114/114 [00:41<00:00,  2.76it/s]


Epoch 3 | Train Loss: 0.0484 | Val Acc: 0.9669 | Val F1: 0.9673


Epoch 4 Training: 100%|██████████| 114/114 [00:42<00:00,  2.71it/s]


Epoch 4 | Train Loss: 0.0311 | Val Acc: 0.9426 | Val F1: 0.9438


In [11]:
model.save_pretrained("./finbert_model")
tokenizer.save_pretrained("./finbert_model")

('./finbert_model\\tokenizer_config.json',
 './finbert_model\\special_tokens_map.json',
 './finbert_model\\vocab.txt',
 './finbert_model\\added_tokens.json',
 './finbert_model\\tokenizer.json')

In [22]:
import torch.nn.functional as F

def sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    # Map classes to sentiment values
    sentiment_values = {"negative": -1, "neutral": 0, "positive": 1}
    id2label = {0: "negative", 1: "neutral", 2: "positive"}

    # Weighted sum for score in [-1, 1]
    score = sum(probs[i] * sentiment_values[id2label[i]] for i in range(len(probs)))

    # Pick the highest probability as sentiment label
    predicted_label = id2label[probs.argmax()]

    return predicted_label, score

# Example
print(sentiment_score("Zomato is a very good company."))
print(sentiment_score("Udaan is struggling with losses."))
print(sentiment_score("hello."))


('positive', np.float32(0.99597985))
('negative', np.float32(-0.9821295))
('neutral', np.float32(-0.002720035))
