Prepare the Data

In [2]:
import pickle
# Load the pre-filtered documents from disk. The context manager guarantees the
# file handle is closed even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading — only use this
# with a file from a trusted source.
with open('raw_docs_filtered.pkl', 'rb') as pkl_file:
    docs = pickle.load(pkl_file)

In [36]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

def split_in_segments(text, max_tokens=512):
    """Split ``text`` into segments made of whole sentences.

    Sentences (as returned by ``sent_tokenize``) are accumulated until adding
    the next one would push the running whitespace-word count above
    ``max_tokens``; the accumulated sentences are then emitted as one segment
    and a new segment is started with that sentence.

    Args:
        text: The document text. Anything that is not a non-blank string
            (e.g. ``None`` or a NaN from a missing value) yields ``[]``.
        max_tokens: Upper bound on whitespace-separated words per segment.
            Defaults to 512.

    Returns:
        A list of segment strings, each the stripped sentences joined by a
        single space. A single sentence longer than ``max_tokens`` is never
        split and is returned as its own (oversized) segment.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.

    Note:
        The count is based on ``str.split()`` words, not model tokens.
        NOTE(review): a sub-word tokenizer will likely produce more tokens than
        words, so a lower ``max_tokens`` may be needed to avoid truncation
        downstream — confirm against the tokenizer in use.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")
    if not isinstance(text, str) or not text.strip():
        return []

    segments = []
    current_sentences = []
    current_words = 0
    for sent in sent_tokenize(text):
        sentence = sent.strip()
        n_words = len(sentence.split())
        # Flush BEFORE appending so an emitted segment never exceeds the limit
        # (unless it consists of one over-long sentence).
        if current_sentences and current_words + n_words > max_tokens:
            segments.append(" ".join(current_sentences))
            current_sentences = []
            current_words = 0
        current_sentences.append(sentence)
        current_words += n_words
    # Emit whatever is left after the last sentence.
    if current_sentences:
        segments.append(" ".join(current_sentences))
    return segments
# Chunk each document's text and flatten the per-document chunk lists into a
# single list, preserving document order.
texts = [
    segment
    for document_text in docs['document_text']
    for segment in split_in_segments(document_text)
]

In [37]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

import torch

# Sentiment classifier checkpoint on the Hugging Face Hub; tokenizer and model
# are both loaded from it so preprocessing matches the weights.
MODEL = "soleimanian/financial-roberta-large-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Two sample passages, scored independently as one batch.
text = [""" While the equity market had a positive
            month, mainly driven by a handful of IT, high growth stocks,
            (i.e. Nvidia).""",
        """
        the bond market ended in negative territory, as
            rates’ cuts expectations moved further out into the year.
        """]

# Pad the batch to a common length and cut anything beyond 512 tokens.
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building the autograd graph for the forward pass.
with torch.no_grad():
    output = model(**encoded_input)

# Row-wise softmax turns each passage's logits into class probabilities.
# Kept as a list of 1-D arrays (one per passage), as downstream code expects.
scores = list(softmax(output.logits.numpy(), axis=1))



In [38]:
# One row per scored passage: the three class probabilities plus the passage itself.
sentiment_columns = ['negative', 'neutral', 'positive']
df = pd.DataFrame(scores, columns=sentiment_columns).assign(text=text)
df.head()

Unnamed: 0,negative,neutral,positive,text
0,0.001701,0.197782,0.800518,While the equity market had a positive\n ...
1,0.998345,0.00108,0.000575,\n the bond market ended in negative te...


In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Initialize the tokenizer and model from the SAME checkpoint so the vocabulary
# and preprocessing are guaranteed to match the fine-tuned weights.
SST2_CHECKPOINT = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(SST2_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SST2_CHECKPOINT)

# Words that identify the topic whose sentiment we want to probe
topic_list = ["bond", "bonds"]

# Input text
text = """
While the equity market had a positive month, mainly driven by a handful of IT, high growth stocks,
(i.e. Nvidia), the bond market ended in negative territory, as rates’ cuts expectations moved further out into the year.
"""

# Tokenize the text; truncate so a long passage cannot exceed the model's input limit
tokens = tokenizer(text, return_tensors='pt', truncation=True)
input_ids = tokens['input_ids']

# Token-id sequence for each topic word (a word may map to several sub-tokens)
topic_token_ids = [tokenizer.encode(topic, add_special_tokens=False) for topic in topic_list]

# Convert input_ids to a list for easier manipulation
input_ids_list = input_ids[0].tolist()

# Find every position covered by a topic word. A set is used because matches
# for different topic words can overlap and would otherwise add duplicates.
topic_positions = set()
for start in range(len(input_ids_list)):
    for topic_ids in topic_token_ids:
        end = start + len(topic_ids)
        if input_ids_list[start:end] == topic_ids:
            topic_positions.update(range(start, end))
topic_indices = sorted(topic_positions)

# Build the attention mask: 1 only on topic tokens when any were found,
# otherwise attend to every token (the default behaviour).
# NOTE(review): with only topic positions unmasked, the remaining tokens are
# hidden from attention, so the prediction presumably reflects the topic word
# itself rather than the surrounding sentence — confirm this is the intent.
if topic_indices:
    attention_mask = torch.zeros_like(input_ids)
    attention_mask[0, topic_indices] = 1
else:
    attention_mask = torch.ones_like(input_ids)

# Get model predictions with attention focused on topic words
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    sentiment_scores = outputs.logits

# Interpret the results: softmax over the class dimension, then take the top class
sentiment = torch.softmax(sentiment_scores, dim=1)
sentiment_label = torch.argmax(sentiment, dim=1).item()

print(f"Sentiment score: {'negative' if sentiment_label == 0 else 'positive'}")


Sentiment score: positive


In [50]:
# Display the raw (pre-softmax) logits; index 0 is read as 'negative' and index 1 as 'positive'.
sentiment_scores

tensor([[-0.8049,  0.9601]])