https://huggingface.co/oferweintraub/bert-base-finance-sentiment-noisy-search (bert-base-uncase)

https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis (finbert)

https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis (distilled roberta)

https://huggingface.co/siebert/sentiment-roberta-large-english 

Limitations:

These pretrained/fine-tuned model can deal with only limited-length texts. For example, for bert-case-uncases, only 64 tokens and for the others 512 tokens.

To deal with this, we can either truncate the speech(but this may leads to unaccurate result) since the tokens of the original speech is much more than 512.(See exploratory analysis) 

Another way would be to split the tokens into several chunks and take the average sentiment score of each chunk.

The second approach would be generate summarization or answers to certain questions by prompting and then use these results to get the sentiment label.

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import pipeline
import torch
import json
import pandas as pd
import numpy as np

In [None]:
VIX = json.load(open("train/VIX_1w.json"))
EURUSDV = json.load(open("train/EURUSDV1M_1w.json"))
df_VIX = pd.DataFrame(VIX)
df_EURUSDV = pd.DataFrame(EURUSDV)
example = VIX[0]['speech'][0]['ECB']

 ### Whole speech analysis: chunk average

In [1]:
# initialize our model and tokenizer
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

# and we will place the processing of our input text into a function for easier prediction later
def sentiment(tokens):
    # get output logits from the model
    output = model(**tokens)
    # convert to probabilities
    probs = torch.nn.functional.softmax(output[0], dim=-1)
    # we will return the probability tensor (we will not need argmax until later)
    return probs

In [49]:
# initialize probabilities list
probs_list = []

start = 0
window_size = 510  # we take 2 off here so that we can fit in our [CLS] and [SEP] tokens

loop = True

while loop:
    end = start + window_size
    if end >= total_len:
        loop = False
        end = total_len
    # (1) extract window from input_ids and attention_mask
    input_ids_chunk = input_ids[start:end]
    attention_mask_chunk = attention_mask[start:end]
    # (2) add [CLS] and [SEP]
    input_ids_chunk = [101] + input_ids_chunk + [102]
    attention_mask_chunk = [1] + attention_mask_chunk + [1]
    # (3) add padding upto window_size + 2 (512) tokens
    input_ids_chunk += [0] * (window_size - len(input_ids_chunk) + 2)
    attention_mask_chunk += [0] * (window_size - len(attention_mask_chunk) + 2)
    # (4) format into PyTorch tensors dictionary
    input_dict = {
        'input_ids': torch.Tensor([input_ids_chunk]).long(),
        'attention_mask': torch.Tensor([attention_mask_chunk]).int()
    }
    # (5) make logits prediction
    outputs = model(**input_dict)
    # (6) calculate softmax and append to list
    probs = torch.nn.functional.softmax(outputs[0], dim=-1)
    probs_list.append(probs)

    start = end
    
# let's view the probabilities given
probs_list

[tensor([[0.0102, 0.9390, 0.0508]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0110, 0.9663, 0.0226]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0132, 0.9602, 0.0266]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0252, 0.9324, 0.0423]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0475, 0.7776, 0.1750]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.1466, 0.0562, 0.7971]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.1217, 0.0204, 0.8579]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.1014, 0.0179, 0.8807]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0891, 0.0318, 0.8791]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.1344, 0.0159, 0.8497]], grad_fn=<SoftmaxBackward0>)]

In [9]:
def split_speech(speech, max_token, tokenizer=BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")):
    # padding for last chunk
    inputs = tokenizer(speech)
    input_id_chunks = inputs['input_ids'].split(max_token)
    return input_id_chunks

 ### Sentiment from summary

 #### Read data

In [3]:
# Summaries from GPT2 model
sums1_train = json.load(open("sums_train.json"))
sums1_test = json.load(open("sums_test.json"))

 #### Apply different models(pipelines)

In [4]:
base_score = {"negative": -1, "NEGATIVE": -1, "neutral": 0, "positive": 1, "POSITIVE": 1}

In [5]:
model_names = ["siebert/sentiment-roberta-large-english",  
              "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
              "ahmedrachid/FinancialBERT-Sentiment-Analysis"]

In [16]:
# model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3)
# tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")

# nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
# print(nlp(example))

[{'label': 'neutral', 'score': 0.979006826877594}]


In [9]:
def sentiment(speech, model_name):
    if model_name != "ahmedrachid/FinancialBERT-Sentiment-Analysis":
        nlp = pipeline("sentiment-analysis", model=model_name)
        output = nlp(speech)[0]
    else:
        model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3)
        tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
        nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
        output = nlp(speech)[0]
    return base_score[output["label"]] + output["score"]

In [10]:
def avg_sentiment(day, model_name, identity_name):
    nb_speech = 0
    sentiment_score = 0
    for day in data["speech"]:
        for speech in list(day.values()):
            if speech != []:
                nb_speech += 1
                sentiment_score += sentiment(speech[0], model_name)
    return sentiment_score/nb_speech

In [11]:
df_sums1_train = pd.DataFrame(sums1_train)

In [None]:
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
df_sums1_train["avg_sentiment"] = df_sums1_train["speech"].apply(
    lambda x: avg_sentiment(x, model_name, "train_sum1"))

In [7]:
# def include_sentiment(df, model_name, identity_name):
#     # take average of all the speeches as the sentiment score in one datapoint
#     for data in df:
#         nb_speech = 0
#         sentiment_score = 0
#         for day in data["speech"]:
#             for speech in list(day.values()):
#                 if speech != []:
#                     nb_speech += 1
#                     sentiment_score += sentiment(speech[0], model_name)
#         data["avg_sentiment_score"] = sentiment_score/nb_speech
#     return data.to_json(model_name+"identity_name"+".json")

In [8]:
#for model_name in model_names:
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
include_sentiment(sums1_train, model_name, "train_sum1")
include_sentiment(sums1_test, model_name, "test_sum1")

KeyboardInterrupt: 