In [None]:
import pandas as pd

# Text-to-Binary model

We evaluate two pretrained sentiment models:
- FinBERT (`ProsusAI/finbert`)
- DistilRoBERTa (`mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis`)

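The models are evaluated on the Financial PhraseBank (the Hugging Face `financial_phrasebank` dataset, at four annotator-agreement levels), on `SEntFiN.csv`, and on the Kaggle dataset that you can find here:
- https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news/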

In [None]:
from datasets import load_dataset

# Financial PhraseBank: one dataset object per configuration name
fin_pb_all, fin_pb_75, fin_pb_66, fin_pb_50 = (
    load_dataset('financial_phrasebank', config_name)
    for config_name in (
        'sentences_allagree',
        'sentences_75agree',
        'sentences_66agree',
        'sentences_50agree',
    )
)

# Local CSV files; the Kaggle file is read with header=None, so its first
# row is treated as data and the columns are numbered 0, 1, ...
FSA_kaggle = pd.read_csv('FSA_kaggle.csv', header=None, encoding="iso-8859-1")
SEntFIN = pd.read_csv('SEntFiN.csv')

In [None]:
# Inspect the object returned for the 'sentences_allagree' configuration.
fin_pb_all

In [None]:
# From each 'train' split, take the 'sentence' column as inputs (x)
# and the 'label' column as targets (y).
fin_pb_all_x = fin_pb_all['train']['sentence']
fin_pb_all_y = fin_pb_all['train']['label']

fin_pb_75_x = fin_pb_75['train']['sentence']
fin_pb_75_y = fin_pb_75['train']['label']

fin_pb_66_x = fin_pb_66['train']['sentence']
fin_pb_66_y = fin_pb_66['train']['label']

fin_pb_50_x = fin_pb_50['train']['sentence']
fin_pb_50_y = fin_pb_50['train']['label']

In [None]:
# Peek at the first five labels to see how they are encoded.
fin_pb_all_y[0:5]

In [None]:
# Label id -> label name. Values that are not keys of the table are passed
# through unchanged, so re-running this cell on already-mapped labels is a no-op.
mapping = dict(enumerate(('negative', 'neutral', 'positive')))

fin_pb_all_y = [mapping.get(label, label) for label in fin_pb_all_y]
fin_pb_75_y = [mapping.get(label, label) for label in fin_pb_75_y]
fin_pb_66_y = [mapping.get(label, label) for label in fin_pb_66_y]
fin_pb_50_y = [mapping.get(label, label) for label in fin_pb_50_y]

In [None]:
# Preview the first rows of the Kaggle CSV (loaded with header=None).
FSA_kaggle.head()

In [None]:
# The second column is used as the model input (x), the first as the target (y).
FSA_kaggle_x = FSA_kaggle.iloc[:, 1]
FSA_kaggle_y = FSA_kaggle.iloc[:, 0]

In [None]:
# Preview the first rows of the SEntFiN frame.
SEntFIN.head()

In [None]:
# Check the column dtypes; the cells below use the .str accessor on 'Decisions'.
SEntFIN.dtypes

In [None]:
def str_to_dict(dict_str):
    """Parse a flat, quoted dict literal into a ``dict`` of strings.

    The expected shape is ``{'key': 'value', 'key2': 'value2'}`` (single or
    double quotes): pairs separated by ``', '`` and key/value separated by
    ``': '``. The first and last character of every key and value (the
    quotes) are stripped.

    Limitations: nested structures are not supported, and a key or value that
    itself contains ``', '`` (or a key containing ``': '``) will be split in
    the wrong place.

    Args:
        dict_str: The textual dict, including its surrounding braces.

    Returns:
        A ``dict`` mapping the unquoted keys to the unquoted values. An empty
        literal (``"{}"``) yields an empty dict.

    Raises:
        ValueError: If a pair does not contain the ``': '`` separator.
    """
    # Drop surrounding whitespace, then the enclosing braces
    body = dict_str.strip()[1:-1].strip()
    if not body:
        # "{}" would otherwise produce a single empty "pair" and fail to unpack
        return {}

    result = {}
    for pair in body.split(', '):
        # Split on the first ': ' only, so a value may itself contain ': '
        key, separator, value = pair.partition(': ')
        if not separator:
            raise ValueError(f"malformed key-value pair: {pair!r}")
        # Strip the surrounding quote characters
        result[key[1:-1]] = value[1:-1]
    return result

In [None]:
# For every sentiment word, count its occurrences in the raw 'Decisions' string
# and store the result in a '<word>_count' column.
for sentiment in ('negative', 'positive', 'neutral'):
    SEntFIN[sentiment + '_count'] = SEntFIN['Decisions'].str.count(sentiment)

In [None]:
# Total number of sentiment words found in each row's 'Decisions' string.
# Plain '+' is used, so a missing value in any count column makes the total missing too.
SEntFIN['Count'] = SEntFIN['negative_count'] + SEntFIN['positive_count'] + SEntFIN['neutral_count']

In [None]:
# Keep only rows whose 'Decisions' string mentions at most one sentiment word.
# .copy() makes the result an independent frame rather than a slice of the
# original, so assigning to one of its columns afterwards does not raise
# pandas' SettingWithCopyWarning.
SEntFIN = SEntFIN[SEntFIN['Count'] <= 1].copy()

In [None]:
def filter_and_replace(val):
    """Return the first sentiment word contained in ``val``, or ``None``.

    The words are tested in the order 'positive', 'negative', 'neutral';
    the first one found as a substring of ``val`` is returned.
    """
    for sentiment in ('positive', 'negative', 'neutral'):
        if sentiment in val:
            return sentiment
    return None

# Collapse each raw 'Decisions' string to a single sentiment label (or None)
SEntFIN['Decisions'] = SEntFIN['Decisions'].map(filter_and_replace)

# Keep only the rows for which a label was found
SEntFIN = SEntFIN[SEntFIN['Decisions'].notna()]

In [None]:
# Inspect the frame after the row filtering and label normalisation.
SEntFIN

In [None]:
# Renumber the rows 0..n-1 after the filtering above. drop=True discards the
# old index instead of inserting it as an extra 'index' column, which also
# keeps this cell safe to re-run.
SEntFIN = SEntFIN.reset_index(drop=True)

# 'Title' is the model input (x), the normalised 'Decisions' label the target (y)
SEntFIN_x, SEntFIN_y = SEntFIN['Title'], SEntFIN['Decisions']

# Test

In [None]:
# Predicted class index -> label name, one table per model
Fin_bert_mapping = dict(enumerate(['positive', 'negative', 'neutral']))
DRob_mapping = dict(enumerate(['negative', 'neutral', 'positive']))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same checkpoint
FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [None]:
def calculate_accuracy(model, X, y, mapping, tokenizer=None):
  """Return the fraction of sentences whose predicted label equals the target.

  Each sentence is tokenized and classified on its own; the arg-max class
  index is translated to a label name through ``mapping`` and compared with
  the corresponding entry of ``y``.

  Args:
    model: Callable invoked as ``model(**inputs)``; its result must expose a
      ``logits`` tensor.
    X: Iterable of input sentences.
    y: Iterable of expected label names, in the same order as ``X``. Sentences
      and labels are paired by position (``zip``), so a pandas Series with
      any index works.
    mapping: Dict from predicted class index to label name.
    tokenizer: Optional tokenizer, called as
      ``tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)``.
      When omitted, the notebook-level ``tokenizer`` variable is used.

  Returns:
    Accuracy as a float in ``[0, 1]``.

  Raises:
    NameError: If ``tokenizer`` is omitted and no notebook-level ``tokenizer``
      exists.
    ZeroDivisionError: If there are no (sentence, label) pairs.
  """
  # Local import: the notebook does not import torch at the top level
  import torch

  if tokenizer is None:
    # Fall back to the notebook-level tokenizer (the parameter shadows the
    # global name, hence the explicit lookup)
    try:
      tokenizer = globals()["tokenizer"]
    except KeyError:
      raise NameError("name 'tokenizer' is not defined") from None

  correct = 0
  total = 0

  # Inference only: disable autograd so no graph is built for each forward pass
  with torch.no_grad():
    # Pair by position instead of y[i], which is a label-based lookup on a Series
    for sentence, expected in zip(X, y):
      inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

      # Get the model's prediction
      outputs = model(**inputs)
      prediction = mapping.get(outputs.logits.argmax(dim=-1).item())

      correct += 1 if prediction == expected else 0
      total += 1

  return correct / total

In [None]:
# Names of the evaluation sets; each one has matching '<name>_x' / '<name>_y'
# variables defined earlier in the notebook.
datasets = [
    "fin_pb_all",
    "fin_pb_75",
    "fin_pb_66",
    "fin_pb_50",
    "FSA_kaggle",
    "SEntFIN",
]
accuracies = {}

for dataset in datasets:
    # Look the inputs and targets up by name in the notebook namespace
    x, y = globals()[f"{dataset}_x"], globals()[f"{dataset}_y"]
    accuracies[dataset] = calculate_accuracy(model, x, y, Fin_bert_mapping)

print(accuracies)

In [None]:
# Replace the current tokenizer/model pair with the DistilRoBERTa checkpoint
DISTILROBERTA_CHECKPOINT = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(DISTILROBERTA_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(DISTILROBERTA_CHECKPOINT)

In [None]:
# Names of the evaluation sets; each one has matching '<name>_x' / '<name>_y'
# variables defined earlier in the notebook.
datasets = [
    "fin_pb_all",
    "fin_pb_75",
    "fin_pb_66",
    "fin_pb_50",
    "FSA_kaggle",
    "SEntFIN",
]
accuracies = {}

for dataset in datasets:
    # Look the inputs and targets up by name in the notebook namespace
    x, y = globals()[f"{dataset}_x"], globals()[f"{dataset}_y"]
    accuracies[dataset] = calculate_accuracy(model, x, y, DRob_mapping)

print(accuracies)