In [7]:
import pandas as pd

# Text-to-Binary model

We are going to test two models:
- FinBERT
- DistilRoBERTa

The Kaggle dataset used here can be found at:
- https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news/

In [16]:
from datasets import load_dataset

# Load the four agreement-level configurations of 'financial_phrasebank'
# via datasets.load_dataset, in the same order as the target names.
fin_pb_all, fin_pb_75, fin_pb_66, fin_pb_50 = (
    load_dataset('financial_phrasebank', config_name)
    for config_name in (
        'sentences_allagree',
        'sentences_75agree',
        'sentences_66agree',
        'sentences_50agree',
    )
)

# Local CSV files. The Kaggle file is read without a header row, so its
# columns are addressed by position later on.
FSA_kaggle = pd.read_csv('FSA_kaggle.csv', header=None, encoding="iso-8859-1")
SEntFIN = pd.read_csv('SEntFiN.csv')

In [27]:
# Inspect the all-agree subset: its splits, feature names and row count.
fin_pb_all

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})

In [64]:
# For every PhraseBank subset, pull the sentences (x) and the labels (y)
# out of its 'train' split.
fin_pb_all_x = fin_pb_all['train']['sentence']
fin_pb_all_y = fin_pb_all['train']['label']

fin_pb_75_x = fin_pb_75['train']['sentence']
fin_pb_75_y = fin_pb_75['train']['label']

fin_pb_66_x = fin_pb_66['train']['sentence']
fin_pb_66_y = fin_pb_66['train']['label']

fin_pb_50_x = fin_pb_50['train']['sentence']
fin_pb_50_y = fin_pb_50['train']['label']

In [65]:
# Peek at the first five labels of the all-agree subset.
fin_pb_all_y[0:5]

[1, 2, 2, 2, 2]

In [66]:
# Class index -> label name for the PhraseBank labels.
mapping = dict(enumerate(['negative', 'neutral', 'positive']))

# Translate known indices to names; anything that is not a key of `mapping`
# (e.g. an already-translated string) is passed through unchanged, so
# re-running this cell is harmless.
fin_pb_all_y = [mapping[label] if label in mapping else label for label in fin_pb_all_y]
fin_pb_75_y = [mapping[label] if label in mapping else label for label in fin_pb_75_y]
fin_pb_66_y = [mapping[label] if label in mapping else label for label in fin_pb_66_y]
fin_pb_50_y = [mapping[label] if label in mapping else label for label in fin_pb_50_y]

In [67]:
# Preview the first rows of the Kaggle CSV (read with header=None, so the columns are 0 and 1).
FSA_kaggle.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [68]:
# Column 1 holds the sentence text, column 0 the sentiment label.
FSA_kaggle_x = FSA_kaggle.iloc[:, 1]
FSA_kaggle_y = FSA_kaggle.iloc[:, 0]

In [69]:
# Preview the first rows of the SEntFiN frame.
SEntFIN.head()

Unnamed: 0,S No.,Title,Decisions,Words,negative_count,positive_count,neutral_count,Count
0,1,SpiceJet to issue 6.4 crore warrants to promoters,neutral,8,0,0,1,1
1,2,MMTC Q2 net loss at Rs 10.4 crore,neutral,8,0,0,1,1
2,3,"Mid-cap funds can deliver more, stay put: Experts",positive,8,0,1,0,1
3,4,Mid caps now turn into market darlings,positive,7,0,1,0,1
4,5,"Market seeing patience, if not conviction: Pra...",neutral,8,0,0,1,1


In [70]:
# Check the dtype pandas assigned to each SEntFiN column.
SEntFIN.dtypes

S No.              int64
Title             object
Decisions         object
Words              int64
negative_count     int64
positive_count     int64
neutral_count      int64
Count              int64
dtype: object

In [71]:
def str_to_dict(dict_str):
    """Parse a flat, dict-like string into a ``dict`` of strings.

    The input is expected to look like ``"{'key': 'value', 'k2': 'v2'}"``:
    one pair of enclosing braces, pairs separated by ``', '``, key and value
    separated by ``': '``, and each key/value wrapped in exactly one quote
    character on either side (the first and last character are dropped
    without checking what they are).

    Parameters
    ----------
    dict_str : str
        The textual dictionary. Surrounding whitespace is ignored.

    Returns
    -------
    dict
        Mapping of unquoted keys to unquoted values. An empty ``"{}"``
        yields ``{}``.

    Raises
    ------
    ValueError
        If a pair does not contain the ``': '`` separator.

    Notes
    -----
    This is a plain text split, not a real parser: a key or value that itself
    contains ``', '`` is split in the wrong place.
    """
    # Drop the enclosing braces; whatever remains is the comma-separated body.
    inner = dict_str.strip()[1:-1].strip()
    if not inner:
        # "{}" has no pairs; splitting it would yield [''] and fail to unpack.
        return {}

    parsed = {}
    for pair in inner.split(', '):
        # Split on the first ': ' only, so values containing ': ' stay intact.
        key, value = pair.split(': ', 1)
        # Strip the surrounding quote characters.
        parsed[key[1:-1]] = value[1:-1]
    return parsed

In [72]:
# For each sentiment word, store how many times it occurs in the raw
# 'Decisions' string of every row.
for _column, _word in (
    ('negative_count', 'negative'),
    ('positive_count', 'positive'),
    ('neutral_count', 'neutral'),
):
    SEntFIN[_column] = SEntFIN['Decisions'].str.count(_word)

In [73]:
# Total number of sentiment words found per row.
SEntFIN['Count'] = (
    SEntFIN['negative_count']
    + SEntFIN['positive_count']
    + SEntFIN['neutral_count']
)

In [74]:
# Keep only rows that mention at most one sentiment word.
# .copy() makes the result an independent frame, so later column assignments
# on SEntFIN do not raise pandas' SettingWithCopyWarning.
SEntFIN = SEntFIN[SEntFIN['Count'] <= 1].copy()

In [75]:
def filter_and_replace(val):
    """Reduce ``val`` to the first sentiment label it contains.

    Labels are tested with ``in`` in the fixed order 'positive', 'negative',
    'neutral'; the first one found is returned. ``None`` is returned when
    none of them is present.
    """
    for label in ('positive', 'negative', 'neutral'):
        if label in val:
            return label
    return None

# Collapse every raw 'Decisions' string to a bare label, then drop the rows
# for which no label was recognised (filter_and_replace returned None).
# assign() returns a new frame instead of writing a column into SEntFIN,
# which at this point is a boolean-filtered slice; this avoids pandas'
# SettingWithCopyWarning.
SEntFIN = (
    SEntFIN
    .assign(Decisions=SEntFIN['Decisions'].apply(filter_and_replace))
    .dropna(subset=['Decisions'])
)

In [76]:
# Display the SEntFiN frame after filtering and label normalisation.
SEntFIN

Unnamed: 0,S No.,Title,Decisions,Words,negative_count,positive_count,neutral_count,Count
0,1,SpiceJet to issue 6.4 crore warrants to promoters,neutral,8,0,0,1,1
1,2,MMTC Q2 net loss at Rs 10.4 crore,neutral,8,0,0,1,1
2,3,"Mid-cap funds can deliver more, stay put: Experts",positive,8,0,1,0,1
3,4,Mid caps now turn into market darlings,positive,7,0,1,0,1
4,5,"Market seeing patience, if not conviction: Pra...",neutral,8,0,0,1,1
...,...,...,...,...,...,...,...,...
10700,10701,"Wall Street opens flat, S&P 500 near record",neutral,8,0,0,1,1
10710,10711,"Heard on the street: Dutchman MF, FIIs",neutral,7,0,0,1,1
10714,10715,Rebound for Russia and China lifts stocks,positive,7,0,1,0,1
10720,10721,"UBS cuts gold, base metals price forecasts",neutral,7,0,0,1,1


In [110]:
# Renumber the rows 0..n-1 so that positional and label-based access agree
# after the earlier row filtering. drop=True discards the old index instead of
# inserting it as a new column, so re-running this cell leaves the frame
# unchanged.
SEntFIN = SEntFIN.reset_index(drop=True)
SEntFIN_x, SEntFIN_y = SEntFIN['Title'], SEntFIN['Decisions']

# Test

In [119]:
# Class-index -> label tables used to decode each model's argmax prediction.
# NOTE(review): presumably these mirror each checkpoint's id2label config —
# verify against model.config.id2label.
Fin_bert_mapping = dict(enumerate(('positive', 'negative', 'neutral')))
DRob_mapping = dict(enumerate(('negative', 'neutral', 'positive')))

In [79]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the FinBERT tokenizer and classification model from one checkpoint id.
# These bind the module-level `tokenizer` / `model` names used by the
# evaluation cells.
FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [122]:
def calculate_accuracy(model, X, y, mapping, tokenizer=None):
    """Return the fraction of sentences in ``X`` whose predicted label matches ``y``.

    Each sentence is tokenized, passed through ``model``, and the argmax of
    the returned logits is translated to a label through ``mapping`` before
    being compared with the expected label at the same position in ``y``.

    Parameters
    ----------
    model : callable
        Called as ``model(**inputs)``; the result must expose ``.logits`` with
        an ``argmax(dim=-1).item()`` interface.
    X : iterable of str
        Sentences to classify.
    y : iterable
        Expected labels, aligned with ``X`` by position. It is materialised
        into a list, so a pandas Series works whatever its index is.
    mapping : dict
        Class index -> label. Indices missing from it decode to ``None``.
    tokenizer : callable, optional
        Called as ``tokenizer(sentence, return_tensors="pt", truncation=True,
        padding=True)``. When omitted, the module-level ``tokenizer`` is used,
        which is what this function did before the parameter existed.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``, or ``nan`` when ``X`` is empty.

    Raises
    ------
    IndexError
        If ``y`` has fewer items than ``X``.
    """
    # Local import: torch is only needed here, to disable gradient tracking.
    import torch

    if tokenizer is None:
        # Backward-compatible fallback to the tokenizer bound at module level.
        tokenizer = globals()["tokenizer"]

    # Positional access must not depend on the container's own index labels.
    expected_labels = list(y)

    correct = 0
    total = 0
    # Inference only: skip building autograd graphs for every forward pass.
    with torch.no_grad():
        for position, sentence in enumerate(X):
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

            # Get the model's prediction and decode it to a label.
            outputs = model(**inputs)
            predicted = mapping.get(outputs.logits.argmax(dim=-1).item())

            correct += int(predicted == expected_labels[position])
            total += 1

    # Accuracy over zero samples is undefined; report nan instead of crashing.
    return correct / total if total else float("nan")

In [118]:
# Evaluate the currently loaded `model` on every dataset, decoding its
# predictions with the FinBERT label mapping.
datasets = ["fin_pb_all", "fin_pb_75", "fin_pb_66", "fin_pb_50", "FSA_kaggle", "SEntFIN"]

# Explicit name -> (sentences, labels) table instead of string lookups in
# globals(): a missing or renamed variable now fails right here, by name.
evaluation_sets = {
    "fin_pb_all": (fin_pb_all_x, fin_pb_all_y),
    "fin_pb_75": (fin_pb_75_x, fin_pb_75_y),
    "fin_pb_66": (fin_pb_66_x, fin_pb_66_y),
    "fin_pb_50": (fin_pb_50_x, fin_pb_50_y),
    "FSA_kaggle": (FSA_kaggle_x, FSA_kaggle_y),
    "SEntFIN": (SEntFIN_x, SEntFIN_y),
}

finbert_accuracies = {}
for dataset in datasets:
    x, y = evaluation_sets[dataset]
    finbert_accuracies[dataset] = calculate_accuracy(model, x, y, Fin_bert_mapping)

# `accuracies` is reassigned by the later evaluation run; the model-specific
# name above keeps these results available afterwards.
accuracies = finbert_accuracies

print(accuracies)

{'fin_pb_all': 0.9717314487632509, 'fin_pb_75': 0.9472922096727483, 'fin_pb_66': 0.9181882855110268, 'fin_pb_50': 0.8895996698307883, 'FSA_kaggle': 0.8893933140734627, 'SEntFIN': 0.7317876599518561}


In [124]:
# Replace the module-level `tokenizer` / `model` with the DistilRoBERTa
# financial-news sentiment checkpoint.
DISTILROBERTA_CHECKPOINT = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(DISTILROBERTA_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(DISTILROBERTA_CHECKPOINT)

In [125]:
# Evaluate the currently loaded `model` on every dataset, decoding its
# predictions with the DistilRoBERTa label mapping.
datasets = ["fin_pb_all", "fin_pb_75", "fin_pb_66", "fin_pb_50", "FSA_kaggle", "SEntFIN"]

# Explicit name -> (sentences, labels) table instead of string lookups in
# globals(): a missing or renamed variable now fails right here, by name.
evaluation_sets = {
    "fin_pb_all": (fin_pb_all_x, fin_pb_all_y),
    "fin_pb_75": (fin_pb_75_x, fin_pb_75_y),
    "fin_pb_66": (fin_pb_66_x, fin_pb_66_y),
    "fin_pb_50": (fin_pb_50_x, fin_pb_50_y),
    "FSA_kaggle": (FSA_kaggle_x, FSA_kaggle_y),
    "SEntFIN": (SEntFIN_x, SEntFIN_y),
}

distilroberta_accuracies = {}
for dataset in datasets:
    x, y = evaluation_sets[dataset]
    distilroberta_accuracies[dataset] = calculate_accuracy(model, x, y, DRob_mapping)

# Keep the generic name bound as before; the model-specific name above
# survives any later reassignment of `accuracies`.
accuracies = distilroberta_accuracies

print(accuracies)

{'fin_pb_all': 0.9973498233215548, 'fin_pb_75': 0.9475818129163046, 'fin_pb_66': 0.9006402655916529, 'fin_pb_50': 0.8586463062319438, 'FSA_kaggle': 0.8584399504746183, 'SEntFIN': 0.7722032180413024}
