In [1]:
# Pytorch Deep Learning
import torch
# Pandas+Numpy
import pandas as pd
# Sklearn metrics
from sklearn.metrics import balanced_accuracy_score,accuracy_score

# Hugging Face Transformer Libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline,Trainer, TrainingArguments
# Hugging Face Datasets
from datasets import Dataset

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
import numpy as np


In [3]:
if torch.cuda.is_available():
  print("CUDA available. GPU will be used for computation")
  device =0
else:
  print("CUDA not available. Using CPU for computation.")
  device =-1



CUDA available. GPU will be used for computation


In [1]:
import pandas as pd

df = pd.read_csv("hf://datasets/mltrev23/financial-sentiment-analysis/archive.zip")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [6]:
df.shape

(5842, 2)

In [7]:
df["Sentiment"].value_counts(normalize=True) #Checking the balance of our classes
#We can see a big class imbalance as negative is underrepresented.

Unnamed: 0_level_0,proportion
Sentiment,Unnamed: 1_level_1
neutral,0.535775
positive,0.317015
negative,0.14721


In [8]:
#Bert Trainer assumes the label(sentence) is called text
df.rename({"Sentence" : "text"}, axis = 1, inplace = True) # Change summary to text


# **For NLP, Sentences are converted to Token and then changed to embeddings in a vector space.**
---
Tokens - Sentences are broken into individual words

Embeddings - Converts tokens to vectors that makes it easy for the model to train and understand has another word semantically correlates with other words.



In [12]:
# Attention is all you need Paper : https://arxiv.org/pdf/1706.03762

In [10]:
# Model name from Model Hub
model_name = 'yiyanghkust/finbert-tone'
# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
model.config

BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Neutral",
    "1": "Positive",
    "2": "Negative"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Negative": 2,
    "Neutral": 0,
    "Positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30873
}

In [12]:
id2label =  model.config.id2label
id2label

{0: 'Neutral', 1: 'Positive', 2: 'Negative'}

###Convert Sentence to Tokens

In [13]:
sentence = "Because of the New Trump Tariffs, the agriculture market stock is likely to fail"

In [14]:
inputs = tokenizer(sentence, return_tensors = "pt", padding = True, truncation = True, max_length = 512)


In [15]:
inputs

{'input_ids': tensor([[    3,   238,     7,     6,    56, 18960,  7458,   585,     6, 10164,
            52,    93,    17,   419,     9,  1485,     4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
#Put all tensors on same device
inputs = {k:v.to(device) for k,v in inputs.items()}

In [17]:
inputs

{'input_ids': tensor([[    3,   238,     7,     6,    56, 18960,  7458,   585,     6, 10164,
             52,    93,    17,   419,     9,  1485,     4]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [20]:
#Make prediction on tokens

In [18]:
with torch.no_grad():
  outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4463, -3.4584,  8.8568]], device='cuda:0'), hidden_states=None, attentions=None)

In [21]:
predictions = np.argmax(outputs.logits.cpu().numpy(), axis =1)

In [22]:
predictions

array([2])

In [23]:
mapped_label = [id2label[element] for element in predictions]

We Could construct it as an NLP Pipeline Using the Hugging face pipeline feature

In [24]:
mapped_label # The Sentence produces a negative Sentiment

['Negative']

We can change it to an NLP pipeline using the hugging face feature

In [25]:
sentiment_pipeline = pipeline(task ="sentiment-analysis", model =model, tokenizer = tokenizer, batch_size= 128, device = device)

Device set to use cuda:0


In [26]:
preds = sentiment_pipeline(df["text"].tolist())

In [27]:
preds

[{'label': 'Positive', 'score': 0.998542070388794},
 {'label': 'Negative', 'score': 0.9954423904418945},
 {'label': 'Positive', 'score': 0.9999998807907104},
 {'label': 'Neutral', 'score': 0.9999959468841553},
 {'label': 'Neutral', 'score': 0.999997615814209},
 {'label': 'Neutral', 'score': 0.9950066804885864},
 {'label': 'Positive', 'score': 0.9998606443405151},
 {'label': 'Neutral', 'score': 0.999988317489624},
 {'label': 'Positive', 'score': 0.9999998807907104},
 {'label': 'Neutral', 'score': 0.9999980926513672},
 {'label': 'Positive', 'score': 0.9999853372573853},
 {'label': 'Negative', 'score': 0.999984860420227},
 {'label': 'Neutral', 'score': 0.9999676942825317},
 {'label': 'Negative', 'score': 0.9952989220619202},
 {'label': 'Neutral', 'score': 0.9999977350234985},
 {'label': 'Neutral', 'score': 0.9999911785125732},
 {'label': 'Positive', 'score': 1.0},
 {'label': 'Neutral', 'score': 0.9993795156478882},
 {'label': 'Neutral', 'score': 0.9999998807907104},
 {'label': 'Neutral', 

In [28]:
df["predictions"] = [pred["label"] for pred in preds]

In [33]:
df

Unnamed: 0,text,Sentiment,predictions
0,The GeoSolutions technology will leverage Bene...,positive,Positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,Negative
2,"For the last quarter of 2010 , Componenta 's n...",positive,Positive
3,According to the Finnish-Russian Chamber of Co...,neutral,Neutral
4,The Swedish buyout firm has sold its remaining...,neutral,Neutral
...,...,...,...
5837,RISING costs have forced packaging producer Hu...,negative,Negative
5838,Nordic Walking was first used as a summer trai...,neutral,Neutral
5839,"According shipping company Viking Line , the E...",neutral,Negative
5840,"In the building and home improvement trade , s...",neutral,Negative


In [29]:
df.groupby(["Sentiment", "predictions"]).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Sentiment,predictions,Unnamed: 2_level_1
negative,Negative,523
negative,Neutral,313
negative,Positive,24
neutral,Negative,448
neutral,Neutral,2566
neutral,Positive,116
positive,Negative,67
positive,Neutral,815
positive,Positive,970


### We see that our model is not performing so well. For example, in possitve, there is almost an equal number of nuetal same in Negative. This might be due to the class imbalance.

In [34]:
df["predictions"] = df["predictions"].str.lower()


In [32]:
#Lets Calculate the balanced accurate since its better on imbalanced datasets

In [35]:
print(balanced_accuracy_score(df["Sentiment"], df["predictions"]))

0.6505686469816793


In [36]:
print(accuracy_score(df["Sentiment"], df["predictions"]))

0.6947963026360835


Lets Finetune the FinBert Model to improve its accuracy

In [37]:
# Capitalize the keys in label2id to match the case of Sentiment labels
label2id = {k.capitalize(): v for k, v in model.config.label2id.items()}

# Apply case-insensitive mapping from Sentiment to label2id
df["label"] = df["Sentiment"].apply(lambda l: label2id[l.capitalize()])
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,3130
1,1852
2,860


In [38]:
model.config.label2id

{'Positive': 1, 'Negative': 2, 'Neutral': 0}

Split out dataset into train, val, and test dfs'

In [39]:
train_end_point = int(df.shape[0]*0.6)
val_end_point = int(df.shape[0]*0.8)
df_train = df.iloc[:train_end_point,:]
df_val =df.iloc[train_end_point: val_end_point,:]
df_test = df.iloc[val_end_point:, :]
print(df_train.shape, df_test.shape, df_val.shape)

(3505, 4) (1169, 4) (1168, 4)


In [38]:
# Lets test the accuracy before trianing

In [41]:
preds=sentiment_pipeline(df_test['text'].tolist())
df_test['prediction']=[pred['label'] for pred in preds]
df_test['prediction'] = df['predictions'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction']=[pred['label'] for pred in preds]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = df['predictions'].str.lower()


In [42]:
print(balanced_accuracy_score(df_test['Sentiment'],df_test['prediction']))

0.6597675851747513


In [43]:
accuracy_score(df_test['Sentiment'],df_test['prediction'])

0.7057313943541489

Fine Tune Using HugginFace Trainer Class!

In [44]:
#Convert pandas DataFrame into Hugging Face Dataset Objects:
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)
dataset_val = Dataset.from_pandas(df_val)

In [45]:
# Tokenizing the datasets:
dataset_train = dataset_train.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_test = dataset_test.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length' , max_length=128), batched=True)

# Setting the dataset format: (needed for Pytorch?)
dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_test.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

# Shuffle the training dataset
dataset_train_shuffled = dataset_train.shuffle(seed=42)  # Using a seed for reproducibility

Map:   0%|          | 0/3505 [00:00<?, ? examples/s]

Map:   0%|          | 0/1168 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Train the Model

In [48]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels)}

args = TrainingArguments(
    output_dir='temp/',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="steps",  # Log every X steps
    logging_steps=50,  # Log every 50 steps
    learning_rate=2e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='balanced_accuracy',
)

trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=dataset_train_shuffled,         # training dataset
        eval_dataset=dataset_val,            # evaluation dataset
        compute_metrics=compute_metrics
)





In [50]:
pip install --upgrade transformers



/bin/bash: line 1: 2: No such file or directory


In [49]:
trainer.train()

Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,0.5359,0.550627,0.726529,0.776541
2,0.445,0.513957,0.733612,0.787671
3,0.4157,0.494966,0.731447,0.786815
4,0.3904,0.481747,0.720938,0.781678
5,0.3455,0.479943,0.725362,0.782534
6,0.3122,0.488339,0.718987,0.784247
7,0.3122,0.488748,0.718815,0.781678
8,0.2928,0.496946,0.71733,0.780822
9,0.2643,0.504584,0.712795,0.779966
10,0.2634,0.507934,0.721269,0.785103


TrainOutput(global_step=2200, training_loss=0.2869776825471358, metrics={'train_runtime': 2065.1049, 'train_samples_per_second': 33.945, 'train_steps_per_second': 1.065, 'total_flos': 4611062645683200.0, 'train_loss': 0.2869776825471358, 'epoch': 20.0})

In [50]:
predictions = trainer.predict(dataset_test)
predictions

PredictionOutput(predictions=array([[ 4.290045  , -3.056845  , -4.100369  ],
       [ 0.5128855 , -1.348583  , -1.8100287 ],
       [ 1.0716122 , -2.6724224 , -1.3616703 ],
       ...,
       [ 1.2836261 , -3.6943896 , -0.1821409 ],
       [ 0.45959002, -3.9305341 ,  1.2626249 ],
       [-0.8983517 ,  0.69362456, -3.0182805 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.49225348234176636, 'test_balanced_accuracy': 0.7539672818868467, 'test_accuracy': 0.7912745936698032, 'test_runtime': 8.3106, 'test_samples_per_second': 140.664, 'test_steps_per_second': 4.452})

In [21]:
model_path = r"C:\Users\ugoch\OneDrive\Desktop\Summer Projects\FinancialAnalysis\Financial-Analysis\ml\FInBertModel"


# Save the model
trainer.model.save_pretrained(model_path)

# Save the tokenizer associated with the model
# Save the tokenizer
tokenizer.save_pretrained(model_path)



NameError: name 'trainer' is not defined

In [26]:
trained_pipeline = pipeline("text-classification", model=model_path, tokenizer=model_path,device=-1)

Device set to use cpu


In [27]:
# Calculate the balanced accuracy score
score = balanced_accuracy_score(df_test['Sentiment'], df_test['prediction'].str.lower())
print(f"Balanced Accuracy Score: {score}")


NameError: name 'balanced_accuracy_score' is not defined

In [60]:
# Calculate the balanced accuracy score
score = accuracy_score(df_test['Sentiment'], df_test['prediction'].str.lower())
print(f"Accuracy Score: {score}")


Accuracy Score: 0.7912745936698032


Let's Upload the model to Hugging Face Model Hub


In [79]:
import os

In [81]:
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="/content/path",
    repo_id="Belusochim/Fine-Tune-FinBert",
    repo_type="model",
)

Uploading...:   0%|          | 0.00/439M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Belusochim/Fine-Tune-FinBert/commit/4a8c5e2facc3b95dcec50ade819f0aea506354fe', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4a8c5e2facc3b95dcec50ade819f0aea506354fe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Belusochim/Fine-Tune-FinBert', endpoint='https://huggingface.co', repo_type='model', repo_id='Belusochim/Fine-Tune-FinBert'), pr_revision=None, pr_num=None)

In [6]:
model_path = "path/to/save/model"


In [19]:
from transformers import BertTokenizer, BertForSequenceClassification

# Use the correct directory path for model and tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

OSError: Can't load tokenizer for 'to/save/model/tokenizer.json'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'to/save/model/tokenizer.json' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [11]:
trained_pipeline = pipeline("text-classification", model=model_path, tokenizer=model_path,device=0)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'path/to/save/model'. Use `repo_type` argument if needed.