# Fine-Tuning Longformer For Financial Sentiment Analysis

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Switch the working directory to the project folder on the mounted drive;
# relative paths used later in the notebook (e.g. 'FinancialPhrasebank.csv',
# './Fine_Tuned_LongFormer') resolve against this directory.
%cd /content/gdrive/MyDrive/Fine_tune_longformer/

Mounted at /content/gdrive
/content/gdrive/MyDrive/Fine_tune_longformer


### Load libraries

In [None]:
! pip install transformers[torch] --quiet
! pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's d

In [None]:
# Print the installed transformers package metadata (name, version, location,
# dependencies) so the environment used for this run is recorded in the output.
!pip show transformers

Name: transformers
Version: 4.41.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [None]:
from transformers import LongformerForSequenceClassification, LongformerTokenizerFast, Trainer, TrainingArguments, get_polynomial_decay_schedule_with_warmup, AdamW
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Initialise pre-trained longformer model

In [None]:
# Pre-trained checkpoint shared by the model and its tokenizer
model_name = "allenai/longformer-base-4096"

# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence-classification head with 3 output labels on top of Longformer
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    attention_window=512,
    gradient_checkpointing=False,
)
model = model.to(device)

# Matching fast tokenizer for the same checkpoint
tokenizer = LongformerTokenizerFast.from_pretrained(model_name, max_length=1024)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Load financial phrasebank data
The data is loaded as a Hugging Face `datasets` dataset, the sentiment is converted to numerical labels and the headlines are tokenised using the Longformer tokeniser. The dataset is then split into training and testing data using a 90/10 split.

In [None]:
# Load CSV file into a dataset object
dataset = load_dataset('csv', data_files='FinancialPhrasebank.csv', split='train')

# Convert sentiment to numerical labels
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
dataset = dataset.map(lambda x: {'text': x['Headline'], 'label': sentiment_mapping[x['Sentiment']]})
dataset = dataset.select_columns(['text', 'label'])
print("Data Loaded.")

# Tokenise the text (every example is padded/truncated to 1024 tokens)
dataset = dataset.map(lambda e: tokenizer(e['text'], padding = 'max_length', truncation=True, max_length = 1024), batched=True)
# Expose only the tensors the model consumes, as torch tensors
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Print dataset information
print("Dataset Tokenised.")
print(dataset)

# Split the dataset into train and test sets.
# The split is performed ONCE and both halves are taken from the same result:
# calling train_test_split() separately for each half shuffles independently,
# so test rows would also appear in the training set and inflate the metrics.
# A fixed seed makes the partition reproducible across kernel restarts.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_data = split['train']
test_data = split['test']

# Sanity check: the two halves partition the full dataset
assert len(train_data) + len(test_data) == len(dataset)

# Print dataset sizes
print("Train dataset size:", len(train_data))
print("Test dataset size:", len(test_data))

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4730 [00:00<?, ? examples/s]

Data Loaded.


Map:   0%|          | 0/4730 [00:00<?, ? examples/s]

Dataset Tokenised.
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4730
})
Train dataset size: 4257
Test dataset size: 473


### A function to compute the training accuracy metrics

In [None]:
# Accuracy metrics
def compute_metrics(pred):
  """Score one evaluation pass.

  `pred` must expose `label_ids` (the reference labels) and `predictions`
  (per-class scores; the arg-max over the last axis is taken as the
  predicted class). Returns a dict with 'accuracy' plus the
  weighted-average 'precision', 'recall' and 'f1'.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)

  # Accuracy takes no averaging mode; the remaining scorers all share
  # the same 'weighted' averaging, so they are driven from one table.
  scores = {'accuracy': accuracy_score(y_true, y_pred)}
  weighted_scorers = (
      ('precision', precision_score),
      ('recall', recall_score),
      ('f1', f1_score),
  )
  for metric_name, scorer in weighted_scorers:
    scores[metric_name] = scorer(y_true, y_pred, average='weighted')

  return scores

### Training hyperparameters
These are the same hyperparameters used by the authors in the original Longformer paper with the remainder being the same as RoBERTa. Learning rate is set to 3e-5 with an effective training batch size of 64. Warmup steps equal ~1% of the total training steps. Training is done for 5 epochs at a time to conserve GPU usage.

[Longformer Paper](https://arxiv.org/abs/2004.05150)
| [RoBERTa Paper](https://arxiv.org/abs/1907.11692)

In [None]:
# Output Directory (checkpoints and the final model are written here)
output_directory = "./Fine_Tuned_LongFormer"

# Training arguments
training_args = TrainingArguments(
    output_dir=output_directory,
    overwrite_output_dir=True,
    learning_rate=3e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    # 8 micro-batches of 8 are accumulated per optimiser update,
    # giving an effective batch size of 64 per device.
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    evaluation_strategy="epoch",
    # Log the training loss once per epoch. With the default step-based
    # logging interval (500 steps) a run this short never reaches a logging
    # step, which shows up as "No log" in the Training Loss column.
    logging_strategy="epoch",
    # Mixed precision needs a CUDA device; enable it only when one is
    # available so the arguments stay valid on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # Checkpoint every epoch (must match the evaluation strategy for
    # load_best_model_at_end) and reload the best checkpoint when done.
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### Optimiser & Learning Rate Scheduler
Hyperparameters from the Longformer & RoBERTa papers. Adam optimiser with β1 = 0.9, β2 = 0.999, ε = 1e-6 and L2 weight decay of 0.01. Learning rate linearly increases to 3e-5 followed by a power 3 polynomial decay to 1e-7.

In [None]:
# Optimiser: Adam with decoupled weight decay, hyperparameters as described above
optimiser = AdamW(model.parameters(), lr=training_args.learning_rate, betas=(0.9, 0.999), eps=1e-06, weight_decay=0.01)

# Define learning rate scheduler
# The scheduler is stepped once per optimiser update, not once per micro-batch,
# so the schedule length must account for gradient accumulation. Counting
# micro-batches instead would stretch the schedule far past the end of
# training and the polynomial decay would barely progress.
# NOTE(review): assumes a single training device; with several GPUs the
# per-step batch is per_device_train_batch_size * number of devices.
micro_batches_per_epoch = -(-len(train_data) // training_args.per_device_train_batch_size)  # ceil division
updates_per_epoch = max(micro_batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)

# Linear warmup to the peak learning rate, then power-3 polynomial decay to lr_end
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimiser,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps,
    lr_end=1e-7,
    power=3.0,
)

### Define The Trainer

In [None]:
# Assemble the Trainer: model + arguments, the train/eval splits,
# the metric callback, and the custom (optimiser, scheduler) pair
# that replaces the Trainer's built-in defaults.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    optimizers=(optimiser, scheduler),
)

In [None]:
# Train the model: run the fine-tuning loop configured on `trainer`.
# The returned TrainOutput (global step, mean training loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,0.423054,0.830867,0.864346,0.830867,0.835937
1,No log,0.208103,0.932347,0.934432,0.932347,0.932771
2,No log,0.121691,0.966173,0.96665,0.966173,0.966271
3,No log,0.10368,0.959831,0.96083,0.959831,0.960049
4,No log,0.071231,0.978858,0.978925,0.978858,0.978882


TrainOutput(global_step=330, training_loss=0.32871736468690815, metrics={'train_runtime': 4773.5287, 'train_samples_per_second': 4.459, 'train_steps_per_second': 0.069, 'total_flos': 1.3854413847011328e+16, 'train_loss': 0.32871736468690815, 'epoch': 4.953095684803002})

In [None]:
# Save the model currently held by the trainer into `output_directory`
# so it can be reloaded later without retraining.
trainer.save_model(output_directory)

In [None]:
# For resuming a stopped training: continue from a checkpoint previously
# saved by this trainer instead of starting again from scratch.
# Only needed after an interrupted run. When training has already finished
# there is nothing left to do and the call returns almost immediately
# (the recorded output reports training_loss=0.0 and a ~1.7 s runtime).
trainer.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=330, training_loss=0.0, metrics={'train_runtime': 1.7286, 'train_samples_per_second': 12313.492, 'train_steps_per_second': 190.907, 'total_flos': 1.3854413847011328e+16, 'train_loss': 0.0, 'epoch': 4.953095684803002})