**1. Data Loading & Pre-Processing**

In [None]:
import numpy as np
import pandas as pd

!pip install transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from tqdm.notebook import tqdm

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import time

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
# Select the compute device: CUDA when a GPU is available, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


In [None]:
# Mount Google Drive at /content/drive so the dataset and the saved model
# under /content/drive/MyDrive can be read and written (Colab-specific module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the labelled phrases from Drive into a dataframe. Column names are
# supplied explicitly, so no row of the file is consumed as a header.
df = pd.read_csv(
    "/content/drive/MyDrive/BERT-Final/financialDataset.csv",
    header = None,
    names = ['phrase', 'sentiment'],
)
df.head()

Unnamed: 0,phrase,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [None]:
# Class balance of the sentiment column, followed by the dataset size.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)
print("\nTotal rows: ", len(df))

neutral     6009
positive    3215
negative    1464
Name: sentiment, dtype: int64

Total rows:  10688


In [None]:
# Fixed class order: a label's position in this list is its integer class id.
class_labels = ['negative', 'neutral', 'positive']

# Lookup table from sentiment string to integer class id.
labels = {tag: index for index, tag in enumerate(class_labels)}

labels

{'negative': 0, 'neutral': 1, 'positive': 2}

In [None]:
# Encode the sentiment strings as integer class ids using `labels`.
# Guarded so the cell is idempotent: once the column already holds only class
# ids, mapping it again would look the ids up as keys in `labels` and replace
# every value with NaN.
sentiment_is_encoded = df['sentiment'].isin(list(labels.values())).all()
if not sentiment_is_encoded:
  df['sentiment'] = df['sentiment'].map(labels)
df.head()

Unnamed: 0,phrase,sentiment
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


In [None]:
# hold out 10% of the data (stratified by sentiment); the remaining 90% is the training set
train_set, test_set = train_test_split(df, test_size=0.1, random_state=42, stratify = df.sentiment.values)

# split the held-out 10% in half (50/50, stratified by sentiment) into validation and test sets
validation_set, test_set = train_test_split(test_set, test_size=0.5, random_state=42, stratify = test_set.sentiment.values)
print(len(train_set), len(validation_set), len(test_set))

9619 534 535


In [None]:
# Preview the training split; the row index shows the rows were shuffled by the split.
train_set

Unnamed: 0,phrase,sentiment
7805,Mursula said they tried to gather macro-econom...,1
8987,"At the close , the OMX Helsinki 25 was 0.01 pc...",0
220,"In the third quarter , net sales increased by ...",2
7511,"Currently , the company uses eight similar rea...",1
3221,The contract is for next year .,1
...,...,...
5932,Short on $ATVI from 24.55.,0
5388,BAE Systems's sales boosted by European Typhoo...,2
4778,`` Low energy consumption and flexible loading...,1
4446,Kone shares dropped 4.1 percent to x20ac 43 U...,0


In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint from Hugging Face
# (the same checkpoint name the classifier's BertModel is loaded from).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Longest phrase measured in BERT tokens (special tokens included).
# The result informs the tokenizer's `max_length`, which counts tokens, so the
# phrases are tokenized here rather than measured in characters with len(text).
# str() also keeps a non-string cell (e.g. a missing value) from raising.
max_len = 50  # lower bound: never report less than 50
for text in df.phrase.values:
  n_tokens = len(tokenizer.encode(str(text), add_special_tokens = True))
  if n_tokens > max_len:
    max_len = n_tokens

print(max_len)

315


In [None]:
# Encoder settings: padded sequence length (in tokens) and batch size.
max_len = 320
batchSize = 16

def encoder(phrases, targets):
  """Tokenize every phrase and pair it with its target label.

  Args:
    phrases: positionally indexable collection of texts; each item is passed
      through str() before tokenization.
    targets: collection of labels indexed by the same positions as `phrases`.

  Returns:
    A list with one dict per phrase containing 'input_ids' and
    'attention_mask' (1-D tensors padded/truncated to `max_len`) and
    'targets' (a torch.long tensor built from the label).
  """
  def encode_one(position):
    # Tokenize a single phrase to a fixed length of `max_len` tokens.
    tokens = tokenizer.encode_plus(
        str(phrases[position]),
        add_special_tokens = True,
        return_attention_mask = True,
        return_token_type_ids = False,
        padding = 'max_length',
        max_length = max_len,
        truncation=True,
        return_tensors='pt'
    )
    # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
    return {
        'input_ids' : tokens['input_ids'].flatten(),
        'attention_mask' : tokens['attention_mask'].flatten(),
        'targets' :  torch.tensor(targets[position], dtype = torch.long)
    }

  return [encode_one(position) for position in range(len(phrases))]

In [None]:
def dataloader(dataset, batchSize, shuffle = True):
  """Encode a dataframe split and wrap it in a torch DataLoader.

  Args:
    dataset: dataframe with `phrase` and `sentiment` columns.
    batchSize: number of samples per batch.
    shuffle: whether the samples are reshuffled on every pass. Defaults to
      True, which was previously hard-coded; pass False for validation/test
      loaders to get a fixed batch order.

  Returns:
    A DataLoader over the encoded samples; each batch is a dict with
    'input_ids', 'attention_mask' and 'targets'.
  """
  encoded_dataset = encoder(
      phrases = dataset.phrase.to_numpy(),
      targets = dataset.sentiment.to_numpy()
  )

  return DataLoader(
      encoded_dataset,
      shuffle = shuffle,
      batch_size = batchSize,
      # Pinned host memory only benefits host-to-GPU copies; requesting it on
      # a CPU-only runtime may just emit a warning, so enable it with CUDA only.
      pin_memory = torch.cuda.is_available()
  )

In [None]:
# Build one loader per split (train, validation, test), all with the same batch size.
dataloader_train, dataloader_validation, dataloader_test = [
    dataloader(split, batchSize)
    for split in (train_set, validation_set, test_set)
]

In [None]:
# Pull a single batch from the training loader to sanity-check its contents.
first_batch = next(iter(dataloader_train))

# Unpack the batch fields in the order they are printed below.
input_ids, attention_mask, targets = (
    first_batch[field] for field in ("input_ids", "attention_mask", "targets")
)

# Show the token ids, the attention mask and the labels of the batch.
for batch_tensor in (input_ids, attention_mask, targets):
  print(batch_tensor)

tensor([[  101,  1203,  9742,  ...,     0,     0,     0],
        [  101, 11121,  1113,  ...,     0,     0,     0],
        [  101,  3929, 23226,  ...,     0,     0,     0],
        ...,
        [  101, 20820,  3813,  ...,     0,     0,     0],
        [  101, 16890,  6140,  ...,     0,     0,     0],
        [  101,   118, 10511,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([2, 2, 2, 1, 2, 2, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1])


In [None]:
class classifier(nn.Module):
  """Pretrained 'bert-base-cased' encoder followed by dropout and a linear output layer."""

  def __init__(self, classes):
    """Build the network.

    Args:
      classes: number of output units of the final linear layer.
    """
    super().__init__()
    # return_dict=False makes the encoder return a plain tuple instead of an output object.
    self.model = BertModel.from_pretrained('bert-base-cased', return_dict=False)
    self.drop = nn.Dropout(p=0.5)
    self.out = nn.Linear(self.model.config.hidden_size, classes)

  def forward(self, input_ids, attention_mask):
    """Return the linear layer's scores for a batch of token ids and attention masks."""
    encoder_outputs = self.model(
      input_ids = input_ids,
      attention_mask = attention_mask
    )
    # The encoder yields a 2-tuple; only the second item (the pooled output) is used.
    _, pooled = encoder_outputs
    return self.out(self.drop(pooled))

In [None]:
# One output unit per sentiment class; place the network on the selected device.
model = classifier(len(class_labels)).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
epochs = 10

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    betas = (0.9, 0.999),
    eps = 1e-8,
    weight_decay = 0.01,
)

# Linear learning-rate schedule without warmup, spanning one step per training batch per epoch.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_training_steps,
)

loss_fxn = nn.CrossEntropyLoss().to(device)

In [None]:
def trainer(dataLoader, epoch, n):
  """Run one training epoch over `dataLoader` and report training metrics.

  Uses the notebook-level `model`, `optimizer`, `scheduler`, `loss_fxn` and
  `device`.

  Args:
    dataLoader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets'.
    epoch: zero-based epoch index; used only for the progress-bar label.
    n: unused; kept so existing calls keep working. Averages are taken over
      the number of samples actually seen.

  Returns:
    Tuple (loss, accuracy, precision, recall, f1). `loss` is the per-sample
    average over the epoch; precision, recall and F1 use average='weighted'.
  """
  model.train()

  loss_sum = 0.0 #sum of per-sample losses over the epoch
  predictions_list = []
  targets_list = []

  for batch in tqdm(dataLoader, desc = 'Epoch {}'.format(epoch+1), leave=False):
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    targets = batch['targets'].to(device)

    # Clear gradients before the forward pass so that gradients left over from
    # an interrupted earlier run of this cell are never applied.
    optimizer.zero_grad()

    outputs = model(
        input_ids = ids,
        attention_mask = mask
    )

    _, predictions = torch.max(outputs, dim = 1) #predictions are indices corresponding to the different classes
    loss  = loss_fxn(outputs, targets)

    predictions_list.extend(predictions.detach().cpu().numpy())  # Store predictions
    targets_list.extend(targets.detach().cpu().numpy())  # Store targets
    # Assumes loss_fxn returns the batch mean (default reduction); weighting by
    # the batch size keeps a smaller final batch from being over-weighted.
    loss_sum += loss.item() * targets.size(0)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  predictions = np.array(predictions_list)
  targets = np.array(targets_list)

  # Per-sample average; NaN when the loader produced no samples.
  loss = loss_sum / len(targets_list) if targets_list else float('nan')
  accuracy = accuracy_score(targets, predictions)
  precision = precision_score(targets, predictions, average='weighted')
  recall = recall_score(targets, predictions, average='weighted')
  f1 = f1_score(targets, predictions, average='weighted')

  return loss, accuracy, precision, recall, f1

In [None]:
#defining the model evaluator
def evaluator(dataLoader, n):
  """Evaluate the notebook-level `model` on `dataLoader` without updating it.

  Uses the notebook-level `model`, `loss_fxn` and `device`.

  Args:
    dataLoader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets'.
    n: unused; kept so existing calls keep working. Averages are taken over
      the number of samples actually seen.

  Returns:
    Tuple (loss, accuracy, precision, recall, f1). `loss` is the per-sample
    average, so it does not depend on how samples fall into batches;
    precision, recall and F1 use average='weighted'.
  """
  model.eval()

  loss_sum = 0.0 #sum of per-sample losses over the whole loader
  predictions_list = []
  targets_list = []

  with torch.no_grad():
    for batch in dataLoader:
      ids = batch['input_ids'].to(device)
      mask = batch['attention_mask'].to(device)
      targets = batch['targets'].to(device)

      outputs = model(
          input_ids = ids,
          attention_mask = mask
      )

      _, predictions = torch.max(outputs, dim = 1) #predictions are indices corresponding to the different classes
      loss  = loss_fxn(outputs, targets)

      predictions_list.extend(predictions.detach().cpu().numpy())  # Store predictions
      targets_list.extend(targets.detach().cpu().numpy())  # Store targets
      # Assumes loss_fxn returns the batch mean (default reduction); weighting
      # by the batch size keeps a smaller final batch from being over-weighted.
      loss_sum += loss.item() * targets.size(0)

  predictions = np.array(predictions_list)
  targets = np.array(targets_list)

  # Per-sample average; NaN when the loader produced no samples.
  loss = loss_sum / len(targets_list) if targets_list else float('nan')
  accuracy = accuracy_score(targets, predictions)
  precision = precision_score(targets, predictions, average='weighted')
  recall = recall_score(targets, predictions, average='weighted')
  f1 = f1_score(targets, predictions, average='weighted')

  return loss, accuracy, precision, recall, f1

In [None]:
# Destination on Drive for the model with the best validation accuracy seen so far.
file_path = '/content/drive/MyDrive/BERT-Final/best_model.pt'

best_accuracy = 0
total_time = 0

# One pass per epoch: train, evaluate on the validation split, report, and
# save the model whenever validation accuracy improves.
for epoch in range(epochs):
  start_time = time.time()

  t_loss, t_accuracy, t_precision, t_recall, t_f1 = trainer(
      dataloader_train,
      epoch,
      len(train_set)
  )

  v_loss, v_accuracy, v_precision, v_recall, v_f1 = evaluator(
      dataloader_validation,
      len(validation_set)
  )
  # The timed span covers both the training pass and the validation pass.
  end_time = time.time()
  time_taken = end_time - start_time
  total_time += time_taken

  print("Epoch {ep} | Time: {time} seconds".format(ep = (epoch + 1), time = round(time_taken,3)))
  print("Training ---> Accuracy: {a} | Precision: {p} | Recall: {r} | F1-Score: {f} | Loss: {l}".format(a = round(t_accuracy, 3), p = round(t_precision, 3), r = round(t_recall, 3), f = round(t_f1, 3), l = round(t_loss, 3)))
  print("Validation -> Accuracy: {a} | Precision: {p} | Recall: {r} | F1-Score: {f} | Loss: {l}".format(a = round(v_accuracy, 3), p = round(v_precision, 3), r = round(v_recall, 3), f = round(v_f1, 3), l = round(v_loss, 3)))
  print()

  # Strict '>' means a tie keeps the model saved at the earlier epoch.
  # NOTE(review): torch.save(model, ...) pickles the whole module object rather
  # than its state_dict — TODO confirm this is the intended checkpoint format.
  if(v_accuracy > best_accuracy):
    torch.save(model, file_path)
    best_accuracy = v_accuracy

print("Best Accuracy: {acc} | Total Training Time: {time} seconds".format(acc = round(best_accuracy, 3), time = round(total_time, 3)))

Epoch 1:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 1 | Time: 526.354 seconds
Training ---> Accuracy: 0.776 | Precision: 0.776 | Recall: 0.776 | F1-Score: 0.776 | Loss: 0.534
Validation -> Accuracy: 0.843 | Precision: 0.865 | Recall: 0.843 | F1-Score: 0.847 | Loss: 0.378



Epoch 2:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 2 | Time: 523.941 seconds
Training ---> Accuracy: 0.887 | Precision: 0.896 | Recall: 0.887 | F1-Score: 0.89 | Loss: 0.287
Validation -> Accuracy: 0.843 | Precision: 0.831 | Recall: 0.843 | F1-Score: 0.833 | Loss: 0.43



Epoch 3:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 3 | Time: 522.529 seconds
Training ---> Accuracy: 0.917 | Precision: 0.926 | Recall: 0.917 | F1-Score: 0.92 | Loss: 0.211
Validation -> Accuracy: 0.871 | Precision: 0.887 | Recall: 0.871 | F1-Score: 0.875 | Loss: 0.404



Epoch 4:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 4 | Time: 523.008 seconds
Training ---> Accuracy: 0.931 | Precision: 0.941 | Recall: 0.931 | F1-Score: 0.934 | Loss: 0.161
Validation -> Accuracy: 0.882 | Precision: 0.895 | Recall: 0.882 | F1-Score: 0.885 | Loss: 0.447



Epoch 5:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 5 | Time: 523.573 seconds
Training ---> Accuracy: 0.936 | Precision: 0.947 | Recall: 0.936 | F1-Score: 0.939 | Loss: 0.142
Validation -> Accuracy: 0.882 | Precision: 0.9 | Recall: 0.882 | F1-Score: 0.886 | Loss: 0.478



Epoch 6:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 6 | Time: 523.171 seconds
Training ---> Accuracy: 0.941 | Precision: 0.952 | Recall: 0.941 | F1-Score: 0.944 | Loss: 0.127
Validation -> Accuracy: 0.884 | Precision: 0.894 | Recall: 0.884 | F1-Score: 0.887 | Loss: 0.515



Epoch 7:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 7 | Time: 523.669 seconds
Training ---> Accuracy: 0.946 | Precision: 0.957 | Recall: 0.946 | F1-Score: 0.948 | Loss: 0.116
Validation -> Accuracy: 0.89 | Precision: 0.906 | Recall: 0.89 | F1-Score: 0.893 | Loss: 0.566



Epoch 8:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 8 | Time: 524.146 seconds
Training ---> Accuracy: 0.949 | Precision: 0.961 | Recall: 0.949 | F1-Score: 0.952 | Loss: 0.111
Validation -> Accuracy: 0.893 | Precision: 0.907 | Recall: 0.893 | F1-Score: 0.897 | Loss: 0.551



Epoch 9:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 9 | Time: 523.78 seconds
Training ---> Accuracy: 0.949 | Precision: 0.96 | Recall: 0.949 | F1-Score: 0.951 | Loss: 0.106
Validation -> Accuracy: 0.876 | Precision: 0.889 | Recall: 0.876 | F1-Score: 0.88 | Loss: 0.573



Epoch 10:   0%|          | 0/602 [00:00<?, ?it/s]

Epoch 10 | Time: 523.63 seconds
Training ---> Accuracy: 0.95 | Precision: 0.961 | Recall: 0.95 | F1-Score: 0.953 | Loss: 0.1
Validation -> Accuracy: 0.886 | Precision: 0.897 | Recall: 0.886 | F1-Score: 0.889 | Loss: 0.601

Best Accuracy: 0.893 | Total Training Time: 5237.8 seconds


In [None]:
# Disabled snippet kept as a string literal: nothing in it is executed, the
# cell only echoes the text. It sketches saving/loading via state_dicts.
# NOTE(review): it refers to `output_model`, which is not defined in the cells shown here.
'''
# save best model state
def save(model, optimizer):
    # save
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

# load
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
'''

"\n# save best model state\ndef save(model, optimizer):\n    # save\n    torch.save({\n        'model_state_dict': model.state_dict(),\n        'optimizer_state_dict': optimizer.state_dict()\n    }, output_model)\n\n# load\ncheckpoint = torch.load(output_model, map_location='cpu')\nmodel.load_state_dict(checkpoint['model_state_dict'])\noptimizer.load_state_dict(checkpoint['optimizer_state_dict'])\n"

In [None]:
# Disabled snippet kept as a string literal (not executed). The epoch loop
# already performs this save-on-improvement step using `v_accuracy`.
# NOTE(review): `val_acc` is not defined in the cells shown here.
'''file_path = '/content/drive/MyDrive/BERT-Final/best_model.pt'

if val_acc > best_accuracy:
    torch.save(model, file_path)
    best_accuracy = val_acc
'''

"file_path = '/content/drive/MyDrive/BERT-Final/best_model.pt'\n\nif val_acc > best_accuracy:\n    torch.save(model, file_path)\n    best_accuracy = val_acc\n"

In [None]:
# Disabled snippet kept as a string literal (not executed): it would reload
# the model object previously written to `file_path` with torch.save(model, ...).
'''#Loading the model
model = torch.load(file_path)'''

'#Loading the model\nmodel = torch.load(file_path)'