In [2]:
# Importing all needed modules.
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import DataCollatorWithPadding

from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import get_scheduler
import evaluate

import torch
from tqdm.auto import tqdm

In [3]:
# Importing the data set.
dataset = load_dataset("FinanceInc/auditor_sentiment")

Using custom data configuration demo-org--auditor_review-77f48794e3d06c46
Found cached dataset parquet (C:/Users/Asus VivoBook/.cache/huggingface/datasets/FinanceInc___parquet/demo-org--auditor_review-77f48794e3d06c46/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3877
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 969
    })
})

In [17]:
# Setting up the training hyperperemaeters.
MODEL_NAME = "roberta-base"
LEARNING_RATE = 5e-5
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_CLASSES = 3
NUM_EPOCHS = 10

In [6]:
# Creation of the ROBERTA tokenizer.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Creating the tokeniztion function.
def tokenize(sample):
    return tokenizer(sample["sentence"], truncation=True)

In [8]:
# Tokenising the dataset.
tokenized_datasets = dataset.map(tokenize, batched=True)

Loading cached processed dataset at C:\Users\Asus VivoBook\.cache\huggingface\datasets\FinanceInc___parquet\demo-org--auditor_review-77f48794e3d06c46\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-633cf463999a58fe.arrow
Loading cached processed dataset at C:\Users\Asus VivoBook\.cache\huggingface\datasets\FinanceInc___parquet\demo-org--auditor_review-77f48794e3d06c46\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-a71b4d2f9958a4fd.arrow


In [9]:
# Creating the data collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Preparing the datasets for training.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence"])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['label', 'input_ids', 'attention_mask']

In [11]:
# Creating the data loaders.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size = 8, collate_fn = data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8, collate_fn=data_collator
)

In [13]:
# Creation of the ROBERTA model.
roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=NUM_CLASSES)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [14]:
# Freezing the base model of reoberta.
for param in roberta.base_model.parameters():
    param.requires_grad = False

In [15]:
# Creating the optimizer.
optimizer = AdamW(roberta.parameters(), lr=LEARNING_RATE, no_deprecation_warning=True)

In [18]:
# Setting up the Learning Rate shceduler.
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

4850


In [19]:
# Setting up the ROBERTA model to DEVICE.
roberta.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [23]:
# Training the model.
progress_bar = tqdm(range(num_training_steps))

roberta.train()
for epoch in range(NUM_EPOCHS):
    accuracy_metric = evaluate.load("accuracy")
    for batch in train_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = roberta(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    print(f"EPOCH - {epoch}")
    print(f"accuracy = {accuracy_metric.compute()['accuracy']}")

  0%|          | 0/4850 [00:00<?, ?it/s]

EPOCH - 0
accuracy = 0.5914366778436936
EPOCH - 1
accuracy = 0.59994841372195
EPOCH - 2
accuracy = 0.6058808356977045
EPOCH - 3
accuracy = 0.6082022182099561
EPOCH - 4
accuracy = 0.62677327830797
EPOCH - 5
accuracy = 0.6259994841372195
EPOCH - 6
accuracy = 0.6355429455764766
EPOCH - 7
accuracy = 0.6404436419912304
EPOCH - 8
accuracy = 0.6430229558937323
EPOCH - 9
accuracy = 0.645344338405984


In [24]:
# Unfreezing the last 3 layers of the base model.
for param in roberta.base_model.encoder.layer[-1].output.parameters():
    param.requires_grad = False
# In the task wasn't specified that need to be finetuned only the last 3 layers of the base model 
# without the classifier, if so uncomment the code below:
#for param in roberta.classifier.parameters():
#    param.requires_grad = False

In [25]:
# Updatin the number of epochs for faster training.
NUM_EPOCHS = 3

In [26]:
# Setting up the Learning Rate shceduler.
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1455


In [27]:
# Retraining the Roberta model.
progress_bar = tqdm(range(num_training_steps))

roberta.train()
for epoch in range(NUM_EPOCHS):
    accuracy_metric = evaluate.load("accuracy")
    for batch in train_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = roberta(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    print(f"EPOCH - {epoch}")
    print(f"accuracy = {accuracy_metric.compute()['accuracy']}")

  0%|          | 0/1455 [00:00<?, ?it/s]

EPOCH - 0
accuracy = 0.6396698478204798
EPOCH - 1
accuracy = 0.6587567706989941
EPOCH - 2
accuracy = 0.6566933195769925


In [29]:
# Final evaluation on the test set.
roberta.eval()
accuracy_metric = evaluate.load("accuracy")
for batch in eval_dataloader:
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    with torch.no_grad():
        outputs = roberta(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

In [30]:
# Looking at the accuracy of the model.
accuracy_metric.compute()["accuracy"]

0.6480908152734778

In [33]:
roberta(**batch).logits

tensor([[-0.3302,  0.2312,  0.0192]], grad_fn=<AddmmBackward0>)

In [34]:
from transformers import Pipeline

In [72]:
# Defining the new class of Pipelinne.
class RobertaPipe(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        # Skip.
        return {}, {}, {}
    
    def preprocess(self, inputs):
        # This function prepares the input for the mode.
        model_input = self.tokenizer(inputs)
        model_input["input_ids"] = torch.tensor([model_input["input_ids"]])
        model_input["attention_mask"] = torch.tensor([model_input["attention_mask"]])
        return model_input
    
    def _forward(self, model_inputs):
        # This function makes the prediction based on the inputs.
        outputs = self.model(**model_inputs)
        return outputs
    
    def postprocess(self, model_outputs):
        # This function creates calcualtes the sentiment score from predicted probabilities.
        probs = torch.softmax(outputs.logits, dim=-1).detach().numpy()[0]
        
        sentiment_score = 0 * probs[0] + 0.5 * probs[1] + 1 * probs[2]
        return sentiment_score

In [73]:
my_pipe = RobertaPipe(model=roberta, tokenizer=tokenizer)

In [74]:
my_pipe("Hello")

0.5501276254653931