## Installing libraries

In [None]:
!pip install transformers

## Importing libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from tqdm import tqdm
import csv

## Code for fine-tuning
Note: The "output.csv" file must be present in the same directory as this notebook ("fine_tuning_rte.ipynb").

Public link to output.csv: https://drive.google.com/file/d/1fm6SDn0TQckZqLFs7ctNODOZNPLC1W_z/view?usp=share_link

In [None]:
# Read every row of the CSV export into a list of dicts with the keys
# 'claim', 'evidence' and 'label'. The 'evidence' column holds several
# items in one field, delimited by '|', so it is split into a list here.
EVIDENCE_SEPARATOR = "|"

with open('output.csv', 'r', newline='', encoding='utf-8') as csvfile:
    training_data = [
        {
            'claim': record['claim'],
            'evidence': record['evidence'].split(EVIDENCE_SEPARATOR),
            'label': record['label'],
        }
        for record in csv.DictReader(csvfile)
    ]

# Checkpoint name shared by the model weights and the tokenizer.
checkpoint = "roberta-large-mnli"

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the tokenizer and the sequence-classification model for textual
# entailment, then move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = model.to(device)

# Encode the training data into fixed-length model inputs.
#
# Only the first MAX_TRAINING_EXAMPLES rows are used. Each (claim, evidence)
# pair is tokenized as a sentence pair, padded to MAX_SEQ_LENGTH and -- this
# is essential -- truncated to MAX_SEQ_LENGTH, so that every encoded row has
# the same width and can be concatenated into a single tensor.
MAX_TRAINING_EXAMPLES = 1000
MAX_SEQ_LENGTH = 512

# Label string -> class id. Any label other than these two maps to
# OTHER_LABEL_ID.
# NOTE(review): the "roberta-large-mnli" checkpoint presumably ships with its
# own class ordering (see model.config.id2label); confirm that this mapping is
# intended and that the test notebooks decode predictions the same way.
LABEL_TO_ID = {"REFUTES": 0, "SUPPORTS": 1}
OTHER_LABEL_ID = 2

input_ids = []
attention_masks = []
labels = []

for data in tqdm(training_data[:MAX_TRAINING_EXAMPLES]):
    claim = data["claim"]

    # join evidence sentences into a single string
    evidence_text = " ".join(data["evidence"])

    # encode claim and evidence text into token IDs
    encoded_dict = tokenizer(
        claim,                         # first segment: the claim
        evidence_text,                 # second segment: the joined evidence
        add_special_tokens=True,       # add the model's special tokens (<s> / </s> for RoBERTa)
        max_length=MAX_SEQ_LENGTH,     # target length for padding and truncation
        padding='max_length',          # pad shorter pairs up to max_length
        truncation=True,               # cut longer pairs down to max_length
        return_attention_mask=True,    # return attention masks
        return_tensors='pt',           # return PyTorch tensors of shape (1, max_length)
    )

    # get token IDs and attention mask from encoded dictionary
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

    # convert label to numerical value
    labels.append(LABEL_TO_ID.get(data["label"], OTHER_LABEL_ID))

# torch.cat() on an empty list fails with an unhelpful message; fail early
# with a clear one instead.
if not input_ids:
    raise ValueError("No training examples were loaded; check that 'output.csv' is not empty.")

# convert data to PyTorch tensors
input_ids = torch.cat(input_ids, dim=0).to(device)
attention_masks = torch.cat(attention_masks, dim=0).to(device)
labels = torch.tensor(labels).to(device)

# define batch size and create data loader
batch_size = 2
dataset = TensorDataset(input_ids, attention_masks, labels)

# Shuffle the examples at the start of every epoch. Feeding the rows in
# fixed file order each epoch lets the updates follow whatever ordering the
# CSV happens to have (for example, runs of the same label).
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# fine-tune the model on training data
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# from_pretrained() returns the model in evaluation mode (dropout disabled).
# Switch to training mode so that dropout is active while fine-tuning.
model.train()

for epoch in range(num_epochs):
    for batch in tqdm(dataloader):
        # Use per-batch names so the full-dataset tensors defined earlier are
        # not overwritten by the last batch.
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()

# Write the fine-tuned model and then the tokenizer into one directory.
output_dir = "fine-tuned-roberta-large-mnli"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

Note: If fine-tuning is done in Google Colab, the saved model directory ("fine-tuned-roberta-large-mnli") is stored in the cloud and must be downloaded manually and placed in the same directory as the "test_rte.ipynb" and "test_custom_rte" files, so that the fine-tuned model can be tested.