## Connecting to Google Drive where the pre-trained model is stored

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive

Mounted at /content/gdrive/
/content/gdrive/MyDrive


## Installing libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


## Importing libraries

In [None]:
import csv
import textwrap

import torch
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Defining the `predict_textual_entailment()` function
Note: The folder "fine-tuned-roberta-large-mnli" must be in the notebook's current working directory (`/content/gdrive/MyDrive` after the first cell has run), because the model is loaded from that relative path.

Public link to the fine-tuned model: https://drive.google.com/drive/folders/1ShykW5wmMt2bWRx9AjdO36KUJBie4vkX?usp=sharing

In [None]:
# Name of the folder that holds the fine-tuned model; the relative name is
# resolved against the current working directory.
model_name = "fine-tuned-roberta-large-mnli"

# Restore the sequence-classification model and its matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Class names indexed by the classifier's output position.
# NOTE(review): this order is assumed to match the label ids used when the model
# was fine-tuned -- confirm against the training code.
_ENTAILMENT_LABELS = ("REFUTES", "SUPPORTS", "NOT ENOUGH INFO")


def predict_textual_entailment(claim, evidence_sentences, max_length=512):
    """Predict whether the evidence refutes, supports, or is insufficient for a claim.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        claim: Claim text.
        evidence_sentences: Iterable of evidence sentences. A single string is
            treated as one sentence instead of being joined character by character.
        max_length: Maximum number of tokens in the encoded claim/evidence pair;
            longer inputs are truncated to this length.

    Returns:
        The entry of ("REFUTES", "SUPPORTS", "NOT ENOUGH INFO") whose logit is highest.
    """
    # " ".join() on a bare string would insert a space between every character.
    if isinstance(evidence_sentences, str):
        evidence_sentences = [evidence_sentences]

    # join evidence sentences into a single string
    evidence_text = " ".join(evidence_sentences)

    # Encode the (claim, evidence) pair. truncation=True is required: without it,
    # max_length alone does not shorten over-long inputs and the model would receive
    # more tokens than it can handle. Padding is omitted on purpose: only one example
    # is encoded per call, so padding to max_length would just add masked tokens for
    # the model to process.
    encoded_dict = tokenizer(
        claim,                          # first sequence
        evidence_text,                  # second sequence
        add_special_tokens=True,        # add the tokenizer's special start/separator tokens
        truncation=True,                # cut the pair down to max_length when too long
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='pt',            # return PyTorch tensors
    )

    # Run on whichever device the model lives on.
    device = model.device
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # outputs[0] holds the logits with one row per example; softmax is monotonic,
    # so the arg-max of the logits is the most probable class.
    predicted_index = int(torch.argmax(outputs[0], dim=1)[0])
    return _ENTAILMENT_LABELS[predicted_index]

## Testing the model with a custom claim-evidence pair

In [None]:
# Hand-written sanity check. The evidence gives Suraj > Purohit > Manoj in height,
# so the expected label for the claim is REFUTES.
claim = "Manoj is taller than Suraj."
evidence = ["Suraj is taller than Purohit.", "Purohit is taller than Manoj."]

print("Claim: ", claim)
print("Evidences: ")
# Number the evidence sentences starting from 1.
for cnt, evid in enumerate(evidence, start=1):
  print(cnt, ": ", evid)

predicted_label = predict_textual_entailment(claim, evidence)
print("Label: ", predicted_label)

Claim:  Manoj is taller than Suraj.
Evidences: 
1 :  Suraj is taller than Purohit.
2 :  Purohit is taller than Manoj.
Label:  REFUTES


##Testing the model on the development dataset

Public link to the development dataset "output.csv": https://drive.google.com/file/d/1fm6SDn0TQckZqLFs7ctNODOZNPLC1W_z/view?usp=sharing

In [None]:
# Evidence sentences are stored in one CSV field, separated by "|".
separator = "|"

# Read every row of the development set into a dict with the claim, the list of
# evidence sentences, and the gold label.
with open('output.csv', 'r', newline='', encoding='utf-8') as csvfile:
    dev_data = [
        {
            'claim': row['claim'],
            'evidence': row['evidence'].split(separator),
            'label': row['label'],
        }
        for row in csv.DictReader(csvfile)
    ]

In [None]:
# Evaluate a 100-example window of the development set (indices 1000-1099).
eval_rows = dev_data[1000:1100]

# Model predictions, with a progress bar over the window.
predicted_label = [
    predict_textual_entailment(example['claim'], example['evidence'])
    for example in tqdm(eval_rows)
]
# Gold labels for the same examples, in the same order.
ground_label = [example['label'] for example in eval_rows]

100%|██████████| 100/100 [09:48<00:00,  5.89s/it]


In [None]:
# Fraction of evaluated development examples whose predicted label equals the gold label.
accuracy = accuracy_score(y_true=ground_label, y_pred=predicted_label)
print("Accuracy:", accuracy)

Accuracy: 0.83


## Per-class accuracy analysis

In [None]:
# Accuracy within each gold class: bucket the (gold, predicted) pairs by gold label,
# then score every bucket on its own.
gold_to_bucket = {'SUPPORTS': "Supports", 'REFUTES': "Refutes"}
pairs_by_class = {"Supports": [], "Refutes": [], "NEI": []}
for label, pred in zip(ground_label, predicted_label):
  # Any gold label other than SUPPORTS / REFUTES is counted in the NEI bucket.
  pairs_by_class[gold_to_bucket.get(label, "NEI")].append((label, pred))

for class_name, pairs in pairs_by_class.items():
  if pairs:
    class_ground, class_pred = zip(*pairs)
    # accuracy_score expects (y_true, y_pred) in that order.
    accuracy = accuracy_score(class_ground, class_pred)
  else:
    # No example of this class in the evaluated slice: nothing to score.
    accuracy = float('nan')
  print(f"{class_name} Accuracy:", accuracy)

Supports Accuracy: 0.9166666666666666
Refutes Accuracy: 0.5555555555555556
NEI Accuracy: 0.96


In [None]:
# Weighted F1 over the evaluated development slice: per-class F1 averaged with
# weights proportional to each class's number of gold examples.
# f1_score is provided by the import cell at the top of the notebook.
weighted_f1 = f1_score(y_true=ground_label, y_pred=predicted_label, average='weighted')
print(weighted_f1)

0.8212838915470495


##Testing on unseen data

In [None]:
# Evidence sentences are stored in one CSV field, separated by "|".
separator = "|"

# Read every row of the retrieved-evidence file into a dict with the claim, the
# list of evidence sentences, and the gold label.
with open('retrieved_evidence_output1.csv', 'r', newline='', encoding='utf-8') as csvfile:
    test_data = [
        {
            'claim': row['claim'],
            'evidence': row['string_evidence'].split(separator),
            'label': row['label'],
        }
        for row in csv.DictReader(csvfile)
    ]

In [None]:
# Evaluate only the first 30 test examples.
eval_rows = test_data[:30]

# Model predictions, with a progress bar over the selected examples.
predicted_label = [
    predict_textual_entailment(example['claim'], example['evidence'])
    for example in tqdm(eval_rows)
]
# Gold labels for the same examples, in the same order.
ground_label = [example['label'] for example in eval_rows]

100%|██████████| 30/30 [02:57<00:00,  5.93s/it]


In [None]:
# Fraction of evaluated test examples whose predicted label equals the gold label.
accuracy = accuracy_score(y_true=ground_label, y_pred=predicted_label)
print("Accuracy:", accuracy)

Accuracy: 0.43333333333333335


In [None]:
# Show one development example together with the model's prediction.
# textwrap is provided by the import cell at the top of the notebook.
sample = dev_data[1000]
claim = sample['claim']
evidence = sample['evidence']
print("Claim: ", claim)
print("Evidences: ")
# Number the evidence sentences from 1 and wrap each to 80 columns for readability.
for cnt, evid in enumerate(evidence, start=1):
  wrapped = textwrap.fill(evid, width=80)
  print(cnt, ": ", wrapped)
predicted_label = predict_textual_entailment(claim, evidence)
print("Label: ", predicted_label)

Claim:  One of the leads in Transformers: Age of Extinction is an American rapper.
Evidences: 
1 :  It stars Mark Wahlberg , with Peter Cullen reprising his role as the voice of
Optimus Prime , as the lead roles .
2 :  Mark Robert Michael Wahlberg -LRB- born June 5 , 1971 -RRB- is an American actor
, producer , businessman , former model , and rapper .
Label:  SUPPORTS
