#### This notebook was run on Google Colab because Flan-T5-xl needs more than 8GB of VRAM. As a result, some of the file paths are specific to my Google Drive, so the notebook may not run correctly on another machine without updating them.

In [1]:
!pip install transformers
!pip install pandas
!pip install nltk
!pip install torch
!pip install datasets





In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
import torch
import re



In [3]:
#mount Google Drive so the files stored under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

import sys

#make the project folder importable; the membership check keeps sys.path
#from accumulating duplicate entries when this cell is re-run
projectDir = '/content/drive/MyDrive/NLP_prompting'
if projectDir not in sys.path:
    sys.path.append(projectDir)

#fnc_dataset is imported only after the project folder has been added to sys.path above
from fnc_dataset import FNCDataset



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load FLAN T5 Model

In [4]:
#pick the device first so the model and the tokenised inputs always live in the same place;
#falls back to the CPU when no GPU is available instead of failing on a hard-coded "cuda"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#set seed for reproducibility
seed = 42
torch.manual_seed(seed)

#load the Flan-T5-xl model and its tokenizer, then move the model to the chosen device
modelName = "google/flan-t5-xl"
model = AutoModelForSeq2SeqLM.from_pretrained(modelName).to(device)
tokenizer = AutoTokenizer.from_pretrained(modelName, use_fast=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<torch._C.Generator at 0x7bf9507bf430>

## Define Prompt
## Measure Performance on Competition Set

In [5]:
#number of headline/body pairs handed to the model per batch
batchSize = 50

#stance labels; a label's position in this tuple is its integer id
_stanceLabels = ("agree", "disagree", "discuss", "unrelated")

#label -> integer id
stanceMapping = {label: index for index, label in enumerate(_stanceLabels)}

#integer id -> label (inverse of stanceMapping)
reverseStanceMapping = dict(enumerate(_stanceLabels))


def _build_prompt(headline, articleBody):
    """Return the few-shot stance-classification prompt for one headline/body pair."""
    #the literal below (including its leading newline and trailing indentation) is the exact
    #text sent to the model, so it is kept unchanged
    return (f"""
Classify the relationship between the following headline and article body into one of these categories: 'agree', 'disagree', 'discuss', or 'unrelated'.

Examples:
1.
Headline: "New study shows cats reduce stress levels"
Article Body: "A recent scientific study confirms that spending time with cats can significantly reduce stress."
Answer: agree

2.
Headline: "Climate change is a hoax"
Article Body: "Scientists around the globe provide evidence that climate change is real and caused by human activity."
Answer: disagree

3.
Headline: "Electric cars will replace petrol vehicles by 2035"
Article Body: "The shift to electric vehicles is gaining momentum, but infrastructure and consumer adoption remain key challenges."
Answer: discuss

4.
Headline: "The Eiffel Tower was built in 1999"
Article Body: "A new study suggests that drinking coffee can improve cognitive function and extend lifespan."
Answer: unrelated

Now classify the following:
Headline: "{headline}"
Article Body: "{articleBody}"

Answer with only one category: 'agree', 'disagree', 'discuss', or 'unrelated'.
            """)


def classify_relationship_batch(headlineBodyPairs, batchSize, maxInputLength=1024, maxOutputLength=200):
    """Prompt the model for the stance of each headline/body pair, one batch at a time.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        headlineBodyPairs: sequence of (headline, articleBody) pairs.
        batchSize: number of pairs per model call; must be positive.
        maxInputLength: token limit passed to the tokenizer; longer prompts are truncated.
        maxOutputLength: `max_length` passed to `model.generate`.

    Returns:
        A list with one decoded model output string per input pair, in input order.

    Raises:
        ValueError: if batchSize is not positive (a negative value would otherwise
            silently produce an empty result list).
    """
    if batchSize <= 0:
        raise ValueError(f"batchSize must be a positive integer, got {batchSize}")

    results = []
    for start in range(0, len(headlineBodyPairs), batchSize):
        batch = headlineBodyPairs[start: start + batchSize]
        promptList = [_build_prompt(headline, articleBody) for headline, articleBody in batch]

        #tokenise inputs, padding every prompt in the batch to a common length
        #NOTE(review): prompts over maxInputLength are truncated; presumably from the right, which
        #would drop the closing instruction for very long article bodies - confirm
        tokenizedPrompts = tokenizer(promptList, return_tensors="pt", truncation=True, max_length=maxInputLength, padding=True).to(device)

        #generate outputs; only a single label word is expected back, so maxOutputLength
        #acts as a generous upper bound rather than a tight limit
        with torch.no_grad():
            outputs = model.generate(**tokenizedPrompts, max_length=maxOutputLength)

        #decode the outputs for the batch and add to the overall results list
        results.extend(tokenizer.decode(output, skip_special_tokens=True) for output in outputs)

        #free VRAM: drop both the inputs and the generated ids before clearing the cache
        del tokenizedPrompts, outputs
        torch.cuda.empty_cache()

    return results


def prompting_dataset_to_dataframe(fncDataset):
    """Turn a dataset's combined headline/body table into the frame used for prompting.

    Reads `fncDataset.headlinesBodiesCombined`, drops the "Body ID" column, renames
    the remaining columns to `headline`, `label` and `body`, and replaces each stance
    string with its integer id from the module-level `stanceMapping`. The source
    frame is not modified.

    Args:
        fncDataset: object exposing a `headlinesBodiesCombined` DataFrame with
            "Headline", "Body ID", "Stance" and "articleBody" columns.

    Returns:
        A new DataFrame with columns `headline`, `label` (integer id) and `body`.

    Raises:
        ValueError: if any stance is missing from `stanceMapping`; mapping it would
            otherwise silently yield a NaN label.
    """
    #drop and rename both return new frames, so the dataset's own table is left untouched
    combinedDataframe = (
        fncDataset.headlinesBodiesCombined
        .drop(columns=["Body ID"])
        .rename(columns={
            "Headline": "headline",
            "Stance": "label",
            "articleBody": "body"
        })
    )

    #refuse stances that have no integer id instead of letting .map() turn them into NaN
    stanceLabels = combinedDataframe["label"]
    unknownStances = sorted(set(stanceLabels[~stanceLabels.isin(list(stanceMapping))].astype(str)))
    if unknownStances:
        raise ValueError(f"Stances missing from stanceMapping: {unknownStances}")

    combinedDataframe["label"] = stanceLabels.map(stanceMapping)

    print(f"{len(combinedDataframe)} entries in the dataset")
    return combinedDataframe


#load the competition test split (stances + article bodies) from Google Drive
competitionData = FNCDataset.from_csv(
    labelledStancesPath="/content/drive/MyDrive/NLP_prompting/fnc-1/competition_test_stances.csv",
    articlesPath="/content/drive/MyDrive/NLP_prompting/fnc-1/competition_test_bodies.csv",
)
competitionDataframe = prompting_dataset_to_dataframe(competitionData)

#model inputs are (headline, body) tuples; the gold labels are the integer stance ids
headlineBodyPairs = [
    (headline, body)
    for headline, body in zip(competitionDataframe["headline"], competitionDataframe["body"])
]
correctIntLabels = [label for label in competitionDataframe["label"]]

#run the model over the whole set, batchSize pairs at a time
results = classify_relationship_batch(headlineBodyPairs, batchSize)

#keep only the last space-separated word of each lower-cased, stripped model answer
cleanResults = []
for result in results:
    normalisedResult = result.strip().lower()
    cleanResults.append(normalisedResult.rsplit(" ", 1)[-1])

#translate the gold integer ids back to their stance names for comparison
correctStringLabels = list(map(reverseStanceMapping.__getitem__, correctIntLabels))

#per-class tallies start at zero for every known stance
classCorrectCounts = dict.fromkeys(stanceMapping, 0)
classTotalCounts = dict.fromkeys(stanceMapping, 0)
wrongResults = set()
correctCount = 0

#compare each prediction with its gold label, tallying per class
for position in range(len(cleanResults)):
    predictedLabel = cleanResults[position]
    goldLabel = correctStringLabels[position]
    classTotalCounts[goldLabel] += 1
    if predictedLabel != goldLabel:
        #remember which answers the model gave when it was wrong
        wrongResults.add(predictedLabel)
        continue
    correctCount += 1
    classCorrectCounts[goldLabel] += 1

#overall accuracy as a percentage of all predictions
overallAccuracy = correctCount / len(cleanResults) * 100.0
print(f"Overall Accuracy: {overallAccuracy}")

#per-class accuracy, skipping classes that never occur in the gold labels
for label, classTotal in classTotalCounts.items():
    if classTotal <= 0:
        print(f"No samples for class '{label}'.")
        continue
    classAccuracy = classCorrectCounts[label] / classTotal * 100.0
    print(f"Accuracy for '{label}': {classAccuracy:.2f}%")

print(f"totals encountered per class: {classTotalCounts}")
print(f"Wrong answer set: {wrongResults}")





25413 entries in the dataset
Overall Accuracy: 86.88860032266949
Accuracy for 'agree': 25.64%
Accuracy for 'disagree': 52.94%
Accuracy for 'discuss': 66.44%
Accuracy for 'unrelated': 99.50%
totals encountered per class: {'agree': 1903, 'disagree': 697, 'discuss': 4464, 'unrelated': 18349}
Wrong answer set: {'discuss', 'disagree', 'unrelated', 'agree'}
