Zero-shot learning (Direct Prompt)

In [None]:
# pip install accelerate
import time
import json
import torch


def zero_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):
    """Classify a claim against its evidence using a zero-shot direct prompt.

    Args:
        claim_text: The claim to verify.
        evidences: Evidence passages as a list of strings. A single string is
            treated as one passage; ``None`` is treated as no evidence.
        tokenizer: Tokenizer called on the prompt and used to decode the
            generated ids.
        model: Generation model exposing ``generate``; its ``device``
            attribute (when present) decides where the inputs are placed.
        max_len: Maximum number of prompt tokens; longer prompts are truncated.

    Returns:
        One of ``"SUPPORTS"``, ``"REFUTES"``, ``"NOT_ENOUGH_INFO"`` or
        ``"DISPUTED"``. Output that cannot be mapped to a label falls back to
        ``"NOT_ENOUGH_INFO"``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(evidences, str):
        evidences = [evidences]

    # Zero-shot: the instruction must not promise examples that are not given.
    prompt = (
        "Classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim.\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences or [])}\n"
    )

    # Follow the model's own placement instead of assuming a CUDA device.
    device = getattr(model, "device", "cuda")
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
    output_ids = model.generate(encoded.input_ids.to(device), max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Normalise case, punctuation and separators so that e.g. "Refutes." or
    # "not enough info" are recognised as their canonical label.
    valid_labels = ("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED")
    letters_only = "".join(ch if ch.isalpha() else " " for ch in output.upper())
    label = "_".join(letters_only.split())

    return label if label in valid_labels else "NOT_ENOUGH_INFO"

Zero-shot learning (Role-play Prompt)

In [None]:
def zero_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):
    """Classify a claim against its evidence using a zero-shot role-play prompt.

    The prompt frames the model as a fact-checking assistant and supplies no
    worked examples.

    Args:
        claim_text: The claim to verify.
        evidences: Evidence passages as a list of strings. A single string is
            treated as one passage; ``None`` is treated as no evidence.
        tokenizer: Tokenizer called on the prompt and used to decode the
            generated ids.
        model: Generation model exposing ``generate``; its ``device``
            attribute (when present) decides where the inputs are placed.
        max_len: Maximum number of prompt tokens; longer prompts are truncated.

    Returns:
        One of ``"SUPPORTS"``, ``"REFUTES"``, ``"NOT_ENOUGH_INFO"`` or
        ``"DISPUTED"``. Output that cannot be mapped to a label falls back to
        ``"NOT_ENOUGH_INFO"``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(evidences, str):
        evidences = [evidences]

    prompt = (
        "You are a fact-checking assistant and your task is to classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim.\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences or [])}\n"
    )

    # Follow the model's own placement instead of assuming a CUDA device.
    device = getattr(model, "device", "cuda")
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
    output_ids = model.generate(encoded.input_ids.to(device), max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Normalise case, punctuation and separators so that e.g. "Refutes." or
    # "not enough info" are recognised as their canonical label.
    valid_labels = ("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED")
    letters_only = "".join(ch if ch.isalpha() else " " for ch in output.upper())
    label = "_".join(letters_only.split())

    return label if label in valid_labels else "NOT_ENOUGH_INFO"

Few-shot Learning (Direct Prompt)

In [None]:
def few_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):
    """Classify a claim against its evidence using a few-shot direct prompt.

    The prompt contains three worked examples (REFUTES, DISPUTED,
    NOT_ENOUGH_INFO) followed by the claim to classify and an ``Answer: `` cue.

    Args:
        claim_text: The claim to verify.
        evidences: Evidence passages as a list of strings. A single string is
            treated as one passage; ``None`` is treated as no evidence.
        tokenizer: Tokenizer called on the prompt and used to decode the
            generated ids.
        model: Generation model exposing ``generate``; its ``device``
            attribute (when present) decides where the inputs are placed.
        max_len: Maximum number of prompt tokens; longer prompts are truncated.

    Returns:
        One of ``"SUPPORTS"``, ``"REFUTES"``, ``"NOT_ENOUGH_INFO"`` or
        ``"DISPUTED"``. Output that cannot be mapped to a label falls back to
        ``"NOT_ENOUGH_INFO"``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(evidences, str):
        evidences = [evidences]

    prompt = (
        "Classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim. Examples are provided below for reference.\n"

        "Example 1:\n"
        "Claim: \"Our harmless emissions of trifling quantities of carbon dioxide cannot possibly acidify the oceans.\"\n"
        "Evidence: \"Carbon dioxide also causes ocean acidification because it dissolves in water to form carbonic acid.\", "
        "\"Marine calcifiers exhibit mixed responses to CO 2-induced ocean acidification.\"\n"
        "Answer: REFUTES\n\n"

        "Example 2:\n"
        "Claim: \"Sea-level rise does not seem to depend on ocean temperature, and certainly not on CO2\"\n"
        "Evidence: \"This depth depends on (among other things) temperature and the amount of CO 2 dissolved in the ocean.\", "
        "\"Because different climate models have slightly different patterns of ocean heating, they do not agree fully on the predictions for the contribution of ocean heating on sea level rise.\"\n"
        "Answer: DISPUTED\n\n"

        "Example 3:\n"
        "Claim: \"Ocean and surface temperature measurements find the planet continues to accumulate heat.\"\n"
        "Evidence: \"Greenhouse gases trap heat radiating from the Earth to space.\", "
        "\"Energy from the Sun heats this layer, and the surface below, causing expansion of the air.\"\n"
        "Answer: NOT_ENOUGH_INFO\n\n"

        "Now classify the following claim:\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences or [])}\n"
        "Answer: "
    )

    # NOTE(review): the target claim sits at the end of the prompt, so if the
    # tokenizer truncates from the end (presumably its default), a long
    # evidence list could drop the trailing "Answer: " cue - confirm that
    # max_len leaves enough room for the dev set.
    # Follow the model's own placement instead of assuming a CUDA device.
    device = getattr(model, "device", "cuda")
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
    output_ids = model.generate(encoded.input_ids.to(device), max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Normalise case, punctuation and separators so that e.g. "Refutes." or
    # "not enough info" are recognised as their canonical label.
    valid_labels = ("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED")
    letters_only = "".join(ch if ch.isalpha() else " " for ch in output.upper())
    label = "_".join(letters_only.split())

    return label if label in valid_labels else "NOT_ENOUGH_INFO"

Few-shot Learning (Role-play Prompt)

In [None]:
def few_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):
    """Classify a claim against its evidence using a few-shot role-play prompt.

    The prompt frames the model as a fact-checking assistant, shows three
    worked examples (REFUTES, DISPUTED, NOT_ENOUGH_INFO) and ends with the
    claim to classify followed by an ``Answer: `` cue.

    Args:
        claim_text: The claim to verify.
        evidences: Evidence passages as a list of strings. A single string is
            treated as one passage; ``None`` is treated as no evidence.
        tokenizer: Tokenizer called on the prompt and used to decode the
            generated ids.
        model: Generation model exposing ``generate``; its ``device``
            attribute (when present) decides where the inputs are placed.
        max_len: Maximum number of prompt tokens; longer prompts are truncated.

    Returns:
        One of ``"SUPPORTS"``, ``"REFUTES"``, ``"NOT_ENOUGH_INFO"`` or
        ``"DISPUTED"``. Output that cannot be mapped to a label falls back to
        ``"NOT_ENOUGH_INFO"``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(evidences, str):
        evidences = [evidences]

    prompt = (
        "You are a fact-checking assistant and your task is to classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim. Examples are provided below for reference.\n"

        "Example 1:\n"
        "Claim: \"Our harmless emissions of trifling quantities of carbon dioxide cannot possibly acidify the oceans.\"\n"
        "Evidence: \"Carbon dioxide also causes ocean acidification because it dissolves in water to form carbonic acid.\", "
        "\"Marine calcifiers exhibit mixed responses to CO 2-induced ocean acidification.\"\n"
        "Answer: REFUTES\n\n"

        "Example 2:\n"
        "Claim: \"Sea-level rise does not seem to depend on ocean temperature, and certainly not on CO2\"\n"
        "Evidence: \"This depth depends on (among other things) temperature and the amount of CO 2 dissolved in the ocean.\", "
        "\"Because different climate models have slightly different patterns of ocean heating, they do not agree fully on the predictions for the contribution of ocean heating on sea level rise.\"\n"
        "Answer: DISPUTED\n\n"

        "Example 3:\n"
        "Claim: \"Ocean and surface temperature measurements find the planet continues to accumulate heat.\"\n"
        "Evidence: \"Greenhouse gases trap heat radiating from the Earth to space.\", "
        "\"Energy from the Sun heats this layer, and the surface below, causing expansion of the air.\"\n"
        "Answer: NOT_ENOUGH_INFO\n\n"

        "Now classify the following claim:\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences or [])}\n"
        "Answer: "
    )

    # NOTE(review): the target claim sits at the end of the prompt, so if the
    # tokenizer truncates from the end (presumably its default), a long
    # evidence list could drop the trailing "Answer: " cue - confirm that
    # max_len leaves enough room for the dev set.
    # Follow the model's own placement instead of assuming a CUDA device.
    device = getattr(model, "device", "cuda")
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
    output_ids = model.generate(encoded.input_ids.to(device), max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Normalise case, punctuation and separators so that e.g. "Refutes." or
    # "not enough info" are recognised as their canonical label.
    valid_labels = ("SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED")
    letters_only = "".join(ch if ch.isalpha() else " " for ch in output.upper())
    label = "_".join(letters_only.split())

    return label if label in valid_labels else "NOT_ENOUGH_INFO"

LLM Pipeline

In [None]:
# pip install accelerate
import time
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def evaluate_llm(dev_data, mode, tokenizer, model):
    """Evaluate one prompting strategy on the dev set and print/return metrics.

    Args:
        dev_data: Iterable of dicts, each providing ``"claim_text"``,
            ``"evidences"`` and ``"claim_label"``.
        mode: Prompting strategy; one of ``"zero-shot-direct"``,
            ``"zero-shot-role-play"``, ``"few-shot-direct"`` or
            ``"few-shot-role-play"``.
        tokenizer: Tokenizer forwarded to the selected classification function.
        model: Model forwarded to the selected classification function.

    Returns:
        Dict with ``"inference_time"`` (seconds), ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1_score"``; precision, recall
        and F1 are weighted averages.

    Raises:
        ValueError: If ``mode`` is not one of the supported strategies.
        KeyError: If an item carries a ``"claim_label"`` outside the label map.
    """
    # Resolve the classifier once; an unknown mode fails fast with a clear
    # message instead of surfacing as an unbound local inside the loop.
    classifiers = {
        "zero-shot-direct": zero_shot_direct_prompt_classification,
        "zero-shot-role-play": zero_shot_roleplay_prompt_classification,
        "few-shot-direct": few_shot_direct_prompt_classification,
        "few-shot-role-play": few_shot_roleplay_prompt_classification,
    }
    if mode not in classifiers:
        raise ValueError(
            f"Unknown prompting mode {mode!r}; expected one of {sorted(classifiers)}"
        )
    classify = classifiers[mode]

    # Fixed label -> integer mapping shared by ground truth and predictions.
    label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

    ground_truths = []
    predictions = []

    start_time = time.time()
    for item in dev_data:
        predicted_label = classify(item["claim_text"], item["evidences"], tokenizer, model)
        ground_truths.append(label_map[item["claim_label"]])
        predictions.append(label_map[predicted_label])
    total_inference_time = time.time() - start_time

    # Calculate metrics
    accuracy = accuracy_score(ground_truths, predictions)
    precision = precision_score(ground_truths, predictions, average='weighted', zero_division=0)
    recall = recall_score(ground_truths, predictions, average='weighted', zero_division=0)
    f1 = f1_score(ground_truths, predictions, average='weighted', zero_division=0)

    # Pass the full label set so the matrix is always 4x4 in label_map order,
    # even when a class never occurs in the truths or the predictions.
    cm = confusion_matrix(ground_truths, predictions, labels=list(label_map.values()))
    print("Confusion Matrix:\n", cm)

    # Print metrics
    print(f"Total Inference Time: {total_inference_time:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return {
        "inference_time": total_inference_time,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

# Evaluate every prompting strategy on the dev set.
with open('/kaggle/input/claim-evidence-pair/claim-evidence-set/claim-evidence-dev_set.json', 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

prompting_mode = ["zero-shot-direct", "zero-shot-role-play", "few-shot-direct", "few-shot-role-play"]

# Load the tokenizer and model once: they are identical for every mode, and
# reloading FLAN-T5-large per iteration wastes time and can leave several
# copies of the weights in accelerator memory.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")

for mode in prompting_mode:
    print(f"Prompting Mode: {mode}")
    results = evaluate_llm(dev_data, mode, tokenizer, model)
    print(results)
    print("-----------------------------------------------------------")