Zero-shot learning (Direct Prompt)

In [14]:
# pip install accelerate
import time
import json
import torch


def zero_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512 ):

    prompt = (
        "Classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim. Examples are provided for you to understand the logic.\n"

        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences)}\n"
    )

    # Tokenize and generate output
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len).input_ids.to("cuda")
    output_ids = model.generate(input_ids, max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # print(output)

    # Ensure the output is one of the valid labels
    valid_labels = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]
    output = output.strip().upper()

    return output if output in valid_labels else "NOT_ENOUGH_INFO"

Zero-shot learning (Role-play Prompt)

In [15]:
def zero_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):

    prompt = (
        "You are a fact-checking assistant and your task is to classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim.\n"

        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences)}\n"
    )

    # Tokenize and generate output
    input_ids = tokenizer(prompt, return_tensors="pt",truncation=True, max_length=max_len).input_ids.to("cuda")
    output_ids = model.generate(input_ids, max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # print(output)

    # Ensure the output is one of the valid labels
    valid_labels = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]
    output = output.strip().upper()

    return output if output in valid_labels else "NOT_ENOUGH_INFO"

Few-shot Learning (Direct Prompt)

In [16]:
def few_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512 ):

    prompt = (
         "Classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim. Examples are provided below for reference.\n"

        "Example 1:\n"
        "Claim: \"Our harmless emissions of trifling quantities of carbon dioxide cannot possibly acidify the oceans.\"\n"
        "Evidence: \"Carbon dioxide also causes ocean acidification because it dissolves in water to form carbonic acid.\", "
        "\"Marine calcifiers exhibit mixed responses to CO 2-induced ocean acidification.\"\n"
        "Answer: REFUTES\n\n"

        "Example 2:\n"
        "Claim: \"Sea-level rise does not seem to depend on ocean temperature, and certainly not on CO2\"\n"
        "Evidence: \"This depth depends on (among other things) temperature and the amount of CO 2 dissolved in the ocean.\", "
        "\"Because different climate models have slightly different patterns of ocean heating, they do not agree fully on the predictions for the contribution of ocean heating on sea level rise.\"\n"
        "Answer: DISPUTED\n\n"

        "Example 3:\n"
        "Claim: \"Ocean and surface temperature measurements find the planet continues to accumulate heat.\"\n"
        "Evidence: \"Greenhouse gases trap heat radiating from the Earth to space.\", "
         "\"Energy from the Sun heats this layer, and the surface below, causing expansion of the air.\"\n"
        "Answer: NOT_ENOUGH_INFO\n\n"

         "Now classify the following claim:\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences)}\n"
        "Answer: "
    )

    # Tokenize and generate output
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len).input_ids.to("cuda")
    output_ids = model.generate(input_ids, max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # print(output)

    # Ensure the output is one of the valid labels
    valid_labels = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]
    output = output.strip().upper()

    return output if output in valid_labels else "NOT_ENOUGH_INFO"

Few-shot Learning (Role Play Prompt)

In [17]:
def few_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model, max_len=512):

    prompt = (
         "You are a fact-checking assistant and your task is to classify if the evidences SUPPORTS, REFUTES, have NOT_ENOUGH_INFO or DISPUTED regarding the claim. Examples are provided below for reference.\n"

        "Example 1:\n"
        "Claim: \"Our harmless emissions of trifling quantities of carbon dioxide cannot possibly acidify the oceans.\"\n"
        "Evidence: \"Carbon dioxide also causes ocean acidification because it dissolves in water to form carbonic acid.\", "
        "\"Marine calcifiers exhibit mixed responses to CO 2-induced ocean acidification.\"\n"
        "Answer: REFUTES\n\n"

        "Example 2:\n"
        "Claim: \"Sea-level rise does not seem to depend on ocean temperature, and certainly not on CO2\"\n"
        "Evidence: \"This depth depends on (among other things) temperature and the amount of CO 2 dissolved in the ocean.\", "
        "\"Because different climate models have slightly different patterns of ocean heating, they do not agree fully on the predictions for the contribution of ocean heating on sea level rise.\"\n"
        "Answer: DISPUTED\n\n"

        "Example 3:\n"
        "Claim: \"Ocean and surface temperature measurements find the planet continues to accumulate heat.\"\n"
        "Evidence: \"Greenhouse gases trap heat radiating from the Earth to space.\", "
         "\"Energy from the Sun heats this layer, and the surface below, causing expansion of the air.\"\n"
        "Answer: NOT_ENOUGH_INFO\n\n"

         "Now classify the following claim:\n"
        f"Claim: {claim_text}\n"
        f"Evidence: {' [SEP] '.join(evidences)}\n"
        "Answer: "
    )

    # Tokenize and generate output
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len).input_ids.to("cuda")
    output_ids = model.generate(input_ids, max_length=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # print(output)

    # Ensure the output is one of the valid labels
    valid_labels = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]
    output = output.strip().upper()

    return output if output in valid_labels else "NOT_ENOUGH_INFO"

LLM Pipeline

In [18]:
# pip install accelerate
import time
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def evaluate_llm(dev_data, mode, tokenizer, model ):

    start_time = time.time()

    # Define the label map to ensure consistency
    label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

    # Initialize lists for storing ground truth and predictions
    ground_truths = []
    predictions = []

    for item in dev_data:
        claim_id = item["claim_id"]
        claim_text = item["claim_text"]
        evidences = item["evidences"]
        true_label = item["claim_label"]

        # zero-shot direct prompting
        if mode == "zero-shot-direct":
          predicted_label = zero_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model )

        # zero-shot role play prompting
        if mode == "zero-shot-role-play":
          predicted_label = zero_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model )

        # few-shot direct prompting
        if mode == "few-shot-direct":
          predicted_label = few_shot_direct_prompt_classification(claim_text, evidences, tokenizer, model )

        # few-shot role play prompting
        if mode == "few-shot-role-play":
          predicted_label = few_shot_roleplay_prompt_classification(claim_text, evidences, tokenizer, model )

        # Convert labels to integers using the label map
        ground_truths.append(label_map[true_label])
        predictions.append(label_map[predicted_label])

    end_time = time.time()
    total_inference_time = end_time - start_time

    # Calculate metrics
    accuracy = accuracy_score(ground_truths, predictions)
    precision = precision_score(ground_truths, predictions, average='weighted', zero_division=0)
    recall = recall_score(ground_truths, predictions, average='weighted', zero_division=0)
    f1 = f1_score(ground_truths, predictions, average='weighted', zero_division=0)

    #  # Confusion matrix
    cm = confusion_matrix(ground_truths, predictions)
    print("Confusion Matrix:\n", cm)

    # Print metrics
    print(f"Total Inference Time: {total_inference_time:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Return results and metrics
    return {
        "inference_time": total_inference_time,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

# Evaluate
with open('/kaggle/input/claim-evidence-pair/claim-evidence-set/claim-evidence-dev_set.json', 'r') as f:
  dev_data = json.load(f)

prompting_mode = ["zero-shot-direct", "zero-shot-role-play", "few-shot-direct", "few-shot-role-play"]

for mode in prompting_mode:
  print(f"Prompting Mode: {mode}")
  # Load model and tokenizer
  tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
  model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")
  results = evaluate_llm(dev_data, mode, tokenizer, model )
  print(results)
  print("-----------------------------------------------------------")

Prompting Mode: zero-shot-direct
Total Inference Time: 38.98 seconds
Accuracy: 0.4740
Precision: 0.3139
Recall: 0.4740
F1 Score: 0.3579
{'inference_time': 38.97865152359009, 'accuracy': 0.474025974025974, 'precision': 0.31387596360858394, 'recall': 0.474025974025974, 'f1_score': 0.357874852420307}
-----------------------------------------------------------
Prompting Mode: zero-shot-role-play
Total Inference Time: 39.65 seconds
Accuracy: 0.5000
Precision: 0.3245
Recall: 0.5000
F1 Score: 0.3907
{'inference_time': 39.64689254760742, 'accuracy': 0.5, 'precision': 0.3245380870519914, 'recall': 0.5, 'f1_score': 0.3907494304795654}
-----------------------------------------------------------
Prompting Mode: few-shot-direct
Total Inference Time: 54.63 seconds
Accuracy: 0.4416
Precision: 0.5823
Recall: 0.4416
F1 Score: 0.4250
{'inference_time': 54.62656545639038, 'accuracy': 0.44155844155844154, 'precision': 0.5822838014698479, 'recall': 0.44155844155844154, 'f1_score': 0.4249629470556551}
-----

RUN LOG

43.1s 17 Prompting Mode: zero-shot-direct
44.3s 18 You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
111.0s 19 Confusion Matrix:
111.0s 20 [[63  3  0  2]
111.0s 21 [16 10  0  1]
111.0s 22 [37  2  0  2]
111.0s 23 [16  2  0  0]]
111.0s 24 Total Inference Time: 41.72 seconds
111.0s 25 Accuracy: 0.4740
111.0s 26 Precision: 0.3139
111.0s 27 Recall: 0.4740
111.0s 28 F1 Score: 0.3579
111.0s 29 {'inference_time': 41.7239887714386, 'accuracy': 0.474025974025974, 'precision': 0.31387596360858394, 'recall': 0.474025974025974, 'f1_score': 0.357874852420307}
111.0s 30 -----------------------------------------------------------
111.0s 31 Prompting Mode: zero-shot-role-play
154.0s 32 Confusion Matrix:
154.0s 33 [[60  6  0  2]
154.0s 34 [10 17  0  0]
154.0s 35 [32  4  0  5]
154.0s 36 [14  4  0  0]]
154.0s 37 Total Inference Time: 40.57 seconds
154.0s 38 Accuracy: 0.5000
154.0s 39 Precision: 0.3245
154.0s 40 Recall: 0.5000
154.0s 41 F1 Score: 0.3907
154.0s 42 {'inference_time': 40.57264232635498, 'accuracy': 0.5, 'precision': 0.3245380870519914, 'recall': 0.5, 'f1_score': 0.3907494304795654}
154.0s 43 -----------------------------------------------------------
154.0s 44 Prompting Mode: few-shot-direct
209.4s 45 Confusion Matrix:
209.4s 46 [[55  1  2 10]
209.4s 47 [ 7  3  1 16]
209.4s 48 [11  0  6 24]
209.4s 49 [13  0  1  4]]
209.4s 50 Total Inference Time: 52.84 seconds
209.4s 51 Accuracy: 0.4416
209.4s 52 Precision: 0.5823
209.4s 53 Recall: 0.4416
209.4s 54 F1 Score: 0.4250
209.4s 55 {'inference_time': 52.83831024169922, 'accuracy': 0.44155844155844154, 'precision': 0.5822838014698479, 'recall': 0.44155844155844154, 'f1_score': 0.4249629470556551}
209.4s 56 -----------------------------------------------------------
209.4s 57 Prompting Mode: few-shot-role-play
264.4s 58 Confusion Matrix:
264.4s 59 [[56  0  1 11]
264.4s 60 [ 7  2  6 12]
264.4s 61 [16  1  5 19]
264.4s 62 [13  0  0  5]]
264.4s 63 Total Inference Time: 53.10 seconds
264.4s 64 Accuracy: 0.4416
264.4s 65 Precision: 0.5090
264.4s 66 Recall: 0.4416
264.4s 67 F1 Score: 0.4007
264.4s 68 {'inference_time': 53.097389459609985, 'accuracy': 0.44155844155844154, 'precision': 0.509022930507667, 'recall': 0.44155844155844154, 'f1_score': 0.4006823365313931}
264.4s 69 -----------------------------------------------------------