In [1]:
import os
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from data_loader import load_dataset, format_input_with_context

In [2]:
MODEL_NAME = 'meta-llama/Llama-3.2-3B-Instruct'
BATCH_SIZE = 16 # 12 works
USE_CTX = False # whether to use [CTX] parts of threads (too slow for True)


RESULTS_DIR = "./results/prompting/"
os.makedirs(RESULTS_DIR, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# load data
train_df, dev_df, test_df = load_dataset()

VALID_STANCES = ['SUPPORT', 'DENY', 'QUERY', 'COMMENT']

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

Loading cached data from saved_data/datasets.pkl...


In [3]:
# prompts
PERSONA = """You are an expert in rumour stance analysis on twitter."""
INSTRUCTION = """Your task is to classify the stance of the TARGET tweet towards the veracity of the rumour in the SOURCE tweet."""
INPUT_FORMAT = """\
    The input will be provided as a single string containing labeled segments:
    "[SRC] ... [PARENT] ... [TARGET] ..." (Note: [PARENT] may be omitted if not applicable).
    """
LABEL_DEFNS = """\
    Classification Labels:
    - SUPPORT: The reply supports the veracity of the source claim
    - DENY: The reply denies the veracity of the source claim
    - QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
    - COMMENT: The reply makes their own comment without a clear contribution to assessing the veracity of the source claim
    """
OUTPUT_FORMAT = """\
    Respond with ONLY one word: SUPPORT, DENY, QUERY, or COMMENT.
    """

SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
{OUTPUT_FORMAT}
"""

USER_PROMPT_TEMPLATE = """\
Text: {thread_context}

Task: Classify the stance of [TARGET] towards the veracity of the rumour in [SRC].
"""

def build_user_prompt(thread_context):
    return USER_PROMPT_TEMPLATE.format(thread_context=thread_context)


def build_zero_shot_messages(thread_context):
    return [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": build_user_prompt(thread_context)},
    ]


def build_few_shot_messages(thread_context, examples=None):
    messages = [{"role": "system", "content": SYS_PROMPT}]
    
    if examples:
        for ex in examples:
            messages.append({"role": "user", "content": build_user_prompt(ex['source'])})
            messages.append({"role": "assistant", "content": ex['label']})
    
    messages.append({"role": "user", "content": build_user_prompt(thread_context)})
    return messages


In [4]:
def parse_stance_response(text):
    text = text.upper().strip()
    for label in VALID_STANCES:
        if re.search(rf'\b{label}\b', text):
            return label.lower()
    print(f'Error! On {text}\n')
    return None


In [5]:
def generate_response(pipe, messages, max_new_tokens=10): # max n tokens to limit response length
    output = pipe(
        messages, 
        max_new_tokens=max_new_tokens,
        do_sample=False, # deterministic output
        pad_token_id=pipe.tokenizer.eos_token_id, # prevent warnings
        )
    return output[0]["generated_text"][-1]["content"].strip()


# LLM --
def plot_confusion_matrix(cm, mode, save_path=None):
    """Plot confusion matrix."""
    labels = ['support', 'deny', 'query', 'comment']
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix ({mode})')
    
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")
    
    plt.show()


def save_results(results, model_key, mode):
    """Save evaluation results to JSON."""
    output = {
        'model': model_key,
        'mode': mode,
        'macro_f1': results['macro_f1'],
        'per_class_f1': results['per_class_f1'],
        'predictions': results['predictions'],
        'true_labels': results['true_labels'],
        'raw_responses': results['raw_responses']
    }
    
    filename = f"{RESULTS_DIR}{model_key}_{mode}_results.json"
    with open(filename, 'w') as f:
        json.dump(output, f, indent=2)
    print(f"Saved: {filename}")

In [6]:
# few shot examples
support_example = train_df[train_df['label_text']=='support'].iloc[0]
deny_example = train_df[train_df['label_text']=='deny'].iloc[0]
query_example = train_df[train_df['label_text']=='query'].iloc[0]
comment_example = train_df[train_df['label_text']=='comment'].iloc[0]


def get_few_shot_examples(df, n_per_class=1):
    """Select stratified random examples for few-shot prompting."""
    examples = []
    
    for label in ['support', 'deny', 'query', 'comment']:
        class_df = df[df['label_text'] == label]
        samples = class_df.sample(n=min(n_per_class, len(class_df)))
        
        for _, row in samples.iterrows():
            examples.append({
                'source': format_input_with_context(row, df, use_features=False, use_context=USE_CTX),
                'label': label.upper()
            })
    
    return examples

In [7]:
# load model
pipe = pipeline(
    "text-generation", 
    MODEL_NAME, 
    device_map='auto',
    dtype=torch.float16,
)
pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
pipe.tokenizer.padding_side = "left"

print(MODEL_NAME)
print(BATCH_SIZE)
print(pipe.model.config)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "float16",
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "vocab_size": 128256
}

meta-llama/Llama-3.2-3B-Instruct
12


In [8]:
def evaluate_prompting(model, df, mode="zero-shot", examples=None, batch_size=BATCH_SIZE, verbose=True):
    from datasets import Dataset
    
    # Prepare all messages upfront
    all_messages = []
    for _, row in df.iterrows():
        input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
        if mode == "zero-shot":
            messages = build_zero_shot_messages(input_text)
        else:
            messages = build_few_shot_messages(input_text, examples)
        all_messages.append(messages)
    
    # Batched inference using Dataset
    dataset = Dataset.from_dict({"messages": all_messages})
    
    raw_responses = []
    for i in tqdm(range(0, len(all_messages), batch_size), desc=f"Evaluating ({mode})"):
        batch = all_messages[i:i+batch_size]
        outputs = pipe(batch, max_new_tokens=10, pad_token_id=pipe.tokenizer.eos_token_id)
        for out in outputs:
            raw_responses.append(out[0]["generated_text"][-1]["content"].strip())
    
    # Parse responses
    predictions = [parse_stance_response(r) for r in raw_responses]

    # count number of None (errors), print and replace with comment
    num_errors = predictions.count(None)
    if num_errors > 0:
        print(f"Warning: {num_errors} errors in predictions. Replacing with 'comment'.")
        predictions = [p if p is not None else 'comment' for p in predictions]

    true_labels = df['label_text'].tolist()
    
    # Metrics
    labels = ['support', 'deny', 'query', 'comment']
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    per_class_f1 = f1_score(true_labels, predictions, average=None, labels=labels)
    
    if verbose:
        print(f"\n{'='*60}\nResults ({mode})\n{'='*60}")
        print(f"Macro F1: {macro_f1:.4f}")
        print(f"\nPer-class F1:")
        for lbl, f1 in zip(labels, per_class_f1):
            print(f"  {lbl}: {f1:.4f}")
        print(f"\n{classification_report(true_labels, predictions, labels=labels)}")
    
    # Save predictions to CSV
    results_df = pd.DataFrame({
        'true_label': true_labels,
        'predicted': predictions,
    })
    csv_path = f"{RESULTS_DIR}predictions_{mode}.csv"
    results_df.to_csv(csv_path, index=False)
    if verbose:
        print(f"Saved predictions: {csv_path}")
    
    return {
        'predictions': predictions,
        'true_labels': true_labels,
        'raw_responses': raw_responses,
        'macro_f1': macro_f1,
        'per_class_f1': dict(zip(labels, per_class_f1)),
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=labels)
    }

In [9]:
# zero-shot

zero_results = evaluate_prompting(pipe, dev_df, mode="zero-shot")

Evaluating (zero-shot):   0%|          | 0/24 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Results (zero-shot)
Macro F1: 0.2110

Per-class F1:
  support: 0.0000
  deny: 0.1739
  query: 0.2292
  comment: 0.4409

              precision    recall  f1-score   support

     support       0.00      0.00      0.00        69
        deny       0.11      0.36      0.17        11
       query       0.13      0.79      0.23        28
     comment       0.69      0.32      0.44       173

    accuracy                           0.29       281
   macro avg       0.23      0.37      0.21       281
weighted avg       0.44      0.29      0.30       281

Saved predictions: ./results/prompting/predictions_zero-shot.csv


In [10]:
# few-shot
few_shot_examples = get_few_shot_examples(train_df, n_per_class=1)
few_results = evaluate_prompting(pipe, df, mode="few-shot", examples=few_shot_examples)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Ground truth: comment
Response:     QUERY



Evaluating (few-shot):   0%|          | 0/24 [00:00<?, ?it/s]

Error! On RE-POST

Error! On REPOST

Error! On REPEAT

Error! On REPEAT


Results (few-shot)
Macro F1: 0.3654

Per-class F1:
  support: 0.3306
  deny: 0.2632
  query: 0.3623
  comment: 0.5057

              precision    recall  f1-score   support

     support       0.38      0.29      0.33        69
        deny       0.19      0.45      0.26        11
       query       0.23      0.89      0.36        28
     comment       0.73      0.39      0.51       173

    accuracy                           0.42       281
   macro avg       0.38      0.51      0.37       281
weighted avg       0.57      0.42      0.44       281

Saved predictions: ./results/prompting/predictions_few-shot.csv
