In [1]:
import os
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import AutoModelForCausalLM, AutoTokenizer
import outlines # see https://dottxt-ai.github.io/outlines/latest/features/models/transformers/
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from typing import Literal

from data_loader import load_dataset, format_input_with_context


In [41]:
MODEL_NAME = 'meta-llama/Llama-3.2-3B-Instruct'
USE_CTX = False # whether to use [CTX] parts of threads (too slow for True)
MAX_COT_TOKENS = 200 # max n tokens for cot response
VALID_STANCES = ['SUPPORT', 'DENY', 'QUERY', 'COMMENT']
RELOAD_MODEL = False # takes up lots of mem if we keep reloading

RESULTS_DIR = "./results/prompting/"
RANDOM_SEED = 42
os.makedirs(RESULTS_DIR, exist_ok=True)
np.random.seed(RANDOM_SEED)

# load data
train_df, dev_df, test_df = load_dataset()
use_df = dev_df  # df to test on

Loading cached data from saved_data/datasets.pkl...


In [3]:
# load model

# output types for outlines
StanceLabel = Literal["SUPPORT", "DENY", "QUERY", "COMMENT"]
COT_REGEX = outlines.types.Regex(r".*Label: (SUPPORT|DENY|QUERY|COMMENT)")

if 'model' not in globals() or RELOAD_MODEL:
    hf_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        dtype=torch.float16,
        device_map='auto',
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # use outlines
    model = outlines.from_transformers(hf_model, tokenizer)
    
    # get rid of annoying warnings re pad_token
    model.tokenizer.pad_token = tokenizer.eos_token
    model.tokenizer.padding_side = "left"
    model.model.generation_config.pad_token_id = model.tokenizer.pad_token_id

print(MODEL_NAME)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-3.2-3B-Instruct


In [49]:
# prompts
# see https://huggingface.co/docs/transformers/en/tasks/prompting
# https://www.promptingguide.ai/techniques/zeroshot

PERSONA = """You are an expert in rumour stance analysis on twitter."""
INSTRUCTION = """Your task is to classify the stance of the [TARGET] tweet towards the veracity of the rumour in the [SRC] tweet."""
INPUT_FORMAT = """\
The input will be provided as a single string containing labeled segments:
"[SRC] ... [PARENT] ... [TARGET] ...". (Note: [PARENT] is omitted if [TARGET] replies directly to [SRC])
"""
LABEL_DEFNS = """\
Classification Labels:
- SUPPORT: The reply explicitly supports or provides evidence for the veracity of the source claim
- DENY: The tweet explicitly denies or provides counter-evidence for the veracity of the source claim
- QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
- COMMENT: The reply makes their own opinionated/neutral comment without a clear contribution to assessing the veracity of the source claim
"""

SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
"""
# output format set by outlines

USER_PROMPT_TEMPLATE = """\
Text: {thread_context}

Classify the stance of [TARGET] towards the veracity of the rumour in [SRC]:
"""

def build_user_prompt(thread_context):
    return USER_PROMPT_TEMPLATE.format(thread_context=thread_context)


def build_messages(thread_context, examples=None, sys_prompt=None):
    init_prompt = sys_prompt if sys_prompt is not None else SYS_PROMPT # allow custom sys prompt (eg. for CoT/ablations)
    messages = [{"role": "system", "content": init_prompt}]
    if examples: # for few-shot
        for ex in examples:
            messages.append({"role": "user", "content": build_user_prompt(ex['source'])})
            messages.append({"role": "assistant", "content": ex['response']})
    messages.append({"role": "user", "content": build_user_prompt(thread_context)}) # final user prompt to answer
    return messages


In [50]:
# COT prompts 

# add output format to think step by step
COT_OUTPUT_FORMAT = """\
Your response should follow this EXACT format:
Reasoning: [VERY BRIEFLY explain why the label fits. Use a two stage strategy: Stage 1 classify Stance vs Non-Stance - If Non-stance, skip stage 2 and answer 'COMMENT'. Otherwise: Stage 2 classify Support / Deny / Query]
Label: [EXACTLY one of: SUPPORT, DENY, QUERY, or COMMENT]
"""

COT_SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
{COT_OUTPUT_FORMAT}
"""



# for few-shot cot

# handselected ids for reasoning
COT_FEW_SHOT_IDS = {
    'support': '524967134339022848', # @TheKirkness Radio Canada tweeting same. must be true :-(
    'deny': '544292581950357504', # @JSchoenberger7 @TheAnonMessage not an Isis flag. Just an Islamic one. Stop spreading false rumors.
    'query': '544281192632053761', # @mscott Are your reporters 100% sure it is an ISIS flag? Cause that is what is being reported. #facts
    'comment': '552804023389392896', # @thei100 @Independent these fuckers thinking its a GTA heist mission
}

COT_EXAMPLES = {
    'support': "1. Stance: takes position on veracity\n2. \"must be true\" → affirms claim\nLabel: SUPPORT",
    'deny': "1. Stance: challenges veracity\n2. \"not an Isis flag\", \"false rumors\" → rejects claim\nLabel: DENY",
    'query': "1. Stance: engages with veracity\n2. \"Are your reporters 100% sure?\" → requests evidence\nLabel: QUERY",
    'comment': "1. Stance: no veracity assessment\n2. Offers opinion/reaction only\nLabel: COMMENT",
}

def get_cot_examples(train_df):
    examples = []
    for label, tweet_id in COT_FEW_SHOT_IDS.items():
        matches = train_df[train_df['tweet_id'] == tweet_id]
        if len(matches) == 0:
            print(f"Warning: COT example tweet_id {tweet_id} not found")
            continue
        row = matches.iloc[0]
        examples.append({
            'source': format_input_with_context(row, train_df, use_features=False, use_context=USE_CTX),
            'response': COT_EXAMPLES[label]
        })
    return examples



In [15]:
def parse_cot_label(text): # get final label from cot response
    match = re.search(r'Label:\s*(SUPPORT|DENY|QUERY|COMMENT)', text, re.IGNORECASE)
    return match.group(1).lower() if match else None

In [16]:

# LLM --
def plot_confusion_matrix(cm, mode, save_path=None):
    """Plot confusion matrix."""
    labels = ['support', 'deny', 'query', 'comment']
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix ({mode})')
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")
    plt.show()


def save_results(results, model_key, mode):
    """Save evaluation results to JSON."""
    output = {
        'model': model_key,
        'mode': mode,
        'macro_f1': results['macro_f1'],
        'per_class_f1': results['per_class_f1'],
        'predictions': results['predictions'],
        'true_labels': results['true_labels'],
        'raw_responses': results['raw_responses']
    }
    filename = f"{RESULTS_DIR}{model_key}_{mode}_results.json"
    with open(filename, 'w') as f:
        json.dump(output, f, indent=2)
    print(f"Saved: {filename}")

In [17]:
# few shot examples
support_example = train_df[train_df['label_text']=='support'].iloc[0]
deny_example = train_df[train_df['label_text']=='deny'].iloc[0]
query_example = train_df[train_df['label_text']=='query'].iloc[0]
comment_example = train_df[train_df['label_text']=='comment'].iloc[0]


def get_few_shot_examples(df, n_per_class=1):
    """Select stratified random examples for few-shot prompting."""
    examples = []
    
    for label in ['support', 'deny', 'query', 'comment']:
        class_df = df[df['label_text'] == label]
        samples = class_df.sample(n=min(n_per_class, len(class_df)))
        
        for _, row in samples.iterrows():
            examples.append({
                'source': format_input_with_context(row, df, use_features=False, use_context=USE_CTX),
                'label': label.upper()
            })
    
    return examples

In [18]:
def evaluate_prompting(model, df, mode="zero-shot", examples=None, sys_prompt_override=None, verbose=True):
    # set output params
    if mode == "cot":
        output_type = COT_REGEX
        max_new_tokens = MAX_COT_TOKENS
    else:
        output_type = StanceLabel
        max_new_tokens = 10
        
    # check mode / get correct sys prompt for messages
    if sys_prompt_override is not None: # for ablations
        active_sys_prompt = sys_prompt_override
    elif mode == "cot":
        active_sys_prompt = COT_SYS_PROMPT
    elif mode in ("zero-shot", "few-shot"):
        active_sys_prompt = SYS_PROMPT # standard
    else:
        raise ValueError(f"Unrecognised mode: {mode}. Choose from 'zero-shot', 'few-shot', or 'cot'.")
    
    # get all prompts before
    all_prompts = []
    for _, row in df.iterrows():
        input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
        messages = build_messages(input_text, examples=examples, sys_prompt=active_sys_prompt)
        
#         if mode == 'cot':
#             messages[-1]['content'] += "\nLet's think step-by-step." # for cot
        
        # get prompt string from messages
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        all_prompts.append(prompt)
    
    # get responses
    raw_responses = []
    predictions = []
    
    for prompt in tqdm(all_prompts, desc=f"Evaluating ({mode})"):
        response = model(prompt, output_type, max_new_tokens=max_new_tokens)
        raw_responses.append(response)
        
        if mode == "cot":
            pred = parse_cot_label(response) # may return None
        else:
            pred = response.lower() # outlines fixes to a valid stance
        predictions.append(pred)
    
    # parsing errors (only for cot)
    if mode == 'cot':
        error_idxs = [i for i, p in enumerate(predictions) if p is None]
        if error_idxs and verbose:
            print(f"Warning: {len(error_idxs)} errors in predictions. Replacing with 'comment'.")
            for idx in error_idxs[:5]:  # Show first 5 errors only
                print(f"Index: {idx}\nRaw Model Output: '{raw_responses[idx]}'\n" + "-" * 30)

        # set errors to comment (majority class)
        predictions = [p if p is not None else 'comment' for p in predictions]
        
    true_labels = df['label_text'].tolist()
    
    # metrics
    labels = ['support', 'deny', 'query', 'comment']
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    per_class_f1 = f1_score(true_labels, predictions, average=None, labels=labels)
    
    if verbose:
        print(f"\n{'='*60}\nResults ({mode})\n{'='*60}")
        print(f"Macro F1: {macro_f1:.4f}\n\nPer-class F1:")
        for lbl, f1 in zip(labels, per_class_f1):
            print(f"  {lbl}: {f1:.4f}")
        print(f"\n{classification_report(true_labels, predictions, labels=labels, zero_division=0.0)}")
    
    # save and return predictions
    results_df = pd.DataFrame({'true_label': true_labels, 'predicted': predictions})
    csv_path = f"{RESULTS_DIR}predictions_{mode}.csv"
    results_df.to_csv(csv_path, index=False)
    if verbose:
        print(f"Saved predictions: {csv_path}")
    
    return {
        'predictions': predictions,
        'true_labels': true_labels,
        'raw_responses': raw_responses,
        'macro_f1': macro_f1,
        'per_class_f1': dict(zip(labels, per_class_f1)),
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=labels)
    }

## Zero-shot

In [None]:
# zero-shot
zero_results = evaluate_prompting(model, use_df, mode="zero-shot")

Evaluating (zero-shot):   0%|          | 0/281 [00:00<?, ?it/s]

In [24]:
sns.set_theme(style="whitegrid", font_scale=1.1)

# Component dictionary for easy reference (no OUTPUT_FORMAT - only used for CoT)
PROMPT_COMPONENTS = {
    'persona': PERSONA,
    'instruction': INSTRUCTION,
    'input_format': INPUT_FORMAT,
    'label_defns': LABEL_DEFNS,
}

# Build colour palette: light grey for minimal, seaborn default for components, black for full
_default_palette = sns.color_palette()
COMPONENT_COLOURS = {
    'minimal': '#D3D3D3',
    'persona': _default_palette[0],
    'instruction': _default_palette[1],
    'input_format': _default_palette[2],
    'label_defns': _default_palette[3],
    'full': '#000000',
}

# ISOLATED: minimal + ONE component (shows individual contribution)
ISOLATED_CONFIGS = {
    'minimal': [],
    '+persona': ['persona'],
    '+instruction': ['instruction'],
    '+input_format': ['input_format'],
    '+label_defns': ['label_defns'],
    'full': ['persona', 'instruction', 'input_format', 'label_defns'],
}

# CUMULATIVE: stacking components one by one
CUMULATIVE_CONFIGS = {
    'minimal': [],
    '+persona': ['persona'],
    '+instruction': ['persona', 'instruction'],
    '+input_format': ['persona', 'instruction', 'input_format'],
    '+label_defns (full)': ['persona', 'instruction', 'input_format', 'label_defns'],
}


def build_ablation_sys_prompt(component_keys):
    """Build system prompt from list of component keys."""
    if not component_keys:
        return ""  # Empty system prompt for minimal
    return "\n".join(PROMPT_COMPONENTS[k] for k in component_keys)


def run_ablation_study(model, df, configs, desc="Ablation"):
    """Run ablation study with given configurations. Returns dict of {config_name: macro_f1}."""
    results = {}
    
    for config_name, component_keys in tqdm(configs.items(), desc=desc):
        sys_prompt = build_ablation_sys_prompt(component_keys)
        ablation_results = evaluate_prompting(
            model, df, mode="zero-shot", examples=None, 
            sys_prompt_override=sys_prompt, verbose=False
        )
        results[config_name] = ablation_results['macro_f1']
        print(f"  {config_name}: {ablation_results['macro_f1']:.4f}")
    
    return results


def _get_ablation_color(config_name):
    """Get bar color based on config name."""
    if config_name == 'minimal':
        return COMPONENT_COLOURS['minimal']
    elif 'full' in config_name:
        return COMPONENT_COLOURS['full']
    for comp in COMPONENT_COLOURS:
        if comp in config_name:
            return COMPONENT_COLOURS[comp]
    return COMPONENT_COLOURS['minimal']


def plot_ablation(results, title, save_path=None, show_delta=False):
    """Plot horizontal bar chart for ablation results."""
    df_plot = pd.DataFrame({
        'config': list(results.keys()),
        'score': list(results.values())
    })
    df_plot['color'] = [_get_ablation_color(c) for c in df_plot['config']]
    palette = dict(zip(df_plot['config'], df_plot['color']))
    
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=df_plot, y='config', x='score', hue='config', palette=palette, 
                ax=ax, edgecolor='white', legend=False)
    
    ax.set_xlabel('Macro F1 Score')
    ax.set_ylabel('')
    ax.set_title(f'{title}\n(Zero-Shot on Dev Set)')
    ax.set_xlim(0, df_plot['score'].max() * 1.15)
    
    for i, (score, config) in enumerate(zip(df_plot['score'], df_plot['config'])):
        ax.text(score + 0.005, i, f'{score:.3f}', va='center', fontsize=10)
    
    if 'minimal' in results:
        ax.axvline(x=results['minimal'], color=COMPONENT_COLOURS['minimal'], 
                   linestyle='--', alpha=0.7, linewidth=1.5, label='Minimal baseline')
        ax.legend(loc='lower right')
    
    if show_delta:
        scores = list(results.values())
        for i in range(1, len(scores)):
            delta = scores[i] - scores[i-1]
            sign = '+' if delta >= 0 else ''
            ax.text(scores[i] + 0.04, i - 0.25, f'{sign}{delta:.3f}', 
                    fontsize=9, color='dimgray', style='italic')
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")
    plt.show()
    return fig


# Run isolated ablation (each component individually)
isolated_results = run_ablation_study(model, use_df, ISOLATED_CONFIGS, desc="Isolated Ablation")
plot_ablation(isolated_results, 'System Prompt Ablation: Isolated Component Contribution', 
              save_path=f"{RESULTS_DIR}ablation_isolated.png")

# Run cumulative ablation (stacking components)
cumulative_results = run_ablation_study(model, use_df, CUMULATIVE_CONFIGS, desc="Cumulative Ablation")
plot_ablation(cumulative_results, 'System Prompt Ablation: Cumulative Component Stacking', 
              save_path=f"{RESULTS_DIR}ablation_cumulative.png", show_delta=True)

Isolated Ablation:   0%|          | 0/6 [00:00<?, ?it/s]

Evaluating (zero-shot):   0%|          | 0/281 [00:00<?, ?it/s]

  minimal: 0.1321


Evaluating (zero-shot):   0%|          | 0/281 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Few-shot

In [None]:
# few-shot
few_shot_examples = get_few_shot_examples(train_df, n_per_class=1)
few_results = evaluate_prompting(model, use_df, mode="few-shot", examples=few_shot_examples)

## CoT Prompting

In [47]:
# CoT prompting

cot_results = evaluate_prompting(model, use_df, mode="cot", examples=None)


Evaluating (cot):   0%|          | 0/281 [00:00<?, ?it/s]

Index: 1
Raw Model Output: 'Step 1: Identify the claim in the [SRC] tweet - a plane crash in the French Alps has occurred, which is a verifiable event that likely happened true (no need to verify the details of the crash itself, just its occurrence is true) or at least not false (given the information provided in the [SRC] tweet). The claim is likely true. The tone of the [SRC] tweet is somber, but the language used does not directly deny or question the occurrence of'
------------------------------
Index: 5
Raw Model Output: 'Step 1: Analyze the language used in the TARGET tweet. The user is expressing sadness and shock, but also explicitly states that it was a deliberate suicide, which is a fact confirmed by the news source mentioned in the SRC tweet (Germanwings Airbus A320 crash in French Alps). This implies that the TARGET is not questioning the veracity of the event, but rather adding more information to the story that is already known to be true by the general public and the new

In [53]:
# test a single eg
row = dev_df.iloc[5]
input_text = format_input_with_context(row, dev_df, use_features=False, use_context=USE_CTX)
messages = build_messages(input_text, None, COT_SYS_PROMPT) 
# messages[-1]['content'] += "\nLet's think step-by-step." # for cot
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
response = model(prompt, COT_REGEX, max_new_tokens=MAX_COT_TOKENS)
print(f"Prompt: {prompt}\n")
print(f"Response: {response}\n")
print(f"Prediction: {parse_cot_label(response)}")
print(f"True label: {row['label_text']}")


Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 02 Jan 2026

You are an expert in rumour stance analysis on twitter.
Your task is to classify the stance of the [TARGET] tweet towards the veracity of the rumour in the [SRC] tweet.
The input will be provided as a single string containing labeled segments:
"[SRC] ... [PARENT] ... [TARGET] ...". (Note: [PARENT] is omitted if [TARGET] replies directly to [SRC])

Classification Labels:
- SUPPORT: The reply explicitly supports or provides evidence for the veracity of the source claim
- DENY: The tweet explicitly denies or provides counter-evidence for the veracity of the source claim
- QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
- COMMENT: The reply makes their own opinionated/neutral comment without a clear contribution to assessing the veracity of the source claim

Your response should follow this EXACT format:
Reasoning:

In [None]:
# few shot cot
cot_few_shot_examples = get_cot_examples(train_df)
cot_few_results = evaluate_prompting(model, use_df, mode="cot", examples=cot_few_shot_examples)