In [1]:
import os
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import AutoModelForCausalLM, AutoTokenizer
import outlines # see https://dottxt-ai.github.io/outlines/latest/features/models/transformers/
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from typing import Literal

from data_loader import load_dataset, format_input_with_context


In [2]:
# --- Experiment configuration: everything a reader might tune lives in this cell ---
MODEL_NAME = 'meta-llama/Llama-3.2-3B-Instruct'  # model id handed to from_pretrained() in the model-loading cell
USE_CTX = False # whether to use [CTX] parts of threads (too slow for True)
MAX_COT_TOKENS = 200 # max n tokens for cot response
VALID_STANCES = ['SUPPORT', 'DENY', 'QUERY', 'COMMENT']  # NOTE(review): not referenced by the cells visible here
RELOAD_MODEL = False # takes up lots of mem if we keep reloading

# Keep the trailing slash: output paths are built by plain string concatenation with RESULTS_DIR.
RESULTS_DIR = "./results/prompting/"
RANDOM_SEED = 42
os.makedirs(RESULTS_DIR, exist_ok=True)
# Seeds NumPy's global RNG only; nothing in this cell seeds torch.
np.random.seed(RANDOM_SEED)

# load data
train_df, dev_df, test_df = load_dataset()
use_df = dev_df  # df to test on

Loading cached data from saved_data/datasets.pkl...


In [3]:
# load model

# output types for outlines
# StanceLabel constrains generation to exactly one of the four stance strings.
StanceLabel = Literal["SUPPORT", "DENY", "QUERY", "COMMENT"]
# COT_REGEX: any text (including newlines) that ends with "Label: <STANCE>".
COT_REGEX = outlines.types.Regex(r"[\s\S]*Label: (SUPPORT|DENY|QUERY|COMMENT)")

# Only (re)load when no `model` exists in the kernel yet, or when RELOAD_MODEL forces it.
# NOTE(review): `tokenizer` and `hf_model` are assigned only inside this branch, yet later cells
# read the global `tokenizer`; the guard checks `model` alone.
if 'model' not in globals() or RELOAD_MODEL:
    hf_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        dtype=torch.float16,
        device_map='auto',
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # use outlines
    model = outlines.from_transformers(hf_model, tokenizer)
    
    # get rid of annoying warnings re pad_token
    model.tokenizer.pad_token = tokenizer.eos_token
    model.tokenizer.padding_side = "left"
    model.model.generation_config.pad_token_id = model.tokenizer.pad_token_id

print(MODEL_NAME)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-3.2-3B-Instruct


In [4]:
# prompts
# see https://huggingface.co/docs/transformers/en/tasks/prompting
# https://www.promptingguide.ai/techniques/zeroshot

# The system prompt is assembled from four independent pieces so that the ablation
# cell can switch each piece on or off individually.
PERSONA = """You are an expert in rumour stance analysis on twitter."""
INSTRUCTION = """Your task is to classify the stance of the [TARGET] tweet towards the veracity of the rumour in the [SRC] tweet."""
INPUT_FORMAT = """\
The input will be provided as a single string containing labeled segments:
"[SRC] ... [PARENT] ... [TARGET] ...". (Note: [PARENT] is omitted if [TARGET] replies directly to [SRC])
"""
# NOTE(review): the DENY definition says "The tweet" while the other three say "The reply";
# left as-is because changing the wording changes the prompt under evaluation.
LABEL_DEFNS = """\
Classification Labels:
- SUPPORT: The reply explicitly supports or provides evidence for the veracity of the source claim
- DENY: The tweet explicitly denies or provides counter-evidence for the veracity of the source claim
- QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
- COMMENT: The reply makes their own opinionated/neutral comment without a clear contribution to assessing the veracity of the source claim
"""

# Default system prompt used for the zero-shot and few-shot modes.
SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
"""
# output format set by outlines

# User-turn template; {thread_context} is filled in by build_user_prompt().
USER_PROMPT_TEMPLATE = """\
Text: {thread_context}

Classify the stance of [TARGET] towards the veracity of the rumour in [SRC]:
"""

def build_user_prompt(thread_context):
    """Return the user-turn text for one thread.

    Fills the ``{thread_context}`` placeholder of USER_PROMPT_TEMPLATE with the
    given (already formatted) thread string.
    """
    user_prompt = USER_PROMPT_TEMPLATE.format(thread_context=thread_context)
    return user_prompt


def build_messages(thread_context, examples=None, sys_prompt=None):
    """Assemble the chat message list for a single classification request.

    Args:
        thread_context: formatted thread string for the item to classify.
        examples: optional few-shot demonstrations; each is a dict with a
            'source' (formatted thread) and a 'response' (assistant answer).
        sys_prompt: custom system prompt (e.g. for CoT/ablations); when None
            the default SYS_PROMPT is used.

    Returns:
        A list of ``{"role", "content"}`` dicts: one system message, a
        user/assistant pair per example, then the final user message.
    """
    system_text = SYS_PROMPT if sys_prompt is None else sys_prompt
    chat = [{"role": "system", "content": system_text}]

    # few-shot demonstrations are replayed as earlier turns of the conversation
    for shot in (examples or []):
        chat.extend([
            {"role": "user", "content": build_user_prompt(shot['source'])},
            {"role": "assistant", "content": shot['response']},
        ])

    # the turn the model actually has to answer
    chat.append({"role": "user", "content": build_user_prompt(thread_context)})
    return chat


In [5]:
# COT prompts 

# add output format to think step by step
COT_OUTPUT_FORMAT = """\
Your response should follow this EXACT format:
Reasoning: [VERY BRIEFLY explain why the label fits. Use a two stage strategy: Stage 1 classify Stance vs Non-Stance - If Non-stance, skip stage 2 and answer 'COMMENT'. Otherwise: Stage 2 classify Support / Deny / Query]
Label: [EXACTLY one of: SUPPORT, DENY, QUERY, or COMMENT]
"""

# Same pieces as SYS_PROMPT plus the explicit Reasoning/Label output format.
COT_SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
{COT_OUTPUT_FORMAT}
"""



# for few-shot cot

# handselected ids for reasoning
# Maps lowercase stance -> tweet_id; get_cot_examples() looks the id up in train_df['tweet_id'].
COT_FEW_SHOT_IDS = {
    'support': '524967134339022848', # @TheKirkness Radio Canada tweeting same. must be true :-(
    'deny': '544292581950357504', # @JSchoenberger7 @TheAnonMessage not an Isis flag. Just an Islamic one. Stop spreading false rumors.
    'query': '544281192632053761', # @mscott Are your reporters 100% sure it is an ISIS flag? Cause that is what is being reported. #facts
    'comment': '552804023389392896', # @thei100 @Independent these fuckers thinking its a GTA heist mission
}

# Hand-written assistant answers paired with the tweets above (same keys).
# NOTE(review): these answers start with "1. Stance: ..." rather than the "Reasoning: ..."
# prefix that COT_OUTPUT_FORMAT asks for - confirm the mismatch is intentional.
COT_EXAMPLES = {
    'support': "1. Stance: takes position on veracity\n2. \"must be true\" → affirms claim\nLabel: SUPPORT",
    'deny': "1. Stance: challenges veracity\n2. \"not an Isis flag\", \"false rumors\" → rejects claim\nLabel: DENY",
    'query': "1. Stance: engages with veracity\n2. \"Are your reporters 100% sure?\" → requests evidence\nLabel: QUERY",
    'comment': "1. Stance: no veracity assessment\n2. Offers opinion/reaction only\nLabel: COMMENT",
}

def get_cot_examples(train_df):
    """Build the few-shot chain-of-thought demonstrations.

    For every stance in COT_FEW_SHOT_IDS the matching row of ``train_df`` is
    formatted into a thread string and paired with the hand-written answer
    from COT_EXAMPLES. Ids that are not present in ``train_df`` are skipped
    with a printed warning.

    Returns:
        A list of ``{'source': ..., 'response': ...}`` dicts.
    """
    shots = []
    for stance, example_id in COT_FEW_SHOT_IDS.items():
        hits = train_df[train_df['tweet_id'] == example_id]
        if len(hits) > 0:
            first_hit = hits.iloc[0]
            thread_text = format_input_with_context(
                first_hit, train_df, use_features=False, use_context=USE_CTX
            )
            shots.append({'source': thread_text, 'response': COT_EXAMPLES[stance]})
        else:
            print(f"Warning: COT example tweet_id {example_id} not found")
    return shots



In [6]:
def parse_cot_label(text):
    """Extract the final stance label from a chain-of-thought response.

    The response may mention "Label: X" more than once (COT_REGEX allows
    arbitrary text, including earlier "Label:" mentions, before the closing
    label), so the LAST occurrence is taken as the model's answer.

    Args:
        text: raw model output. Non-string input (e.g. None) is tolerated.

    Returns:
        The lowercase label ('support', 'deny', 'query' or 'comment'), or
        None when no "Label: <STANCE>" pattern is present.
    """
    if not isinstance(text, str):
        return None
    found = re.findall(r'Label:\s*(SUPPORT|DENY|QUERY|COMMENT)', text, re.IGNORECASE)
    # the last match is the final verdict; earlier ones belong to the reasoning
    return found[-1].lower() if found else None

In [7]:

# LLM --
def plot_confusion_matrix(cm, mode, save_path=None):
    """Draw a 4x4 stance confusion matrix as an annotated heatmap.

    Args:
        cm: confusion matrix of integer counts, rows = true, columns = predicted,
            ordered support / deny / query / comment.
        mode: run name shown in the figure title.
        save_path: optional file path; when given the figure is also written there.
    """
    stance_names = ['support', 'deny', 'query', 'comment']

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=stance_names,
        yticklabels=stance_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix ({mode})')

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")
    plt.show()


def save_results(results, model_key, mode, results_dir=None):
    """Save evaluation results to JSON.

    Args:
        results: dict as returned by evaluate_prompting(); must contain
            'macro_f1', 'per_class_f1', 'predictions', 'true_labels' and
            'raw_responses'.
        model_key: model identifier. Stored verbatim in the JSON; path
            separators are replaced by '_' in the file name so that hub-style
            ids such as 'org/model' do not point into a missing subdirectory.
        mode: run name (e.g. 'zero-shot'); also part of the file name.
        results_dir: output directory. Defaults to the notebook-level
            RESULTS_DIR when None.

    Side effects:
        Creates the output directory if needed, writes
        ``<results_dir>/<model_key>_<mode>_results.json`` and prints its path.

    Raises:
        KeyError: if a required key is missing from ``results``.
        TypeError: if ``results`` holds a value that cannot be serialised.
    """
    def _to_builtin(obj):
        # json `default` hook: NumPy scalars/arrays expose tolist(), which
        # yields plain Python numbers/lists that json can encode.
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def _safe(part):
        # collapse path separators so the value stays a single file-name component
        return re.sub(r'[\\/]+', '_', str(part))

    out_dir = RESULTS_DIR if results_dir is None else results_dir
    os.makedirs(out_dir, exist_ok=True)

    output = {
        'model': model_key,
        'mode': mode,
        'macro_f1': results['macro_f1'],
        'per_class_f1': results['per_class_f1'],
        'predictions': results['predictions'],
        'true_labels': results['true_labels'],
        'raw_responses': results['raw_responses']
    }
    filename = os.path.join(out_dir, f"{_safe(model_key)}_{_safe(mode)}_results.json")
    with open(filename, 'w') as f:
        json.dump(output, f, indent=2, default=_to_builtin)
    print(f"Saved: {filename}")

In [12]:
# few shot examples
# Each table maps a lowercase stance to the tweet_id (string) of its demonstration tweet;
# get_few_shot_examples() looks the ids up in df['tweet_id']. The trailing comments quote the tweet text.

# from different sources and topics
DIVERSE_FEW_SHOT_IDS = {
    'support': '553486439129038848', # 'BREAKING: French police tell AP suspects in #CharlieHebdo attack have taken a hostage northeast of Paris.'
    'deny': '500384255814299648', # '@TPM @McBlondeLand Awful! The DOJ needs to do thorough investigation of this PD-these problems are systemic. The citizens deserve better!'
    'query': '525003827494531073', # '@CTVNews @ctvsaskatoon so what happened at Rideau? Nothing?'
    'comment': '544314008980172800', # "@WilliamsJon @morgfair how do we know the guy isn't just pissed that he got fired or foreclosed on. Or his gf is cheating on him."
}
    
# From same source thread
# NOTE(review): SAME_SRC_SOURCE_ID is not referenced by the cells visible here.
SAME_SRC_SOURCE_ID = '529739968470867968' # 'Prince not playing Massey Hall tonight, promoter says http://t.co/clVHvM07kx http://t.co/IADirsFOOC'
SAME_SRC_FEW_SHOT_IDS = {
    'support': '529740822238232577', # 'athenaph what tease!!! RT @CBCNews: Prince not playing Massey Hall tonight, promoter says http://t.co/245JKmwWHH'
    'deny': '529740809672077313', # '.@CBCNews I can also confirm he will not be playing in Rankin Inlet, NU either'
    'query': '529748991849013248', # '@CBCNews And, ... ?\nWhen @CBCNews asked Live Nation why no one made this clarification 4 or 5 hours earlier?'
    'comment': '529741574742507520', # '@CBCNews  lol.'
}

# random stratified selection (one per stance but randomly picked)
RANDOM_STRATIFIED_FEW_SHOT_IDS = {
    'support': '544306719686656000', # 'US evacuated Consulate in Sydney after #sydneysiege. Emergency warning to U.S. citizens urging them to "maintain high level of vigilance".'
    'deny': '524990163446140928', # 'Police have clarified that there were two shootings in Ottawa today, not three: at the War Memorial and Parliament Hill.'
    'query': '500386447158161408', # '@TPM Wow. I thought a Tweep was being sarcastic, but...it really happened that way?!?'
    'comment': '553543395717550080', # '@FoxNews @Machma7Machos the terrorists need empathy send them to Harvard University'
}

# Names accepted by get_few_shot_examples(use_set=...).
FEW_SHOT_SETS = {
    'diverse': DIVERSE_FEW_SHOT_IDS,
    'same_src': SAME_SRC_FEW_SHOT_IDS,
    'random': RANDOM_STRATIFIED_FEW_SHOT_IDS,
}


def get_few_shot_examples(df, n_per_class=1, use_set=None, random_state=None):
    """Collect few-shot demonstrations (thread text + gold label) from ``df``.

    Args:
        df: labelled DataFrame with 'tweet_id' and 'label_text' columns; also
            passed to format_input_with_context() to render each thread.
        n_per_class: number of rows sampled per stance in the random branch
            (capped at the number of rows available for that stance).
        use_set: name of a predefined id table in FEW_SHOT_SETS. When given,
            those fixed tweets are used and ``n_per_class`` / ``random_state``
            are ignored.
        random_state: optional seed / RNG forwarded to ``DataFrame.sample`` so
            the random branch can be reproduced. None keeps the previous
            behaviour (pandas falls back to its default randomness).

    Returns:
        A list of ``{'source': <formatted thread>, 'response': <UPPERCASE label>}``
        dicts, ordered support, deny, query, comment in the random branch and
        by table order in the predefined branch.

    Raises:
        ValueError: if ``use_set`` is not a key of FEW_SHOT_SETS.
    """
    def _as_example(row, label):
        # one demonstration: rendered thread plus the label the assistant should output
        return {
            'source': format_input_with_context(row, df, use_features=False, use_context=USE_CTX),
            'response': label.upper()
        }

    examples = []

    if use_set is not None:
        # deterministic using defined sets
        if use_set not in FEW_SHOT_SETS:
            raise ValueError(f"Unknown use_set '{use_set}'. Choose from: {list(FEW_SHOT_SETS.keys())}")

        for label, tweet_id in FEW_SHOT_SETS[use_set].items():
            matches = df[df['tweet_id'] == tweet_id]
            if len(matches) == 0:
                # keep going with the remaining stances instead of failing the whole run
                print(f"Warning: Few-shot example tweet_id {tweet_id} not found in df")
                continue
            examples.append(_as_example(matches.iloc[0], label))
        return examples

    # random stratified sampling; reproducible only when random_state is supplied
    for label in ['support', 'deny', 'query', 'comment']:
        class_df = df[df['label_text'] == label]
        samples = class_df.sample(n=min(n_per_class, len(class_df)), random_state=random_state)
        examples.extend(_as_example(row, label) for _, row in samples.iterrows())

    return examples
    


In [10]:
def evaluate_prompting(model, df, mode="zero-shot", examples=None, sys_prompt_override=None, verbose=True, run_name=None):
    """Run prompted stance classification over ``df`` and score it.

    Args:
        model: outlines model; called as ``model(prompt, output_type, max_new_tokens=...)``.
        df: evaluation DataFrame; needs a 'label_text' column and whatever
            format_input_with_context() reads.
        mode: 'zero-shot', 'few-shot' or 'cot'. 'cot' generates free text
            ending in "Label: X" (COT_REGEX, up to MAX_COT_TOKENS tokens);
            the other modes constrain the output to a single StanceLabel.
        examples: optional few-shot demonstrations (see build_messages()).
        sys_prompt_override: system prompt to use instead of the mode's
            default (used by the ablation study).
        verbose: print metrics, parse warnings and the saved CSV path.
        run_name: optional name used for the predictions CSV instead of
            ``mode``. Pass it when several runs share a mode (ablations,
            few-shot CoT) so they do not overwrite each other's file.

    Returns:
        dict with 'predictions', 'true_labels', 'raw_responses', 'macro_f1',
        'per_class_f1' (label -> F1) and 'confusion_matrix' (rows/cols in
        support, deny, query, comment order).

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.

    Side effects:
        Writes ``{RESULTS_DIR}predictions_<run_name or mode>.csv``.
    """
    labels = ['support', 'deny', 'query', 'comment']

    # Validate up front so a typo in `mode` fails fast, even when a system
    # prompt override is supplied.
    if mode not in ("zero-shot", "few-shot", "cot"):
        raise ValueError(f"Unrecognised mode: {mode}. Choose from 'zero-shot', 'few-shot', or 'cot'.")
    is_cot = mode == "cot"

    # set output params
    if is_cot:
        output_type = COT_REGEX
        max_new_tokens = MAX_COT_TOKENS
    else:
        output_type = StanceLabel
        max_new_tokens = 10

    # get correct sys prompt for messages
    if sys_prompt_override is not None:  # for ablations
        active_sys_prompt = sys_prompt_override
    elif is_cot:
        active_sys_prompt = COT_SYS_PROMPT
    else:
        active_sys_prompt = SYS_PROMPT  # standard

    # build every prompt string before generating
    all_prompts = []
    for _, row in df.iterrows():
        input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
        messages = build_messages(input_text, examples=examples, sys_prompt=active_sys_prompt)
        # get prompt string from messages
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        all_prompts.append(prompt)

    # get responses
    raw_responses = []
    predictions = []
    for prompt in tqdm(all_prompts, desc=f"Evaluating ({mode})"):
        response = model(prompt, output_type, max_new_tokens=max_new_tokens)
        raw_responses.append(response)

        if is_cot:
            pred = parse_cot_label(response)  # may return None
        else:
            pred = response.lower()  # outlines fixes to a valid stance
        predictions.append(pred)

    # parsing errors (only for cot)
    if is_cot:
        error_idxs = [i for i, p in enumerate(predictions) if p is None]
        if error_idxs and verbose:
            print(f"Warning: {len(error_idxs)} errors in predictions. Replacing with 'comment'.")
            for idx in error_idxs[:5]:  # Show first 5 errors only
                print(f"Index: {idx}\nRaw Model Output: '{raw_responses[idx]}'\n" + "-" * 30)

        # set errors to comment (majority class)
        predictions = [p if p is not None else 'comment' for p in predictions]

    true_labels = df['label_text'].tolist()

    # metrics: pin `labels` for BOTH scores so the macro average always covers
    # the same four classes as the per-class breakdown, and set zero_division
    # explicitly to match classification_report below.
    macro_f1 = f1_score(true_labels, predictions, average='macro', labels=labels, zero_division=0.0)
    per_class_f1 = f1_score(true_labels, predictions, average=None, labels=labels, zero_division=0.0)

    if verbose:
        print(f"\n{'='*60}\nResults ({mode})\n{'='*60}")
        print(f"Macro F1: {macro_f1:.4f}\n\nPer-class F1:")
        for lbl, f1 in zip(labels, per_class_f1):
            print(f"  {lbl}: {f1:.4f}")
        print(f"\n{classification_report(true_labels, predictions, labels=labels, zero_division=0.0)}")

    # save and return predictions
    results_df = pd.DataFrame({'true_label': true_labels, 'predicted': predictions})
    csv_tag = mode if run_name is None else run_name
    csv_path = f"{RESULTS_DIR}predictions_{csv_tag}.csv"
    results_df.to_csv(csv_path, index=False)
    if verbose:
        print(f"Saved predictions: {csv_path}")

    return {
        'predictions': predictions,
        'true_labels': true_labels,
        'raw_responses': raw_responses,
        'macro_f1': macro_f1,
        'per_class_f1': dict(zip(labels, per_class_f1)),
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=labels)
    }

## Zero-shot

In [54]:
# zero-shot
# Baseline: default SYS_PROMPT, no in-context examples, evaluated on use_df.
zero_results = evaluate_prompting(model, use_df, mode="zero-shot")

Evaluating (zero-shot):   0%|          | 0/281 [00:00<?, ?it/s]


Results (zero-shot)
Macro F1: 0.1008

Per-class F1:
  support: 0.0000
  deny: 0.1714
  query: 0.2204
  comment: 0.0115

              precision    recall  f1-score   support

     support       0.00      0.00      0.00        69
        deny       0.10      0.55      0.17        11
       query       0.12      0.96      0.22        28
     comment       1.00      0.01      0.01       173

    accuracy                           0.12       281
   macro avg       0.31      0.38      0.10       281
weighted avg       0.63      0.12      0.04       281

Saved predictions: ./results/prompting/predictions_zero-shot.csv


In [56]:
sns.set_theme(style="whitegrid", font_scale=1.1)

# Component dictionary for easy reference (COT_OUTPUT_FORMAT is left out - it is only used for CoT)
PROMPT_COMPONENTS = {
    'persona': PERSONA,
    'instruction': INSTRUCTION,
    'input_format': INPUT_FORMAT,
    'label_defns': LABEL_DEFNS,
}

# Build colour palette: light grey for minimal, active seaborn palette for components, black for full
# NOTE(review): the colours come from whatever palette sns.set_theme() activated above,
# so they are not fixed hex values.
_default_palette = sns.color_palette()
COMPONENT_COLOURS = {
    'minimal': '#D3D3D3',
    'persona': _default_palette[0], # 1st colour of the active palette
    'instruction': _default_palette[1], # 2nd colour of the active palette
    'input_format': _default_palette[2], # 3rd colour of the active palette
    'label_defns': _default_palette[3], # 4th colour of the active palette
    'full': '#000000',
}

# Both tables map a display name -> list of PROMPT_COMPONENTS keys joined into the system prompt.

# ISOLATED: minimal + ONE component (shows individual contribution)
ISOLATED_CONFIGS = {
    'minimal': [],
    '+persona': ['persona'],
    '+instruction': ['instruction'],
    '+input_format': ['input_format'],
    '+label_defns': ['label_defns'],
    'full': ['persona', 'instruction', 'input_format', 'label_defns'],
}

# CUMULATIVE: stacking components one by one
CUMULATIVE_CONFIGS = {
    'minimal': [],
    '+persona': ['persona'],
    '+instruction': ['persona', 'instruction'],
    '+input_format': ['persona', 'instruction', 'input_format'],
    '+label_defns (full)': ['persona', 'instruction', 'input_format', 'label_defns'],
}


def build_ablation_sys_prompt(component_keys):
    """Join the selected prompt components into one system prompt.

    Args:
        component_keys: keys of PROMPT_COMPONENTS, in the order they should
            appear. An empty/None value stands for the 'minimal' config.

    Returns:
        The components separated by newlines, or "" when nothing is selected.
    """
    # nothing selected -> empty system prompt (joining an empty list gives "")
    selected = [PROMPT_COMPONENTS[key] for key in component_keys] if component_keys else []
    return "\n".join(selected)


def run_ablation_study(model, df, configs, desc="Ablation"):
    """Evaluate every system-prompt configuration in zero-shot mode.

    Args:
        model: model handed through to evaluate_prompting().
        df: evaluation DataFrame.
        configs: mapping of config name -> list of prompt component keys.
        desc: label for the outer progress bar.

    Returns:
        dict of {config_name: macro_f1}, in the iteration order of ``configs``.
    """
    scores = {}
    progress = tqdm(configs.items(), desc=desc)
    for name, keys in progress:
        outcome = evaluate_prompting(
            model,
            df,
            mode="zero-shot",
            examples=None,
            sys_prompt_override=build_ablation_sys_prompt(keys),
            verbose=False,
        )
        macro = outcome['macro_f1']
        scores[name] = macro
        print(f"  {name}: {macro:.4f}")
    return scores


def _get_ablation_color(config_name):
    """Pick the bar colour for an ablation config from COMPONENT_COLOURS.

    'minimal' (exact name) gets the minimal colour and any name containing
    'full' gets the full colour. Otherwise the first COMPONENT_COLOURS key
    that occurs in the name decides; if none does, the minimal colour is
    used as a fallback.
    """
    if config_name == 'minimal':
        return COMPONENT_COLOURS['minimal']
    if 'full' in config_name:
        return COMPONENT_COLOURS['full']

    # keys are scanned in dict order, so the earliest-defined match wins
    hits = [key for key in COMPONENT_COLOURS if key in config_name]
    chosen_key = hits[0] if hits else 'minimal'
    return COMPONENT_COLOURS[chosen_key]


def plot_ablation(results, title, save_path=None, show_delta=False):
    """Draw ablation scores as a horizontal bar chart.

    Args:
        results: mapping of config name -> macro F1, plotted in dict order.
        title: first line of the figure title.
        save_path: optional file path; when given the figure is also saved.
        show_delta: annotate each bar (from the second on) with the change
            relative to the previous bar.

    Returns:
        The matplotlib Figure.
    """
    config_names = list(results.keys())
    score_values = list(results.values())
    plot_frame = pd.DataFrame({'config': config_names, 'score': score_values})
    bar_palette = {name: _get_ablation_color(name) for name in config_names}

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=plot_frame,
        y='config',
        x='score',
        hue='config',
        palette=bar_palette,
        ax=ax,
        edgecolor='white',
        legend=False,
    )

    ax.set_xlabel('Macro F1 Score')
    ax.set_ylabel('')
    ax.set_title(f'{title}\n(Zero-Shot on Dev Set)')
    # leave headroom on the right for the value labels
    ax.set_xlim(0, plot_frame['score'].max() * 1.15)

    # value label just past the end of every bar
    for bar_idx, value in enumerate(plot_frame['score']):
        ax.text(value + 0.005, bar_idx, f'{value:.3f}', va='center', fontsize=10)

    # dashed reference line at the minimal-prompt score
    if 'minimal' in results:
        ax.axvline(
            x=results['minimal'],
            color=COMPONENT_COLOURS['minimal'],
            linestyle='--',
            alpha=0.7,
            linewidth=1.5,
            label='Minimal baseline',
        )
        ax.legend(loc='lower right')

    if show_delta:
        for bar_idx in range(1, len(score_values)):
            step = score_values[bar_idx] - score_values[bar_idx - 1]
            prefix = '+' if step >= 0 else ''
            ax.text(
                score_values[bar_idx] + 0.04,
                bar_idx - 0.25,
                f'{prefix}{step:.3f}',
                fontsize=9,
                color='dimgray',
                style='italic',
            )

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")
    plt.show()
    return fig


# NOTE(review): every config below goes through evaluate_prompting(mode="zero-shot"), which
# writes f"{RESULTS_DIR}predictions_zero-shot.csv" on each call - so these runs overwrite the
# predictions file of the plain zero-shot cell (and of each other).

# Run isolated ablation (each component individually)
isolated_results = run_ablation_study(model, use_df, ISOLATED_CONFIGS, desc="Isolated Ablation")
plot_ablation(isolated_results, 'System Prompt Ablation: Isolated Component Contribution', 
              save_path=f"{RESULTS_DIR}ablation_isolated.png")

# Run cumulative ablation (stacking components)
cumulative_results = run_ablation_study(model, use_df, CUMULATIVE_CONFIGS, desc="Cumulative Ablation")
plot_ablation(cumulative_results, 'System Prompt Ablation: Cumulative Component Stacking', 
              save_path=f"{RESULTS_DIR}ablation_cumulative.png", show_delta=True)

Isolated Ablation:   0%|          | 0/6 [00:00<?, ?it/s]

Evaluating (zero-shot):   0%|          | 0/281 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Few-shot

In [13]:
# few-shot
# No use_set is passed, so one example per stance is drawn at random from train_df.
# NOTE(review): the draw is presumably governed by NumPy's global RNG state, so the chosen
# examples depend on which cells ran before this one - confirm if results must be reproducible.
few_shot_examples = get_few_shot_examples(train_df, n_per_class=1)
few_results = evaluate_prompting(model, use_df, mode="few-shot", examples=few_shot_examples)

Evaluating (few-shot):   0%|          | 0/281 [00:00<?, ?it/s]


Results (few-shot)
Macro F1: 0.3789

Per-class F1:
  support: 0.3977
  deny: 0.2963
  query: 0.4065
  comment: 0.4153

              precision    recall  f1-score   support

     support       0.33      0.51      0.40        69
        deny       0.25      0.36      0.30        11
       query       0.26      0.89      0.41        28
     comment       0.78      0.28      0.42       173

    accuracy                           0.40       281
   macro avg       0.40      0.51      0.38       281
weighted avg       0.60      0.40      0.41       281

Saved predictions: ./results/prompting/predictions_few-shot.csv


## CoT Prompting

In [14]:
# CoT prompting
# Zero-shot chain-of-thought: COT_SYS_PROMPT, no examples; predictions go to predictions_cot.csv.

cot_results = evaluate_prompting(model, use_df, mode="cot", examples=None)


Evaluating (cot):   0%|          | 0/281 [00:00<?, ?it/s]

Index: 65
Raw Model Output: 'Reasoning: Stage 1: Stance vs Non-Stance - [TARGET] is responding to the source claim, so it is a stance. Stage 2: Support / Deny / Query - The reply does not explicitly support or deny the veracity of the source claim, nor does it ask for additional evidence. Instead, it makes a comment, "another great tragedy", which is an opinionated statement. Therefore, the label is: COMMENT. [COMMENT] The reply is a neutral comment without a clear contribution to assessing the veracity of the source claim. It does not provide any direct support, denial, or query, but rather expresses sympathy or shock. The tone is also somewhat informal, which is typical of a condolence message. The reply does not directly engage with the veracity of the claim, but rather reacts to the news. Therefore, the reply does not fit into the categories of Support, Deny, or Query. A more fitting response might have been "Did'
------------------------------
Index: 95
Raw Model Output: 'Reasonin

In [16]:
# test a single eg
# Debug cell: runs the CoT pipeline end-to-end on one dev row and prints every stage.
row = dev_df.iloc[5]
input_text = format_input_with_context(row, dev_df, use_features=False, use_context=USE_CTX)
messages = build_messages(input_text, None, COT_SYS_PROMPT) 
# messages[-1]['content'] += "\nLet's think step-by-step." # for cot
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
response = model(prompt, COT_REGEX, max_new_tokens=MAX_COT_TOKENS)
print(f"Prompt: {prompt}\n")
print(f"Response: {response}\n")
print(f"Prediction: {parse_cot_label(response)}")  # None when no "Label: ..." was produced
print(f"True label: {row['label_text']}")


Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 02 Jan 2026

You are an expert in rumour stance analysis on twitter.
Your task is to classify the stance of the [TARGET] tweet towards the veracity of the rumour in the [SRC] tweet.
The input will be provided as a single string containing labeled segments:
"[SRC] ... [PARENT] ... [TARGET] ...". (Note: [PARENT] is omitted if [TARGET] replies directly to [SRC])

Classification Labels:
- SUPPORT: The reply explicitly supports or provides evidence for the veracity of the source claim
- DENY: The tweet explicitly denies or provides counter-evidence for the veracity of the source claim
- QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
- COMMENT: The reply makes their own opinionated/neutral comment without a clear contribution to assessing the veracity of the source claim

Your response should follow this EXACT format:
Reasoning:

In [15]:
# few shot cot
# NOTE(review): mode is "cot" here as well, so evaluate_prompting() writes predictions_cot.csv
# again and replaces the file produced by the zero-shot CoT cell.
cot_few_shot_examples = get_cot_examples(train_df)
cot_few_results = evaluate_prompting(model, use_df, mode="cot", examples=cot_few_shot_examples)

Evaluating (cot):   0%|          | 0/281 [00:00<?, ?it/s]

Index: 4
Raw Model Output: '1. Stance: acknowledges veracity indirectly (through shared source URL, but no direct statement on factuality) and expresses sympathy, but doesn't provide any additional information or comment on the incident's veracity itself. However, given the absence of any direct challenge, lack of denial, or query, the sentiment is sympathetic rather than informative or argumentative. It can be argued that it is a neutral comment rather than a stance on the veracity of the claim. However, in the context of your original request, a more fitting response would be to classify it as a COMMENT. Since the target does not directly address the veracity of the claim, the response does not challenge or support the claim but rather provides a sympathetic response. Therefore, a more fitting response would be to label it as COMMENT. However, a more nuanced approach might be to label it as QUERY in the sense that it seeks to understand the context of the incident, but in the context