In [1]:
import os
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from transformers import pipeline
import seaborn as sns
import matplotlib.pyplot as plt

from data_loader import load_dataset, format_input_with_context

[nltk_data] Downloading package punkt to /home2/nchw73/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Hugging Face hub id of the chat model used for prompting.
MODEL_NAME = 'meta-llama/Llama-3.1-8B-Instruct'


# Directory where evaluation artefacts are written; created up front so later saves find it.
RESULTS_DIR = "./results/prompting/"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Seed every RNG this notebook relies on: NumPy's global state (used by pandas
# sampling when no random_state is given) and PyTorch (used by generation
# whenever sampling is enabled).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# load data
train_df, dev_df, test_df = load_dataset()

USE_CTX = False # whether to use [CTX] parts of threads (src --> CTX tweets --> parent --> target)
# Upper-case label vocabulary the model is asked to answer with.
VALID_STANCES = ['SUPPORT', 'DENY', 'QUERY', 'COMMENT']

Loading cached data from saved_data/datasets.pkl...


In [None]:
# prompts
# The system prompt is assembled from the five pieces below; the user turn is
# rendered from USER_PROMPT_TEMPLATE.

# Role statement that opens the system prompt.
PERSONA = """You are an expert in rumour stance analysis on twitter."""
# One-sentence statement of the classification task.
INSTRUCTION = """Your task is to classify the stance of the TARGET tweet towards the veracity of the rumour in the SOURCE tweet."""
# Describes the segment markers that appear in the user turn.
# NOTE(review): the backslash after the opening quotes only suppresses the first
# newline. The body lines are indented inside the literal, so their four leading
# spaces (and the whitespace-only last line) are part of the string and are sent
# to the model. The same applies to LABEL_DEFNS and OUTPUT_FORMAT - confirm this
# is intended.
INPUT_FORMAT = """\
    The input will be provided as a single string containing labeled segments:
    "[SRC] ... [PARENT] ... [TARGET] ..." (Note: [PARENT] may be omitted if not applicable).
    """
# Definitions of the four stance labels.
LABEL_DEFNS = """\
    Classification Labels:
    - SUPPORT: The reply supports the veracity of the source claim
    - DENY: The reply denies the veracity of the source claim
    - QUERY: The reply asks for additional evidence in relation to the veracity of the source claim
    - COMMENT: The reply makes their own comment without a clear contribution to assessing the veracity of the source claim
    """
# Constrains the answer to a single label word so it can be parsed by regex.
OUTPUT_FORMAT = """\
    Respond with ONLY one word: SUPPORT, DENY, QUERY, or COMMENT
    """

# Full system prompt: the five pieces above, one per line, in this order.
SYS_PROMPT = f"""\
{PERSONA}
{INSTRUCTION}
{INPUT_FORMAT}
{LABEL_DEFNS}
{OUTPUT_FORMAT}
"""

# User-turn template; {thread_context} is filled in by build_user_prompt.
USER_PROMPT_TEMPLATE = """\
Text: {thread_context}

Task: Classify the stance of [TARGET] towards the veracity of the rumour in [SRC].
"""

def build_user_prompt(thread_context):
    """Render the user-turn text for a single thread.

    Args:
        thread_context: Formatted thread string to classify.

    Returns:
        ``USER_PROMPT_TEMPLATE`` with its ``{thread_context}`` placeholder
        replaced by ``thread_context``.
    """
    rendered_prompt = USER_PROMPT_TEMPLATE.format(thread_context=thread_context)
    return rendered_prompt


def build_zero_shot_messages(thread_context):
    """Build the two-turn chat (system, user) for zero-shot classification.

    Args:
        thread_context: Formatted thread string to classify.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system prompt followed
        by the user prompt rendered for ``thread_context``.
    """
    system_turn = {"role": "system", "content": SYS_PROMPT}
    user_turn = {"role": "user", "content": build_user_prompt(thread_context)}
    return [system_turn, user_turn]


def build_few_shot_messages(thread_context, examples=None):
    """Build a chat with optional in-context demonstrations before the query.

    Args:
        thread_context: Formatted thread string to classify.
        examples: Optional iterable of dicts with a ``'source'`` entry (the
            demonstration input) and a ``'label'`` entry (the demonstration
            answer). A falsy value produces no demonstrations.

    Returns:
        A list of ``{"role", "content"}`` dicts: the system prompt, one
        user/assistant pair per example, then the user prompt for
        ``thread_context``.
    """
    chat = [{"role": "system", "content": SYS_PROMPT}]

    # Each demonstration becomes a user question followed by the assistant's answer.
    for shot in examples or ():
        chat.extend((
            {"role": "user", "content": build_user_prompt(shot['source'])},
            {"role": "assistant", "content": shot['label']},
        ))

    chat.append({"role": "user", "content": build_user_prompt(thread_context)})
    return chat


In [None]:

def parse_stance_response(text):
    """Extract the stance label from a raw model response.

    The response is upper-cased and searched for any entry of ``VALID_STANCES``
    as a whole word. When several labels are present, the one that appears
    first in the response wins, not the one listed first in ``VALID_STANCES``.
    For example, "COMMENT - does not SUPPORT ..." parses as COMMENT.

    Args:
        text: Raw response string from the model.

    Returns:
        The matched label exactly as spelled in the upper-cased response, or
        ``None`` when ``text`` is not a string or contains no valid label.
    """
    if not isinstance(text, str):
        return None

    # A single alternation lets re.search return the leftmost label in the text.
    alternatives = '|'.join(re.escape(label) for label in VALID_STANCES)
    if not alternatives:
        return None

    match = re.search(rf'\b(?:{alternatives})\b', text.upper())
    return match.group(0) if match else None

In [None]:
# LLM

def generate_response(pipe, messages, max_new_tokens=10):
    """Run one chat through the pipeline and return the reply text.

    Args:
        pipe: Callable pipeline exposing ``tokenizer.eos_token_id``.
        messages: Chat messages passed to ``pipe`` unchanged.
        max_new_tokens: Cap on generated tokens, keeping the reply short.

    Returns:
        The stripped ``"content"`` of the last entry of ``"generated_text"``
        in the first result returned by ``pipe``.
    """
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,  # deterministic output
        "pad_token_id": pipe.tokenizer.eos_token_id,  # prevent warnings
    }
    first_result = pipe(messages, **generation_kwargs)[0]
    last_turn = first_result["generated_text"][-1]
    return last_turn["content"].strip()


# def classify_instance(pipe, input_text, mode="zero-shot", examples=None):
#     if mode == "zero-shot":
#         messages = build_zero_shot_messages(input_text)
#     else:
#         messages = build_few_shot_messages(input_text, examples)
    
#     response = generate_response(pipe, messages)
#     label = parse_stance_response(response)
#     return label, response


# --
def plot_confusion_matrix(cm, mode, save_path=None):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Confusion matrix of integer counts (annotations use the ``'d'``
            format); both axes are ticked with the four stance names in the
            order support, deny, query, comment.
        mode: Text inserted into the figure title.
        save_path: When truthy, the figure is also written to this path and
            the path is printed.
    """
    stance_names = ['support', 'deny', 'query', 'comment']
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=stance_names,
        yticklabels=stance_names,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, **heatmap_options)
    plt.title(f'Confusion Matrix ({mode})')
    plt.ylabel('True')
    plt.xlabel('Predicted')

    # Save before showing so the file is written from the still-open figure.
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"Saved: {save_path}")

    plt.show()


def save_results(results, model_key, mode):
    """Save evaluation results to JSON.

    The file is written to ``f"{RESULTS_DIR}{model_key}_{mode}_results.json"``.
    When ``model_key`` contains a path separator (for example a hub id such as
    ``org/model``), that path points into a sub-directory; it is created on
    demand so the write does not fail with ``FileNotFoundError``.

    Args:
        results: Mapping with the entries ``'macro_f1'``, ``'per_class_f1'``,
            ``'predictions'``, ``'true_labels'`` and ``'raw_responses'``.
        model_key: Identifier stored under ``'model'`` and used in the file name.
        mode: Prompting mode stored under ``'mode'`` and used in the file name.

    Raises:
        KeyError: If ``results`` lacks one of the entries listed above.
    """
    output = {
        'model': model_key,
        'mode': mode,
        'macro_f1': results['macro_f1'],
        'per_class_f1': results['per_class_f1'],
        'predictions': results['predictions'],
        'true_labels': results['true_labels'],
        'raw_responses': results['raw_responses']
    }

    filename = f"{RESULTS_DIR}{model_key}_{mode}_results.json"

    # A '/' in model_key turns part of the name into a directory component.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w') as f:
        json.dump(output, f, indent=2)
    print(f"Saved: {filename}")

In [6]:
# few shot examples
def _first_row_with_label(frame, stance):
    """Return the first row of ``frame`` whose ``label_text`` equals ``stance``."""
    matches = frame.loc[frame['label_text'] == stance]
    return matches.iloc[0]


# First training row of each stance class, kept for manual inspection.
support_example = _first_row_with_label(train_df, 'support')
deny_example = _first_row_with_label(train_df, 'deny')
query_example = _first_row_with_label(train_df, 'query')
comment_example = _first_row_with_label(train_df, 'comment')


def get_few_shot_examples(df, n_per_class=1, random_state=None):
    """Select stratified random examples for few-shot prompting.

    Up to ``n_per_class`` rows are drawn per stance, in the order support,
    deny, query, comment. Each selected row is formatted with
    ``format_input_with_context`` against ``df``.

    Args:
        df: DataFrame with a ``label_text`` column holding lower-case stances.
        n_per_class: Maximum number of rows to draw per stance; classes with
            fewer rows contribute all of them.
        random_state: Seed or RNG forwarded to ``DataFrame.sample``. ``None``
            (the default) falls back to the notebook-wide ``RANDOM_SEED`` so
            repeated calls return the same demonstrations. Pass an explicit
            seed or RNG object for a different draw.

    Returns:
        A list of ``{'source': <formatted input>, 'label': <UPPER-CASE stance>}``
        dicts.
    """
    if random_state is None:
        random_state = RANDOM_SEED

    examples = []

    for label in ['support', 'deny', 'query', 'comment']:
        class_df = df[df['label_text'] == label]
        # Cap n at the class size so small classes do not make sample() raise.
        samples = class_df.sample(n=min(n_per_class, len(class_df)), random_state=random_state)

        for _, row in samples.iterrows():
            examples.append({
                'source': format_input_with_context(row, df, use_features=False, use_context=USE_CTX),
                'label': label.upper()
            })

    return examples

In [None]:
# load model
# Build the chat text-generation pipeline once; later cells reuse `pipe`.
pipe = pipeline(
    task="text-generation",
    model=MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the cpu.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def evaluate_prompting(pipe, df, mode="zero-shot", examples=None, batch_size=1, verbose=True):
    """Classify every row of ``df`` by prompting and score against ``label_text``.

    Args:
        pipe: Chat text-generation pipeline exposing ``tokenizer.eos_token_id``.
        df: DataFrame to evaluate; ``label_text`` holds the lower-case gold
            stance. It is also passed to ``format_input_with_context`` as the
            lookup frame.
        mode: ``"zero-shot"`` builds zero-shot chats; any other value builds
            few-shot chats from ``examples``.
        examples: Demonstrations forwarded to ``build_few_shot_messages``.
        batch_size: Batch size forwarded to ``pipe``.
        verbose: When true, print macro/per-class F1 and the classification report.

    Returns:
        A dict with ``'predictions'`` and ``'true_labels'`` (lower-case stance
        strings), ``'raw_responses'``, ``'macro_f1'`` (float), ``'per_class_f1'``
        (stance -> float) and ``'confusion_matrix'`` (rows = true, columns =
        predicted, ordered support, deny, query, comment).
    """
    labels = ['support', 'deny', 'query', 'comment']

    # Prepare all messages upfront
    all_messages = []
    for _, row in df.iterrows():
        input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
        if mode == "zero-shot":
            messages = build_zero_shot_messages(input_text)
        else:
            messages = build_few_shot_messages(input_text, examples)
        all_messages.append(messages)

    # The pipeline accepts a list of chats directly, so no Dataset wrapper is
    # needed. Decoding options mirror generate_response (greedy, explicit pad
    # token) so single-example checks and the full evaluation agree.
    outputs = pipe(
        all_messages,
        max_new_tokens=10,
        batch_size=batch_size,
        do_sample=False,
        pad_token_id=pipe.tokenizer.eos_token_id,
    )

    raw_responses = []
    for out in tqdm(outputs, total=len(df), desc=f"Evaluating ({mode})"):
        raw_responses.append(out[0]["generated_text"][-1]["content"].strip())

    # Parse responses
    parsed = [parse_stance_response(r) for r in raw_responses]

    # count number of None (errors), print and replace with comment
    num_errors = parsed.count(None)
    if num_errors > 0:
        print(f"Warning: {num_errors} errors in predictions. Replacing with 'comment'.")

    # The parser yields upper-case labels while the gold labels are lower-case;
    # normalise here, otherwise no prediction can ever match its gold label.
    predictions = [p.lower() if p is not None else 'comment' for p in parsed]

    true_labels = df['label_text'].tolist()

    # Metrics: pin the label set so the macro average always covers the same four classes.
    macro_f1 = float(f1_score(true_labels, predictions, average='macro', labels=labels))
    per_class_f1 = f1_score(true_labels, predictions, average=None, labels=labels)

    if verbose:
        print(f"\n{'='*60}\nResults ({mode})\n{'='*60}")
        print(f"Macro F1: {macro_f1:.4f}")
        print(f"\nPer-class F1:")
        for lbl, f1 in zip(labels, per_class_f1):
            print(f"  {lbl}: {f1:.4f}")
        print(f"\n{classification_report(true_labels, predictions, labels=labels)}")

    return {
        'predictions': predictions,
        'true_labels': true_labels,
        'raw_responses': raw_responses,
        'macro_f1': macro_f1,
        'per_class_f1': {lbl: float(score) for lbl, score in zip(labels, per_class_f1)},
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=labels)
    }

In [25]:
# zero-shot

# Single-example sanity check of the zero-shot prompt on one dev-set row.
row = dev_df.iloc[1]
df = dev_df  # frame handed to format_input_with_context alongside the row

input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
messages = build_zero_shot_messages(input_text)
print(messages)
print()

# pipeline does this
# prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# print(prompt)
# print()

response = generate_response(pipe, messages)
label = parse_stance_response(response)
# Note: label_text is lower-case while the parsed label comes from VALID_STANCES
# (upper-case), so the two printed values differ in case even when they agree.
print(f"Ground truth: {row['label_text']}")
print(f"Predicted:    {label}")
print(f"Response:     {response}")
print()

# Full dev-set evaluation, disabled while iterating on the prompt.
# zero_results = evaluate_prompting(pipe, dev_df, mode="zero-shot")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

KeyboardInterrupt: 

In [29]:
# few-shot

# Single-example sanity check of the few-shot prompt on one dev-set row.
row = dev_df.iloc[102]
df = dev_df  # frame handed to format_input_with_context alongside the row

# Demonstrations drawn from the training split, one per stance.
few_shot_examples = get_few_shot_examples(train_df, n_per_class=1)
# Pass use_context explicitly so the query is formatted with the same USE_CTX
# setting as the demonstrations and the zero-shot cell, instead of relying on
# the helper's default.
input_text = format_input_with_context(row, df, use_features=False, use_context=USE_CTX)
messages = build_few_shot_messages(input_text, few_shot_examples)


# Render the chat template by hand to inspect exactly what the model receives.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
print()

response = generate_response(pipe, messages)
label = parse_stance_response(response)
print(f"Ground truth: {row['label_text']}")
# print(f"Predicted:    {label}")
print(f"Response:     {response}")
print()

# Full evaluation and zero-/few-shot comparison, disabled while iterating on the prompt.
# few_results = evaluate_prompting(pipe, df, mode="few-shot", examples=few_shot_examples)


# print(f"\n{'='*60}\nSummary\n{'='*60}")
# print(f"Zero-Shot Macro F1: {zero_results['macro_f1']:.4f}")
# print(f"Few-Shot Macro F1:  {few_results['macro_f1']:.4f}")
# print(f"Improvement: {(few_results['macro_f1'] - zero_results['macro_f1'])*100:.2f}%")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an expert in rumour stance analysis on social media.

Your task is to classify the stance of the TARGET tweet towards the veracity of the rumour in the SOURCE tweet.

**Input Structure:**
The input will be provided as a single string containing labeled segments:
"[SRC] ... [PARENT] ... [TARGET] ..." (Note: [PARENT] may be omitted if not applicable).

**Classification Labels:**
- **SUPPORT**: The reply supports the veracity of the source claim
- **DENY**: The reply denies the veracity of the source claim
- **QUERY**: The reply asks for additional evidence in relation to the veracity of the source claim
- **COMMENT**: The reply makes their own comment without a clear contribution to assessing the veracity of the source claim

**Output Format:**
Respond with ONLY one word: SUPPORT, DENY, QUERY, or COMMENT<|eot_id|><|start_header_id|>user<|end_header_id|>

**T

In [30]:
# Show the raw text of the row currently bound to `row`. That name is assigned
# by the single-example cells above, so the value depends on which one ran last.
row['text']

"@marcepa49 the flight profile. That doesn't indicate that the autopilot was set to 100 feet. I've worked on these systems, tested them"