In [1]:
pip install datasets



In [2]:
pip install torch



In [3]:
# importing the dataset directly from huggingface
from datasets import load_dataset, Dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ailsntua/QEvasion")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 3448
    })
    test: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 308
    })
})

In [5]:
print("\nTrain sample:\n", ds["train"][0])
print("\nColumns:\n", ds["train"].column_names)


Train sample:
 {'title': "The President's News Conference in Hanoi, Vietnam", 'date': 'September 10, 2023', 'president': 'Joseph R. Biden', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0', 'question_order': 1, 'interview_question': 'Q. Of the Biden administration. And accused the United States of containing China while pushing for diplomatic talks.How would you respond to that? And do you think President Xi is being sincere about getting the relationship back on track as he bans Apple in China?', 'interview_answer': "Well, look, first of all, theI am sincere about getting the relationship right. And one of the things that is going on now is, China is beginning to change some of the rules of the game, in terms of trade and other issues.And so one of the things we talked about, for example, is that they're now talking about making sure that no Chineseno one in the Chinese Government can use a Western cell phone. Those kinds of things.And

In [6]:
MODEL = "roberta-base"
MAX_LENGTH = 512

In [7]:
pip install sentencepiece protobuf



In [8]:
pip install transformers datasets scikit-learn accelerate



In [9]:
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

In [10]:
# pip install --upgrade transformers huggingface_hub accelerate sentencepiece

In [11]:
print("\nDataset Columns:", ds['train'].column_names)
print("Sample Entry:", ds['train'][0])


Dataset Columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']
Sample Entry: {'title': "The President's News Conference in Hanoi, Vietnam", 'date': 'September 10, 2023', 'president': 'Joseph R. Biden', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0', 'question_order': 1, 'interview_question': 'Q. Of the Biden administration. And accused the United States of containing China while pushing for diplomatic talks.How would you respond to that? And do you think President Xi is being sincere about getting the relationship back on track as he bans Apple in China?', 'interview_answer': "Well, look, first of all, theI am sincere about getting the relationship right. And one of the thing

In [12]:
LABEL_NORMALIZER = {
    # Clear Reply mappings
    'explicit': 'Explicit',
    'clear reply': 'Explicit',
    'clear': 'Explicit',

    # Ambivalent (Evasion) mappings
    'implicit': 'Implicit',
    'dodging': 'Dodging',
    'general': 'General',
    'deflection': 'Deflection',
    'partial': 'Partial/half-answer',
    'partial/half-answer': 'Partial/half-answer',

    # Clear Non-Reply mappings
    'declining': 'Declining to answer',
    'declining to answer': 'Declining to answer',
    'ignorance': 'Claims ignorance',
    'claims ignorance': 'Claims ignorance',
    'clarification': 'Clarification',

    # Fallback mappings
    'clear non-reply': 'Declining to answer',
    'ambivalent': 'Implicit', # Default ambivalent to Implicit if unspecified
    'ambiguous': 'Implicit'
}

In [13]:
raw_df = ds['train'].to_pandas()

expanded_rows = []

print(f"Processing {len(raw_df)} original rows...")

Processing 3448 original rows...


In [14]:
for _, row in raw_df.iterrows():
    q_text = row['interview_question']
    a_text = row['interview_answer']

    # Collect all potential labels for this row
    potential_labels = []

    # A. Check the 3 Annotators (Priority: High)
    # The FAQ states any annotator is valid, so we train on ALL of them[cite: 210].
    for col in ['annotator1', 'annotator2', 'annotator3']:
        val = row.get(col)
        if val and str(val).lower() not in ['nan', 'none', '']:
            potential_labels.append(str(val).strip())

    # B. If no annotators (Training set usually), use evasion_label
    if not potential_labels:
        val = row.get('evasion_label')
        if val and str(val).lower() not in ['nan', 'none', '']:
            potential_labels.append(str(val).strip())

    # C. Fallback: Use clarity_label if evasion is missing
    # (e.g., "Clear Reply" -> "Explicit")
    if not potential_labels:
        clarity = row.get('clarity_label')
        if clarity == 'Clear Reply':
            potential_labels.append('Explicit')
        elif clarity == 'Clear Non-Reply':
            potential_labels.append('Declining to answer') # Generic fallback

    # Add normalized rows to training data
    for raw_label in potential_labels:
        # Normalize (e.g., "Partial" -> "Partial/half-answer")
        clean_label = LABEL_NORMALIZER.get(raw_label, raw_label)

        # Only add if it's one of our valid target classes
        if clean_label in LABEL_NORMALIZER.values():
            expanded_rows.append({
                "text": q_text,
                "text_pair": a_text,
                "label_str": clean_label
            })

# Create final dataframe
df_train = pd.DataFrame(expanded_rows)
print(f"Final Training Size: {len(df_train)} (Exploded from {len(raw_df)})")
print(f"Classes Found: {df_train['label_str'].unique()}")

Final Training Size: 3448 (Exploded from 3448)
Classes Found: ['Explicit' 'General' 'Partial/half-answer' 'Dodging' 'Implicit'
 'Deflection' 'Declining to answer' 'Claims ignorance' 'Clarification']


In [15]:
unique_labels = sorted(df_train['label_str'].unique().tolist())
label2id = {l: i for i, l in enumerate(unique_labels)}
id2label = {i: l for i, l in enumerate(unique_labels)}

df_train['labels'] = df_train['label_str'].map(label2id)

# Compute Weights to handle imbalance (Explicit is ~30%, Clarification ~2%) [cite: 991]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train['labels']),
    y=df_train['labels']
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights_tensor = weights_tensor.to(device)

print(f"Class Weights applied: {weights_tensor}")

Class Weights applied: tensor([3.2194, 4.1643, 2.6421, 1.0055, 0.5427, 0.3642, 0.9925, 0.7851, 4.8495],
       device='cuda:0')


In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        examples["text_pair"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

hf_dataset = Dataset.from_pandas(df_train)
hf_dataset = hf_dataset.map(preprocess_function, batched=True)

# Prepare for PyTorch
cols = ['input_ids', 'attention_mask', 'labels']
if 'token_type_ids' in hf_dataset.column_names: cols.append('token_type_ids')
hf_dataset.set_format(type='torch', columns=cols)

# 90/10 Train/Validation Split
split_ds = hf_dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = split_ds['test']

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

In [17]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Weighted Cross Entropy to help rare classes
        loss_fct = nn.CrossEntropyLoss(weight=weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Metric is Macro F1 as per shared task guidelines [cite: 1068]
    return {"f1_macro": f1_score(labels, predictions, average='macro')}

In [18]:
pip install "numpy<2.0"



In [19]:
import torch.nn as nn

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir="./clarity_final_model",
    learning_rate=2e-5,              # Recommended by paper [cite: 1162]
    per_device_train_batch_size=8,   # Adjust to 4 if OOM
    per_device_eval_batch_size=8,
    num_train_epochs=5,              # 5 epochs for convergence
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("\nStarting Training...")
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(



Starting Training...


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,1.967547,0.185123
2,2.081900,1.792706,0.217615
3,1.790300,1.794834,0.301735
4,1.590900,1.814894,0.340802
5,1.590900,1.838561,0.319709


TrainOutput(global_step=1940, training_loss=1.7208361124254994, metrics={'train_runtime': 1624.9738, 'train_samples_per_second': 9.548, 'train_steps_per_second': 1.194, 'total_flos': 4082424588887040.0, 'train_loss': 1.7208361124254994, 'epoch': 5.0})

In [22]:
print("\n--- GENERATING PREDICTIONS ---")

# Run prediction loop on the Test Split
predictions_output = trainer.predict(test_dataset)
predicted_ids = np.argmax(predictions_output.predictions, axis=-1)

# Convert IDs back to String Labels (Task 2 Output)
predicted_labels = [id2label[pid] for pid in predicted_ids]

# Apply Hierarchy Mapping for Task 1 (Figure 3 Logic)
task1_clarity_preds = []
for label in predicted_labels:
    if label == 'Explicit':
        task1_clarity_preds.append('Clear Reply')
    elif label in ['Declining to answer', 'Claims ignorance', 'Clarification']:
        task1_clarity_preds.append('Clear Non-Reply')
    else:
        # Implicit, Dodging, General, Deflection, Partial -> Ambivalent
        task1_clarity_preds.append('Ambivalent Reply')


--- GENERATING PREDICTIONS ---


In [23]:
import pandas as pd
import numpy as np

# 1. GET THE DATA TO PREDICT ON
# We try to find the official 'test' split. If missing, we use 'validation' or our custom split.
target_split = None

if 'test' in ds:
    print("Found official Test split! Predicting on that...")
    target_split = ds['test']
elif 'validation' in ds:
    print("No Test split found. Predicting on Validation split...")
    target_split = ds['validation']
else:
    print("Using our custom split for demonstration...")
    target_split = test_dataset

# 2. RUN INFERENCE
print(f"Predicting on {len(target_split)} examples...")
predictions_output = trainer.predict(target_split)
predicted_ids = np.argmax(predictions_output.predictions, axis=-1)

# 3. CONVERT TO LABELS & MAP HIERARCHY
# We need two columns: Task 1 (Clarity) and Task 2 (Evasion)
task2_preds = [id2label[pid] for pid in predicted_ids]
task1_preds = []

for label in task2_preds:
    # THE HIERARCHY MAPPING (Figure 3 from Paper)
    if label == 'Explicit':
        task1_preds.append('Clear Reply')
    elif label in ['Declining to answer', 'Claims ignorance', 'Clarification']:
        task1_preds.append('Clear Non-Reply')
    else:
        # Implicit, Dodging, General, Deflection, Partial -> Ambivalent
        task1_preds.append('Ambivalent Reply')

# 4. SAVE SUBMISSION CSV
# We assume the test set has an 'index' or 'id' column. If not, we use row numbers.
ids = target_split['index'] if 'index' in target_split.column_names else range(len(target_split))

submission_df = pd.DataFrame({
    'index': ids,
    'question': target_split['interview_question'], # Helpful for debugging
    'task1_clarity': task1_preds,
    'task2_evasion': task2_preds
})

# Save to CSV
submission_filename = "clarity_submission.csv"
submission_df.to_csv(submission_filename, index=False)

print(f"\nSUCCESS! Submission file saved to: {submission_filename}")
print(submission_df[['task1_clarity', 'task2_evasion']].head(10))

Found official Test split! Predicting on that...
Predicting on 308 examples...


ValueError: No columns in the dataset match the model's forward method signature: (input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, label, label_ids, labels). The following columns have been ignored: [question, index, gpt3.5_prediction, question_order, president, annotator_id, interview_answer, evasion_label, affirmative_questions, url, multiple_questions, clarity_label, title, date, annotator3, inaudible, annotator1, annotator2, gpt3.5_summary, interview_question]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.