<h3>Load Subtask 1 Annotations and Verify a Raw Document</h3>

In [58]:
import os
import pandas as pd

# Define paths to English data (adjust paths if needed)
en_annotations_path = "EN/annotations/subtask-1-annotations.txt"
en_raw_docs_path = "EN/raw-documents/"

# Custom parser function to handle multiple fine-grained roles
def parse_annotations(line):
    parts = line.strip().split("\t")  # Split by tab
    article_id, entity_mention, start_offset, end_offset, main_role = parts[:5]
    fine_grained_roles = parts[5:]  # Capture remaining parts as fine-grained roles
    return {
        "article_id": article_id,
        "entity_mention": entity_mention,
        "start_offset": int(start_offset),
        "end_offset": int(end_offset),
        "main_role": main_role,
        "fine_grained_roles": fine_grained_roles,
    }

# Read the annotation file line-by-line
with open(en_annotations_path, "r", encoding="utf-8") as f:
    data = [parse_annotations(line) for line in f]

# Convert to DataFrame
annotations = pd.DataFrame(data)

# Display the first few rows to verify
print("Sample Annotations:\n", annotations.head())

# Pick the first article's ID from annotations to verify
sample_article_id = annotations.iloc[0]["article_id"]
article_file_path = os.path.join(en_raw_docs_path, f"{sample_article_id}")

# Load and print a snippet of the raw document
with open(article_file_path, "r", encoding="utf-8") as f:
    article_content = f.read()

print(f"\nSample Article Content ({sample_article_id}):\n", article_content[:500])


Sample Annotations:
          article_id entity_mention  start_offset  end_offset    main_role  \
0  EN_UA_103861.txt        Chinese           791         797   Antagonist   
1  EN_UA_103861.txt          China          1516        1520   Antagonist   
2  EN_UA_103861.txt          Hamas          2121        2125   Antagonist   
3  EN_UA_103861.txt   Donald Trump          4909        4920  Protagonist   
4  EN_UA_021270.txt         Yermak           667         672   Antagonist   

       fine_grained_roles  
0                   [Spy]  
1            [Instigator]  
2             [Terrorist]  
3  [Peacemaker, Guardian]  
4           [Incompetent]  

Sample Article Content (EN_UA_103861.txt):
 The World Needs Peacemaker Trump Again 

 by Jeff Crouere, The Liberty Daily:

The world is in total chaos after 39 months of the Biden presidency. The southern border of our country is porous and millions of individuals from around the world have descended on our country.

These “undocumented migrants

<h3>Tokenization</h3>

In [59]:
from transformers import XLMRobertaTokenizer
import torch
import os
import pandas as pd

# Define paths to English data (adjust paths if needed)
en_annotations_path = "EN/annotations/subtask-1-annotations.txt"
en_raw_docs_path = "EN/raw-documents/"

# Custom parser function to handle multiple fine-grained roles
def parse_annotations(line):
    parts = line.strip().split("\t")  # Split by tab
    article_id, entity_mention, start_offset, end_offset, main_role = parts[:5]
    fine_grained_roles = parts[5:]  # Capture remaining parts as fine-grained roles
    return {
        "article_id": article_id,
        "entity_mention": entity_mention,
        "start_offset": int(start_offset),
        "end_offset": int(end_offset),
        "main_role": main_role,
        "fine_grained_roles": fine_grained_roles,
    }

# Read the annotation file line-by-line
with open(en_annotations_path, "r", encoding="utf-8") as f:
    data = [parse_annotations(line) for line in f]

# Convert to DataFrame
annotations = pd.DataFrame(data)

# Display the first few rows to verify
print("Sample Annotations:\n", annotations.head())

# Pick the first article's ID from annotations to verify
sample_article_id = annotations.iloc[0]["article_id"]
article_file_path = os.path.join(en_raw_docs_path, f"{sample_article_id}")

# Load and print a snippet of the raw document
with open(article_file_path, "r", encoding="utf-8") as f:
    article_content = f.read()

print(f"\nSample Article Content ({sample_article_id}):\n", article_content[:500])


# Initialize the XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Function to extract text from articles based on offsets
def extract_text(article_id, start_offset, end_offset):
    with open(f"EN/raw-documents/{article_id}", "r", encoding="utf-8") as f:
        content = f.read()
    return content[start_offset:end_offset]

# Add extracted text to the DataFrame
annotations["extracted_text"] = annotations.apply(
    lambda row: extract_text(row["article_id"], row["start_offset"], row["end_offset"]), axis=1
)

# Tokenize the extracted text
encodings = tokenizer(
    list(annotations["extracted_text"]),
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Encode labels for main roles (e.g., Protagonist, Antagonist) into numerical values
main_role_encoder = {role: i for i, role in enumerate(annotations["main_role"].unique())}
annotations["main_role_encoded"] = annotations["main_role"].map(main_role_encoder)

# Encode fine-grained roles into multi-label vectors
fine_grained_roles = list(set([role for roles in annotations["fine_grained_roles"] for role in roles]))
fine_role_encoder = {role: i for i, role in enumerate(fine_grained_roles)}

def encode_fine_grained_roles(roles):
    encoding = [0] * len(fine_role_encoder)
    for role in roles:
        encoding[fine_role_encoder[role]] = 1
    return encoding

annotations["fine_grained_encoded"] = annotations["fine_grained_roles"].apply(encode_fine_grained_roles)

# Convert encoded labels into tensors
main_labels_tensor = torch.tensor(list(annotations["main_role_encoded"]), dtype=torch.long)
fine_labels_tensor = torch.tensor(list(annotations["fine_grained_encoded"]), dtype=torch.float32)

print("Sample Encodings:", encodings["input_ids"][:2])
print("Sample Main Labels:", main_labels_tensor[:2])
print("Sample Fine-Grained Labels:", fine_labels_tensor[:2])


Sample Annotations:
          article_id entity_mention  start_offset  end_offset    main_role  \
0  EN_UA_103861.txt        Chinese           791         797   Antagonist   
1  EN_UA_103861.txt          China          1516        1520   Antagonist   
2  EN_UA_103861.txt          Hamas          2121        2125   Antagonist   
3  EN_UA_103861.txt   Donald Trump          4909        4920  Protagonist   
4  EN_UA_021270.txt         Yermak           667         672   Antagonist   

       fine_grained_roles  
0                   [Spy]  
1            [Instigator]  
2             [Terrorist]  
3  [Peacemaker, Guardian]  
4           [Incompetent]  

Sample Article Content (EN_UA_103861.txt):
 The World Needs Peacemaker Trump Again 

 by Jeff Crouere, The Liberty Daily:

The world is in total chaos after 39 months of the Biden presidency. The southern border of our country is porous and millions of individuals from around the world have descended on our country.

These “undocumented migrants

<h3>Train/Validation Split and DataLoader Setup</h3>

In [60]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Split the data into 80% train and 20% validation
train_indices, val_indices = train_test_split(
    range(len(annotations)),
    test_size=0.2,
    random_state=42
)

# Prepare train and validation tensors
train_encodings = {key: val[train_indices] for key, val in encodings.items()}
val_encodings = {key: val[val_indices] for key, val in encodings.items()}

train_main_labels = main_labels_tensor[train_indices]
train_fine_labels = fine_labels_tensor[train_indices]

val_main_labels = main_labels_tensor[val_indices]
val_fine_labels = fine_labels_tensor[val_indices]

# Create TensorDataset and DataLoader objects
train_dataset = TensorDataset(
    train_encodings["input_ids"], train_encodings["attention_mask"],
    train_main_labels, train_fine_labels
)
val_dataset = TensorDataset(
    val_encodings["input_ids"], val_encodings["attention_mask"],
    val_main_labels, val_fine_labels
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

print(f"Number of training samples: {len(train_loader.dataset)}")
print(f"Number of validation samples: {len(val_loader.dataset)}")


Number of training samples: 331
Number of validation samples: 83


<h3>Set Up the Model and Trainer</h3>

In [62]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Assuming you've already run preprocessing and tokenization

# Step 1: Initialize the Dataset Class
class EntityFramingDataset(Dataset):
    def __init__(self, encodings, main_labels, fine_labels):
        self.encodings = encodings
        self.main_labels = main_labels
        self.fine_labels = fine_labels

    def __len__(self):
        return len(self.main_labels)

    def __getitem__(self, idx):
        # Combine main role and fine-grained roles into one label tensor
        labels = torch.cat([self.main_labels[idx].unsqueeze(0), self.fine_labels[idx]], dim=0)
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = labels
        return item

# Step 2: Prepare the Train and Validation Data
train_dataset = EntityFramingDataset(train_encodings, train_main_labels, train_fine_labels)
val_dataset = EntityFramingDataset(val_encodings, val_main_labels, val_fine_labels)

# Step 3: Verify the Dataset Shapes (Optional Debugging Step)
sample = train_dataset[0]
print("Sample Input IDs:", sample["input_ids"].shape)
print("Sample Labels:", sample["labels"], sample["labels"].shape)

# Step 4: Initialize the Model
# Adjust the number of labels to match the total number of roles
num_main_labels = 1  # Assuming a single main role prediction
num_fine_labels = len(fine_role_encoder)  # Number of fine-grained roles
total_labels = num_main_labels + num_fine_labels  # Total number of labels

model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=total_labels,  # Total labels for the classification task
    problem_type="multi_label_classification"
)

def exact_match_ratio(y_true, y_pred):
    """
    Calculate the exact match ratio for multi-label predictions.
    Handles both 1D and 2D cases gracefully.
    """
    # Convert tensors to NumPy arrays if needed
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.cpu().numpy()
    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.cpu().numpy()

    # For single-label case
    if y_true.ndim == 1:
        return np.mean(y_true == y_pred)

    # For multi-label case (2D)
    elif y_true.ndim == 2:
        return np.mean(np.all(y_true == y_pred, axis=1))

    else:
        raise ValueError("Unexpected dimensions in labels.")


# Step 5: Define the Metric Function for Evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Ensure logits and labels are tensors
    logits_tensor = torch.tensor(logits)
    labels_tensor = torch.tensor(labels)

    # Convert logits to binary predictions
    main_pred = (logits_tensor[:, 0] > 0).int()
    fine_pred = (torch.sigmoid(logits_tensor[:, 1:]) > 0.5).int()

    # Extract labels
    main_labels = labels_tensor[:, 0].int()
    fine_labels = (labels_tensor[:, 1:] > 0).int()  # Ensure binary format

    # Calculate metrics
    main_acc = accuracy_score(main_labels.cpu(), main_pred.cpu())
    fine_f1 = f1_score(fine_labels.cpu(), fine_pred.cpu(), average="micro", zero_division=1)
    exact_match_main = exact_match_ratio(main_labels.cpu(), main_pred.cpu())
    exact_match_fine = exact_match_ratio(fine_labels.cpu(), fine_pred.cpu())

    # Optional: Print detailed classification report for debugging
    from sklearn.metrics import classification_report
    print(classification_report(fine_labels.cpu(), fine_pred.cpu(), zero_division=1))

    return {
        "main_accuracy": main_acc,
        "fine_f1_score": fine_f1,
        "exact_match_main": exact_match_main,
        "exact_match_fine": exact_match_fine,
    }


sample = train_dataset[0]
print("Sample Input IDs:", sample["input_ids"].shape)
print("Sample Labels:", sample["labels"], sample["labels"].shape)


# Step 6: Set Up the Training Arguments
training_args = TrainingArguments(
    output_dir="./results",        # Directory to save checkpoints
    evaluation_strategy="epoch",   # Evaluate at the end of each epoch
    logging_dir="./logs",          # Directory for logs
    logging_steps=10,              # Log every 10 steps
    per_device_train_batch_size=10, # Batch size for training
    per_device_eval_batch_size=10,  # Batch size for evaluation
    num_train_epochs=2,            # Number of epochs
    save_strategy="epoch",         # Save model at the end of each epoch
    load_best_model_at_end=True,   # Load the best model after training
    metric_for_best_model="fine_f1_score",  # Select the best model based on F1 score
    greater_is_better=True,        # Higher F1 score is better
)

# Step 7: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Attach custom metric function
)

# Step 8: Start Training
trainer.train()

Sample Input IDs: torch.Size([14])
Sample Labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]) torch.Size([23])


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sample Input IDs: torch.Size([14])
Sample Labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]) torch.Size([23])


  0%|          | 0/68 [00:00<?, ?it/s]

{'loss': 0.6451, 'grad_norm': 10.242195129394531, 'learning_rate': 4.2647058823529415e-05, 'epoch': 0.29}
{'loss': 0.485, 'grad_norm': 2.5487565994262695, 'learning_rate': 3.529411764705883e-05, 'epoch': 0.59}
{'loss': 0.3567, 'grad_norm': 1.210931658744812, 'learning_rate': 2.7941176470588236e-05, 'epoch': 0.88}


  0%|          | 0/9 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         5
           1       1.00      0.00      0.00         5
           2       1.00      0.00      0.00         4
           3       1.00      0.00      0.00         2
           4       1.00      0.00      0.00         1
           5       1.00      0.00      0.00         1
           6       1.00      0.00      0.00         5
           7       1.00      1.00      1.00         0
           8       1.00      0.00      0.00        11
           9       1.00      1.00      1.00         0
          10       1.00      0.00      0.00         6
          11       1.00      0.00      0.00        10
          12       1.00      0.00      0.00         1
          13       1.00      0.00      0.00         1
          14       1.00      0.00      0.00         5
          15       1.00      0.00      0.00         6
          16       1.00      0.00      0.00         2
          17       1.00    

  0%|          | 0/9 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         5
           1       1.00      0.00      0.00         5
           2       1.00      0.00      0.00         4
           3       1.00      0.00      0.00         2
           4       1.00      0.00      0.00         1
           5       1.00      0.00      0.00         1
           6       1.00      0.00      0.00         5
           7       1.00      1.00      1.00         0
           8       1.00      0.00      0.00        11
           9       1.00      1.00      1.00         0
          10       1.00      0.00      0.00         6
          11       1.00      0.00      0.00        10
          12       1.00      0.00      0.00         1
          13       1.00      0.00      0.00         1
          14       1.00      0.00      0.00         5
          15       1.00      0.00      0.00         6
          16       1.00      0.00      0.00         2
          17       1.00    

TrainOutput(global_step=68, training_loss=0.3706405934165506, metrics={'train_runtime': 600.9716, 'train_samples_per_second': 1.102, 'train_steps_per_second': 0.113, 'total_flos': 4763619226632.0, 'train_loss': 0.3706405934165506, 'epoch': 2.0})

<h3>Evaluate</h3>

In [63]:
# Evaluate the model using the validation dataset
eval_results = trainer.evaluate()

# Print evaluation metrics
print("Evaluation Results:", eval_results)


  0%|          | 0/9 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         5
           1       1.00      0.00      0.00         5
           2       1.00      0.00      0.00         4
           3       1.00      0.00      0.00         2
           4       1.00      0.00      0.00         1
           5       1.00      0.00      0.00         1
           6       1.00      0.00      0.00         5
           7       1.00      1.00      1.00         0
           8       1.00      0.00      0.00        11
           9       1.00      1.00      1.00         0
          10       1.00      0.00      0.00         6
          11       1.00      0.00      0.00        10
          12       1.00      0.00      0.00         1
          13       1.00      0.00      0.00         1
          14       1.00      0.00      0.00         5
          15       1.00      0.00      0.00         6
          16       1.00      0.00      0.00         2
          17       1.00    

<h3>Save Model</h3>

In [64]:
# Save the model and tokenizer
model.save_pretrained("./entity-framing-model2")
tokenizer.save_pretrained("./entity-framing-model")
print("Model and tokenizer saved!")


Model and tokenizer saved!


<h3>Load Model (optional)</h3>

In [69]:
from transformers import XLMRobertaForSequenceClassification

# Load the saved model and tokenizer
loaded_model = XLMRobertaForSequenceClassification.from_pretrained("./entity-framing-model2")
loaded_tokenizer = XLMRobertaTokenizer.from_pretrained("./entity-framing-model")

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [70]:
# Re-run evaluation
eval_results = trainer.evaluate()
print("Updated Evaluation Results:", eval_results)

  0%|          | 0/9 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      0.00      0.00         5
           1       1.00      0.00      0.00         5
           2       1.00      0.00      0.00         4
           3       1.00      0.00      0.00         2
           4       1.00      0.00      0.00         1
           5       1.00      0.00      0.00         1
           6       1.00      0.00      0.00         5
           7       1.00      1.00      1.00         0
           8       1.00      0.00      0.00        11
           9       1.00      1.00      1.00         0
          10       1.00      0.00      0.00         6
          11       1.00      0.00      0.00        10
          12       1.00      0.00      0.00         1
          13       1.00      0.00      0.00         1
          14       1.00      0.00      0.00         5
          15       1.00      0.00      0.00         6
          16       1.00      0.00      0.00         2
          17       1.00    

<h3>Predictions on new data</h3>

In [50]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = loaded_model(**inputs)
    logits = outputs.logits.detach().numpy()

    # Convert logits to binary predictions
    predictions = (logits > 0).astype(int)

    return predictions

# Example usage
example_text = "Donald Trump visited China to promote peace."
predictions = predict(example_text)
print("Predictions:", predictions)


Predictions: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


<h3>report</h3>

In [51]:
from sklearn.metrics import classification_report

# Get predictions and true labels from the validation set
preds_output = trainer.predict(val_dataset)
preds = (preds_output.predictions > 0).astype(int)

# Create a classification report
print("Main Role Classification Report:\n", classification_report(val_main_labels, preds[:, 0]))
print("Fine-Grained Role Classification Report:\n", classification_report(val_fine_labels, preds[:, 1:]))

  0%|          | 0/11 [00:00<?, ?it/s]

Main Role Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.81        56
           1       0.00      0.00      0.00        16
           2       0.00      0.00      0.00        11

    accuracy                           0.67        83
   macro avg       0.22      0.33      0.27        83
weighted avg       0.46      0.67      0.54        83

Fine-Grained Role Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00        11
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
# Flatten the list of fine-grained roles and count occurrences
fine_role_counts = annotations["fine_grained_roles"].explode().value_counts()
print("Fine-Grained Role Distribution:\n", fine_role_counts)


Fine-Grained Role Distribution:
 fine_grained_roles
Instigator           49
Guardian             40
Conspirator          38
Incompetent          35
Foreign Adversary    35
Victim               33
Tyrant               29
Deceiver             26
Saboteur             20
Virtuous             19
Corrupt              17
Peacemaker           15
Terrorist            14
Underdog             12
Rebel                11
Martyr               11
Bigot                 9
Traitor               8
Scapegoat             8
Exploited             6
Spy                   3
Forgotten             1
Name: count, dtype: int64
