<h3>Google Colab Setup</h3>

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


<h3>Step 2: Install and Import Libraries</h3>

In [2]:
!pip install transformers datasets sklearn torch tqdm

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [3]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaModel
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast  # For mixed precision
print(f"Using device: {device}")

Using device: cuda


<h3>Load the XLM-R Model and Tokenizer</h3>

In [4]:
class XLMRWithEntityRoles(nn.Module):
    def __init__(self, pretrained_model_name='xlm-roberta-base', num_main_roles=3, num_fine_roles=22):
        super(XLMRWithEntityRoles, self).__init__()
        self.xlm_r = XLMRobertaModel.from_pretrained(pretrained_model_name)

        # Main role classification head (single-label classification)
        self.main_role_classifier = nn.Linear(self.xlm_r.config.hidden_size, num_main_roles)

        # Fine-grained role classification head (multi-label classification)
        self.fine_role_classifier = nn.Linear(self.xlm_r.config.hidden_size, num_fine_roles)

    def forward(self, input_ids, attention_mask):
        # Forward pass through XLM-R base
        outputs = self.xlm_r(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Use the pooled output

        # Predict main role
        main_role_logits = self.main_role_classifier(pooled_output)

        # Predict fine-grained roles
        fine_role_logits = self.fine_role_classifier(pooled_output)

        return main_role_logits, fine_role_logits

In [5]:
# Load XLM-R model and tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
# Instantiate the custom model
model = XLMRWithEntityRoles(
    pretrained_model_name='xlm-roberta-base',
    num_main_roles=3,  # Protagonist, Antagonist, Innocent
    num_fine_roles=22  # Total number of fine-grained roles from taxonomy
).to(device)  # Move to GPU
print(f"Using device: {device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Using device: cuda


<h3>Load One Article and Its Annotation</h3>

In [6]:
import os

# Function to load all articles from a given folder
def load_all_articles(lang_folder):
    """Load all articles from the language folder into a dictionary."""
    articles = {}
    for filename in os.listdir(lang_folder):
        if filename.endswith(".txt"):
            article_id = filename.split('.')[0]  # Extract article ID without .txt
            with open(os.path.join(lang_folder, filename), 'r', encoding='utf-8') as f:
                articles[article_id] = f.read()  # Store article content
    print(f"Loaded {len(articles)} articles.")
    return articles

# Function to load all annotations from the annotations file
def load_all_annotations(annotation_file):
    """Load all annotations into a dictionary grouped by article ID."""
    annotations = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            article_id = parts[0].split('.')[0]  # Extract article ID without .txt

            # Create a new list for the article's annotations if not already present
            if article_id not in annotations:
                annotations[article_id] = []

            # Add the annotation to the respective article's list
            annotations[article_id].append({
                'article_id': article_id,
                'entity': parts[1],
                'start': int(parts[2]),
                'end': int(parts[3]),
                'main_role': parts[4],
                'fine_roles': parts[5:]
            })
    print(f"Loaded annotations for {len(annotations)} articles.")
    return annotations

# Example usage
lang_folder = "EN/raw-documents"  # Folder containing English articles
annotation_file = "EN/annotations/subtask-1-annotations.txt"

# Load all articles and annotations
articles = load_all_articles(lang_folder)
annotations = load_all_annotations(annotation_file)

# Print summary of the loaded data
print(f"Total Articles Loaded: {len(articles)}")
for article_id, anns in annotations.items():
    print(f"Article ID: {article_id}, Annotations: {len(anns)}")
print(f"Using device: {device}")

Loaded 200 articles.
Loaded annotations for 106 articles.
Total Articles Loaded: 200
Article ID: EN_UA_103861, Annotations: 4
Article ID: EN_UA_021270, Annotations: 5
Article ID: EN_UA_103403, Annotations: 1
Article ID: EN_UA_027676, Annotations: 9
Article ID: EN_UA_017310, Annotations: 2
Article ID: EN_CC_100005, Annotations: 2
Article ID: EN_UA_008586, Annotations: 3
Article ID: EN_UA_014637, Annotations: 3
Article ID: EN_UA_019640, Annotations: 8
Article ID: EN_UA_022051, Annotations: 4
Article ID: EN_UA_004616, Annotations: 4
Article ID: EN_UA_025165, Annotations: 3
Article ID: EN_UA_024321, Annotations: 4
Article ID: EN_UA_000923, Annotations: 8
Article ID: EN_CC_100095, Annotations: 3
Article ID: EN_CC_100106, Annotations: 3
Article ID: EN_UA_014014, Annotations: 4
Article ID: EN_UA_026697, Annotations: 4
Article ID: EN_UA_101067, Annotations: 1
Article ID: EN_UA_002531, Annotations: 9
Article ID: EN_UA_103732, Annotations: 1
Article ID: EN_UA_002668, Annotations: 5
Article ID: E

<h3>Marking Entity Mentions and Tokenization</h3>

In [15]:
def get_entity_context(article, annotation, window_size=50):
    """Extract a window of text around a single entity."""
    start = max(int(annotation['start']) - window_size, 0)
    end = min(int(annotation['end']) + window_size, len(article))
    return article[start:end]

def mark_entity_in_context(context, annotation):
    """Wrap the entity in the context window with [ENTITY] and [/ENTITY] tags."""
    entity_start = context.find(annotation['entity'])

    if entity_start == -1:
        raise ValueError(f"Entity '{annotation['entity']}' not found in the provided context.")

    # Compute the end of the entity within the context
    entity_end = entity_start + len(annotation['entity'])

    # Mark the entity in the context
    marked_context = (
        context[:entity_start] +
        f"[ENTITY]{annotation['entity']}[/ENTITY]" +
        context[entity_end:]
    )

    return marked_context


# # Example usage
# marked_content = mark_entity_in_context(article, annotations)
# print("Marked Article Content:\n", marked_content)




# def extract_context_around_entity(article, start, end, window_size=150):
#     """Extract a snippet of context around the entity mention."""
#     context_start = max(0, start - window_size)
#     context_end = min(len(article), end + window_size)
#     return article[context_start:context_end]


# def mark_entity_in_context(entity_context, entity):
#     """Wrap the specific entity mention with [ENTITY] and [/ENTITY]."""
#     return entity_context.replace(entity, f"[ENTITY]{entity}[/ENTITY]")


# def tokenize_marked_context(marked_context):
#     """Tokenize the marked context using the XLM-R tokenizer."""
#     return tokenizer(
#         marked_context,
#         padding='max_length',
#         max_length=512,
#         truncation=True,
#         return_tensors="pt"
#     )


# Tokenize the marked article
def tokenize_marked_article(marked_article):
    """Tokenize the entire marked article."""
    tokenized_output = tokenizer(
        marked_article,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )
    return tokenized_output

# # Tokenize the marked article
# tokenized_article = tokenize_marked_article(marked_content)

# # Example output
# print("Tokenized Article:\n", tokenized_article)


# Example usage
# marked_article = mark_entities_in_article(article, annotations)
# print("Marked Article Content:\n", marked_article[:2500])  # Print first 500 chars

# # Tokenize the marked article
# tokenized_article = tokenize_article(marked_article)
# print("\nTokenized Article:\n", tokenized_article)
print(f"Using device: {device}")

Using device: cuda


In [16]:
# Role-to-label conversion
MAIN_ROLE_MAPPING = {"Protagonist": 0, "Antagonist": 1, "Innocent": 2}

FINE_GRAINED_ROLE_LIST = [
    "Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous",
    "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor",
    "Spy", "Saboteur", "Corrupt", "Incompetent", "Terrorist", "Deceiver",
    "Bigot", "Forgotten", "Exploited", "Victim", "Scapegoat"
]

ROLE_MAPPING = {
    "Protagonist": ["Guardian", "Virtuous", "Peacemaker", "Underdog", "Martyr"],
    "Antagonist": ["Conspirator", "Spy", "Terrorist", "Instigator", "Tyrant", "Saboteur"],
    "Innocent": ["Victim", "Forgotten", "Exploited", "Scapegoat"]
}

def role_to_label(annotation):
    """Convert main role and fine-grained roles to label tensors."""
    # Main role label mapping
    main_role_label = MAIN_ROLE_MAPPING[annotation['main_role']]

    # Filter fine-grained roles based on the main role
    valid_roles = ROLE_MAPPING.get(annotation['main_role'], [])
    fine_role_labels = [0] * len(FINE_GRAINED_ROLE_LIST)

    for role in annotation['fine_roles']:
        if role in valid_roles:
            fine_role_labels[FINE_GRAINED_ROLE_LIST.index(role)] = 1

    return main_role_label, fine_role_labels
print(f"Using device: {device}")

Using device: cuda


In [17]:
from torch.utils.data import Dataset, DataLoader, random_split

class EntityRoleDataset(Dataset):
    def __init__(self, articles, annotations, window_size=50):
        """
        Initialize the dataset with all articles and their respective annotations.
        :param articles: Dictionary of article_id -> article content
        :param annotations: Dictionary of article_id -> list of annotations
        :param window_size: Size of the context window around entities.
        """
        self.articles = articles
        self.annotations = annotations
        self.window_size = window_size

        # Flatten (article_id, annotation) pairs for easy indexing
        self.data = [
            (article_id, annotation)
            for article_id, anns in annotations.items()
            for annotation in anns
        ]

    def __len__(self):
        # Number of samples = number of annotations across all articles
        return len(self.data)

    def __getitem__(self, idx):
        # Get the (article_id, annotation) pair for the current index
        article_id, annotation = self.data[idx]

        # Get the article content
        article = self.articles[article_id]

        # Get the context window around the entity
        context = get_entity_context(article, annotation, self.window_size)

        # Mark the entity within the context
        marked_context = mark_entity_in_context(context, annotation)

        # Tokenize the marked context
        tokenized_context = tokenizer(
            marked_context,
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors='pt'
        )

        # Convert roles to labels
        main_label, fine_labels = role_to_label(annotation)

        return (
            tokenized_context['input_ids'].squeeze(0),
            tokenized_context['attention_mask'].squeeze(0),
            torch.tensor(main_label).long(),  # Tensor for main role
            torch.tensor(fine_labels).float()  # Tensor for fine-grained roles
        )
print(f"Using device: {device}")

Using device: cuda


In [44]:
# Create Dataset and DataLoader
dataset = EntityRoleDataset(articles, annotations, window_size=50)

# Split into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

print(f"Training examples: {len(train_dataset)}, Validation examples: {len(val_dataset)}")
print(f"Using device: {device}")

AttributeError: 'list' object has no attribute 'items'

In [45]:
import torch.optim as optim
from tqdm import tqdm  # For progress bar

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-5)  # Use AdamW for fine-tuning

# Loss functions with weighting
main_role_loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([2.0, 1.0, 3.0]).to(device))
fine_grained_loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.5]).to(device))

def train_model(model, train_loader, val_loader, epochs=3, log_interval=10):
    """Train the model and log progress."""
    model.train()  # Set model to training mode

    for epoch in range(epochs):
        total_loss = 0
        print(f"Epoch {epoch + 1}/{epochs}:")

        # Create a progress bar for the epoch
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}")

        for batch_idx, (input_ids, attention_mask, main_label, fine_labels) in progress_bar:
            # Move data to device (GPU or CPU)
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            main_label = main_label.to(device)
            fine_labels = fine_labels.to(device, dtype=torch.float)

            # Forward pass
            optimizer.zero_grad()
            main_role_logits, fine_role_logits = model(input_ids, attention_mask)

            # Compute losses
            main_loss = main_role_loss_fn(main_role_logits, main_label)
            fine_loss = fine_grained_loss_fn(fine_role_logits, fine_labels)
            loss = main_loss + 0.5 * fine_loss  # Optional: Weighted loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Log progress every few batches
            if batch_idx % log_interval == 0:
                progress_bar.set_postfix({"Batch Loss": loss.item()})

        print(f"Epoch {epoch + 1}/{epochs}, Avg Loss: {total_loss / len(train_loader):.4f}")

        # Perform validation at the end of each epoch
        validate_model(model, val_loader)

def validate_model(model, val_loader):
    """Validate the model."""
    model.eval()  # Set model to evaluation mode
    val_loss = 0

    with torch.no_grad():
        for input_ids, attention_mask, main_label, fine_labels in val_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            main_label = main_label.to(device)
            fine_labels = fine_labels.to(device, dtype=torch.float)

            # Forward pass
            main_role_logits, fine_role_logits = model(input_ids, attention_mask)

            # Compute validation losses
            main_loss = main_role_loss_fn(main_role_logits, main_label)
            fine_loss = fine_grained_loss_fn(fine_role_logits, fine_labels)
            val_loss += main_loss.item() + fine_loss.item()

    print(f"Validation Loss: {val_loss / len(val_loader):.4f}")
    model.train()  # Switch back to training mode

print(f"Using device: {device}")

Using device: cuda


In [46]:
# Start training the model
train_model(model, train_loader, val_loader, epochs=3)


Epoch 1/3:


Epoch 1: 100%|██████████| 166/166 [00:44<00:00,  3.69it/s, Batch Loss=1.77]


Epoch 1/3, Avg Loss: 1.1059
Validation Loss: 1.0308
Epoch 2/3:


Epoch 2: 100%|██████████| 166/166 [00:46<00:00,  3.60it/s, Batch Loss=1.62]


Epoch 2/3, Avg Loss: 1.0952
Validation Loss: 1.0569
Epoch 3/3:


Epoch 3: 100%|██████████| 166/166 [00:46<00:00,  3.58it/s, Batch Loss=1.79]


Epoch 3/3, Avg Loss: 1.1091
Validation Loss: 1.0876


<h3>Save model</h3>

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
# Path to save the model (in Colab or Google Drive)
MODEL_PATH = '/content/drive/MyDrive/xlm_model/saved_model.pth'  # Or '/content/saved_model.pth'

# Save the model's state_dict
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")


Model saved to /content/drive/MyDrive/xlm_model/saved_model.pth


In [33]:
# Save the entire model (not just state_dict)
torch.save(model, MODEL_PATH)

<h3>Load Model</h3>

In [None]:
# Load the entire model
model = torch.load(MODEL_PATH)
model.to(device)  # Ensure model is on the correct device


<h3>Predict</h3>

In [50]:
ROLE_MAPPING = {
    "Protagonist": ["Guardian", "Virtuous", "Peacemaker", "Underdog", "Martyr"],
    "Antagonist": ["Conspirator", "Spy", "Terrorist", "Instigator", "Tyrant", "Saboteur"],
    "Innocent": ["Victim", "Forgotten", "Exploited", "Scapegoat"]
}


def predict_for_multiple_articles(model, articles_annotations, window_size=50):
    """Predict roles for entities across multiple articles."""
    model.eval()  # Set model to evaluation mode
    all_predictions = []

    with torch.no_grad():
        for article_id, (article, annotations) in articles_annotations.items():
            article_predictions = []

            for annotation in annotations:
                # Extract context window around the entity
                context = get_entity_context(article, annotation, window_size)

                # Mark the entity in the context
                marked_context = mark_entity_in_context(context, annotation)

                # Tokenize the marked context
                tokenized_context = tokenizer(
                    marked_context,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors='pt'
                ).to(device)  # Send to the correct device

                # Forward pass to get logits
                main_role_logits, fine_role_logits = model(
                    tokenized_context['input_ids'],
                    tokenized_context['attention_mask']
                )

                # Predict main role using softmax
                main_role_probs = torch.softmax(main_role_logits, dim=1)
                main_role_pred = torch.argmax(main_role_probs, dim=1).item()
                main_role_str = decode_main_role(main_role_pred)

                # Predict fine-grained roles using sigmoid
                fine_role_probs = torch.sigmoid(fine_role_logits).squeeze(0)
                print(f"Fine-Grained Role Probabilities: {fine_role_probs.tolist()}")
                role_threshold = fine_role_probs.mean() + fine_role_probs.std()
                fine_role_pred = (fine_role_probs > role_threshold).nonzero(as_tuple=True)[0]


                # Filter fine-grained roles to match the main role
                valid_roles = ROLE_MAPPING.get(main_role_str, [])
                print(f"Valid Roles for {main_role_str}: {valid_roles}")
                print(f"Predicted Fine Roles: {[FINE_GRAINED_ROLE_LIST[i] for i in fine_role_pred]}")

                filtered_fine_roles = [
                    FINE_GRAINED_ROLE_LIST[i] for i in fine_role_pred
                    if FINE_GRAINED_ROLE_LIST[i] in valid_roles
                ]

                # Collect predictions for the entity
                article_predictions.append({
                    'entity': annotation['entity'],
                    'main_role': main_role_str,
                    'fine_roles': filtered_fine_roles
                })

            # Store predictions for this article
            all_predictions.append({
                'article_id': article_id,
                'predictions': article_predictions
            })

    return all_predictions


# Helper function to decode the main role ID back to a string
def decode_main_role(main_role_id):
    MAIN_ROLE_MAPPING = {0: "Protagonist", 1: "Antagonist", 2: "Innocent"}
    return MAIN_ROLE_MAPPING.get(main_role_id, "Unknown")


def exact_match_ratio_for_multiple_articles(all_predictions, articles_annotations):
    """Calculate the exact match ratio across multiple articles."""
    correct = 0
    total = 0

    # Iterate through predictions and annotations using indices
    for i in range(len(all_predictions)):
        article_pred = all_predictions[i]
        annotations = articles_annotations[i]  # Get annotations directly

        pred_entities = article_pred['predictions']

        # Ensure predictions and annotations have the same length
        min_len = min(len(pred_entities), len(annotations))

        for j in range(min_len):
            pred = pred_entities[j]
            ann = annotations[j]

            # Check if main role and fine-grained roles match exactly
            if (pred['main_role'] == ann['main_role'] and
                    set(pred['fine_roles']) == set(ann['fine_roles'])):
                correct += 1
            total += 1

    return correct / total if total > 0 else 0.0


In [51]:
import os

# Function to load an article given its ID and folder path
def load_article(article_id, lang_folder):
    article_path = os.path.join(lang_folder, f"{article_id}")
    with open(article_path, 'r', encoding='utf-8') as f:
        article = f.read()
    return article

# Function to load all annotations from the annotation file
def load_all_annotations(annotation_file):
    annotations_dict = {}

    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            article_id = parts[0]  # Article ID

            # Structure the annotation
            annotation = {
                'entity': parts[1],
                'start': int(parts[2]),
                'end': int(parts[3]),
                'main_role': parts[4],
                'fine_roles': parts[5:]
            }

            # Collect annotations for each article in a list
            if article_id not in annotations_dict:
                annotations_dict[article_id] = []
            annotations_dict[article_id].append(annotation)

    return annotations_dict

# Function to create the `articles_annotations` dictionary
def load_articles_annotations(lang_folder, annotation_file):
    articles_annotations = {}

    # Load all annotations first
    annotations_dict = load_all_annotations(annotation_file)

    # Loop through the articles based on available annotation IDs
    for article_id in annotations_dict.keys():
        article_text = load_article(article_id, lang_folder)
        annotations = annotations_dict[article_id]

        # Store the article text and its annotations as a tuple
        articles_annotations[article_id] = (article_text, annotations)

    return articles_annotations

# Example Usage
lang_folder = "EN/raw-documents"  # Folder containing English articles
annotation_file = "EN/annotations/subtask-1-annotations.txt"

# Load all articles and their annotations
articles_annotations = load_articles_annotations(lang_folder, annotation_file)

# Verify the structure
for article_id, (article_text, annotations) in articles_annotations.items():
    print(f"Article ID: {article_id}")
    print(f"Article Content (First 100 chars): {article_text[:100]}")
    print(f"Annotations: {annotations}\n")


Article ID: EN_UA_103861.txt
Article Content (First 100 chars): The World Needs Peacemaker Trump Again 

 by Jeff Crouere, The Liberty Daily:

The world is in total
Annotations: [{'entity': 'Chinese', 'start': 791, 'end': 797, 'main_role': 'Antagonist', 'fine_roles': ['Spy']}, {'entity': 'China', 'start': 1516, 'end': 1520, 'main_role': 'Antagonist', 'fine_roles': ['Instigator']}, {'entity': 'Hamas', 'start': 2121, 'end': 2125, 'main_role': 'Antagonist', 'fine_roles': ['Terrorist']}, {'entity': 'Donald Trump', 'start': 4909, 'end': 4920, 'main_role': 'Protagonist', 'fine_roles': ['Peacemaker', 'Guardian']}]

Article ID: EN_UA_021270.txt
Article Content (First 100 chars): Ukraine's Fate Will Be Decided In Coming Year, Top Zelensky Aide Admits 

 Ukraine's Fate Will Be De
Annotations: [{'entity': 'Yermak', 'start': 667, 'end': 672, 'main_role': 'Antagonist', 'fine_roles': ['Incompetent']}, {'entity': 'Zelensky', 'start': 846, 'end': 853, 'main_role': 'Antagonist', 'fine_roles': ['Incompe

In [52]:
predictions = predict_for_multiple_articles(model, articles_annotations, window_size=50)

# Assuming `all_annotations` is a list of annotations for all articles
# Pass the annotations correctly by extracting them from the values
annotations_list = [ann for _, ann in articles_annotations.values()]
emr = exact_match_ratio_for_multiple_articles(predictions, annotations_list)

print(f"Exact Match Ratio: {emr:.2f}")

# Print predictions
for article_pred in predictions:
    print(f"Article ID: {article_pred['article_id']}")
    for pred in article_pred['predictions']:
        entity = pred["entity"]
        main_role = pred["main_role"]
        fine_roles = pred["fine_roles"]

        print(f"Entity: {entity}")
        print(f"  Main Role: {main_role}")
        print(f"  Fine-Grained Roles: {', '.join(fine_roles)}\n")


Fine-Grained Role Probabilities: [0.183330237865448, 0.07235962897539139, 0.08065172284841537, 0.0251990407705307, 0.09104344248771667, 0.10315995663404465, 0.18548066914081573, 0.15528427064418793, 0.11860454082489014, 0.020167149603366852, 0.021038053557276726, 0.04313741251826286, 0.10087250173091888, 0.02429286576807499, 0.025104478001594543, 0.07179445773363113, 0.019222889095544815, 0.018534621223807335, 0.03038029558956623, 0.05343528091907501, 0.15256908535957336, 0.062432911247015]
Valid Roles for Antagonist: ['Conspirator', 'Spy', 'Terrorist', 'Instigator', 'Tyrant', 'Saboteur']
Predicted Fine Roles: ['Guardian', 'Instigator', 'Conspirator', 'Victim']
Fine-Grained Role Probabilities: [0.18315614759922028, 0.07215557992458344, 0.08047887682914734, 0.025095462799072266, 0.09083646535873413, 0.1029348224401474, 0.1852513998746872, 0.15512752532958984, 0.11837158352136612, 0.020083937793970108, 0.020959632471203804, 0.042984116822481155, 0.10067393630743027, 0.024203697219491005,

In [54]:
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score

# Function to load an article given its ID and folder path
def load_article(article_id, lang_folder):
    article_path = os.path.join(lang_folder, f"{article_id}")
    with open(article_path, 'r', encoding='utf-8') as f:
        article = f.read()
    return article

# Function to load all annotations from the annotation file
def load_all_annotations(annotation_file):
    annotations_dict = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            article_id = parts[0]
            annotation = {
                'entity': parts[1],
                'start': int(parts[2]),
                'end': int(parts[3]),
                'main_role': parts[4],
                'fine_roles': parts[5:]
            }
            if article_id not in annotations_dict:
                annotations_dict[article_id] = []
            annotations_dict[article_id].append(annotation)
    return annotations_dict

# Function to create the articles_annotations dictionary
def load_articles_annotations(lang_folder, annotation_file):
    articles_annotations = {}
    annotations_dict = load_all_annotations(annotation_file)
    for article_id in annotations_dict.keys():
        article_text = load_article(article_id, lang_folder)
        annotations = annotations_dict[article_id]
        articles_annotations[article_id] = (article_text, annotations)
    return articles_annotations

# Load articles and their annotations
lang_folder = "EN/raw-documents"
annotation_file = "EN/annotations/subtask-1-annotations.txt"
articles_annotations = load_articles_annotations(lang_folder, annotation_file)

# Example prediction function (replace with your actual model prediction logic)
predictions = predict_for_multiple_articles(model, articles_annotations, window_size=50)

# Extract ground truth annotations for evaluation
def extract_labels(annotations):
    entities_labels = []
    for annotation in annotations:
        entity = annotation['entity']
        fine_roles = annotation['fine_roles']
        entities_labels.append((entity, fine_roles))
    return entities_labels

# Flatten predictions and true labels for evaluation
true_labels = []
pred_labels = []

for article_pred in predictions:
    article_id = article_pred['article_id']
    predicted_entities = article_pred['predictions']
    true_annotations = articles_annotations[article_id][1]

    # Extract true and predicted roles for comparison
    true_entities = extract_labels(true_annotations)
    for pred in predicted_entities:
        entity = pred["entity"]
        fine_roles_pred = pred["fine_roles"]

        # Find corresponding true roles
        matching_true = next((t for t in true_entities if t[0] == entity), None)
        fine_roles_true = matching_true[1] if matching_true else []

        # Store the roles for comparison
        true_labels.append(fine_roles_true)
        pred_labels.append(fine_roles_pred)

# Convert roles into binary vectors for multi-label comparison
mlb = MultiLabelBinarizer()
true_binary = mlb.fit_transform(true_labels)
pred_binary = mlb.transform(pred_labels)

# Evaluation metrics function
def evaluate_metrics(true_binary, pred_binary):
    accuracy = accuracy_score(true_binary, pred_binary)
    precision = precision_score(true_binary, pred_binary, average='weighted', zero_division=0)
    recall = recall_score(true_binary, pred_binary, average='weighted', zero_division=0)
    f1 = f1_score(true_binary, pred_binary, average='weighted', zero_division=0)
    jaccard = jaccard_score(true_binary, pred_binary, average='samples', zero_division=0)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Jaccard Similarity: {jaccard:.4f}")

# Run the evaluation
evaluate_metrics(true_binary, pred_binary)


Fine-Grained Role Probabilities: [0.183330237865448, 0.07235962897539139, 0.08065172284841537, 0.0251990407705307, 0.09104344248771667, 0.10315995663404465, 0.18548066914081573, 0.15528427064418793, 0.11860454082489014, 0.020167149603366852, 0.021038053557276726, 0.04313741251826286, 0.10087250173091888, 0.02429286576807499, 0.025104478001594543, 0.07179445773363113, 0.019222889095544815, 0.018534621223807335, 0.03038029558956623, 0.05343528091907501, 0.15256908535957336, 0.062432911247015]
Valid Roles for Antagonist: ['Conspirator', 'Spy', 'Terrorist', 'Instigator', 'Tyrant', 'Saboteur']
Predicted Fine Roles: ['Guardian', 'Instigator', 'Conspirator', 'Victim']
Fine-Grained Role Probabilities: [0.18315614759922028, 0.07215557992458344, 0.08047887682914734, 0.025095462799072266, 0.09083646535873413, 0.1029348224401474, 0.1852513998746872, 0.15512752532958984, 0.11837158352136612, 0.020083937793970108, 0.020959632471203804, 0.042984116822481155, 0.10067393630743027, 0.024203697219491005,