<h3>Load Annotations and Extract Entity Mentions</h3>

In [94]:
import pandas as pd

# Custom parser function to handle fine-grained roles
def parse_annotations(line):
    parts = line.strip().split("\t")  # Split the line by tabs
    article_id, entity_mention, start_offset, end_offset, main_role = parts[:5]
    fine_grained_roles = parts[5:]  # Capture remaining parts as fine-grained roles
    return {
        "article_id": article_id,
        "entity_mention": entity_mention,
        "start_offset": int(start_offset),
        "end_offset": int(end_offset),
        "main_role": main_role,
        "fine_grained_roles": fine_grained_roles
    }

# Read the file and parse it line-by-line
with open("subtask-1-annotations.txt", "r", encoding="utf-8") as f:
    data = [parse_annotations(line) for line in f]

# Convert to DataFrame
annotations = pd.DataFrame(data)

# Display the first few rows to verify
print(annotations.describe())


       start_offset   end_offset
count    414.000000   414.000000
mean    1343.722222  1352.758454
std      976.379227   976.200424
min       55.000000    64.000000
25%      489.750000   495.500000
50%     1221.000000  1230.000000
75%     2063.500000  2068.750000
max     4909.000000  4920.000000


<h3>Generate Label Encoders</h3>

In [95]:
from sklearn.preprocessing import MultiLabelBinarizer

# Create label encoders for main roles and fine-grained roles
main_role_encoder = MultiLabelBinarizer()
fine_grained_role_encoder = MultiLabelBinarizer()

# Fit the encoders on the roles found in the dataset
main_role_labels = annotations["main_role"].apply(lambda x: [x])  # Wrap each label in a list
fine_grained_labels = annotations["fine_grained_roles"]

main_role_encoded = main_role_encoder.fit_transform(main_role_labels)
fine_grained_encoded = fine_grained_role_encoder.fit_transform(fine_grained_labels)

# Convert encoded labels back to DataFrames for easy merging
main_role_df = pd.DataFrame(main_role_encoded, columns=main_role_encoder.classes_)
fine_grained_role_df = pd.DataFrame(fine_grained_encoded, columns=fine_grained_role_encoder.classes_)

# Merge the encoded labels back into the original DataFrame
annotations = pd.concat([annotations, main_role_df, fine_grained_role_df], axis=1)

# Display the updated DataFrame
print(annotations.head())


         article_id entity_mention  start_offset  end_offset    main_role  \
0  EN_UA_103861.txt        Chinese           791         797   Antagonist   
1  EN_UA_103861.txt          China          1516        1520   Antagonist   
2  EN_UA_103861.txt          Hamas          2121        2125   Antagonist   
3  EN_UA_103861.txt   Donald Trump          4909        4920  Protagonist   
4  EN_UA_021270.txt         Yermak           667         672   Antagonist   

       fine_grained_roles  Antagonist  Innocent  Protagonist  Bigot  ...  \
0                   [Spy]           1         0            0      0  ...   
1            [Instigator]           1         0            0      0  ...   
2             [Terrorist]           1         0            0      0  ...   
3  [Peacemaker, Guardian]           0         0            1      0  ...   
4           [Incompetent]           1         0            0      0  ...   

   Rebel  Saboteur  Scapegoat  Spy  Terrorist  Traitor  Tyrant  Underdog  \
0   

<h3>Extract Context Sentences</h3>

In [96]:
# Helper function to extract the sentence containing the entity
def extract_entity_context(article_path, start_offset, end_offset):
    with open(article_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Extract the surrounding sentence or context
    entity_span = text[start_offset:end_offset]
    sentence = text[max(0, start_offset - 50):end_offset + 50]  # Surrounding context
    return entity_span, sentence

# Extract context for all rows in the annotations DataFrame
context_data = []
for _, row in annotations.iterrows():
    article_path = f"raw-documents/{row['article_id']}"  # Adjust path as needed
    entity_span, context = extract_entity_context(article_path, row['start_offset'], row['end_offset'])
    context_data.append(context)

# Add the extracted context to the DataFrame
annotations["context"] = context_data

# Display the DataFrame with the context column
print(annotations.head())


         article_id entity_mention  start_offset  end_offset    main_role  \
0  EN_UA_103861.txt        Chinese           791         797   Antagonist   
1  EN_UA_103861.txt          China          1516        1520   Antagonist   
2  EN_UA_103861.txt          Hamas          2121        2125   Antagonist   
3  EN_UA_103861.txt   Donald Trump          4909        4920  Protagonist   
4  EN_UA_021270.txt         Yermak           667         672   Antagonist   

       fine_grained_roles  Antagonist  Innocent  Protagonist  Bigot  ...  \
0                   [Spy]           1         0            0      0  ...   
1            [Instigator]           1         0            0      0  ...   
2             [Terrorist]           1         0            0      0  ...   
3  [Peacemaker, Guardian]           0         0            1      0  ...   
4           [Incompetent]           1         0            0      0  ...   

   Saboteur  Scapegoat  Spy  Terrorist  Traitor  Tyrant  Underdog  Victim  \
0  

<h3>Tokenization</h3>

In [97]:
from transformers import XLMRobertaTokenizer

# Load XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the context sentences
def tokenize_function(examples):
    return tokenizer(examples["context"], padding="max_length", truncation=True)

# Prepare the dataset for tokenization
dataset = annotations[["context"]]
tokenized_dataset = dataset.apply(lambda x: tokenize_function(x), axis=1)

# Convert tokenized data to DataFrame for model input
tokenized_df = pd.DataFrame(list(tokenized_dataset))
print(tokenized_df.head())


                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                           input_ids  
0  [0, 22, 142, 237, 16450, 33770, 305, 4, 7739, ...  
1  [0, 46684, 47, 12983, 2363, 130367, 7, 23, 244...  
2  [0, 47, 82739, 5, 2161, 18374, 361, 4, 8254, 5...  
3  [0, 442, 83, 80234, 47, 37629, 2367, 509, 1230...  
4  [0, 764, 6, 187404, 4745, 31455, 9, 77007, 215...  


<h3>Prepare Dataset for Model</h3>

In [98]:
import torch
from torch.utils.data import Dataset

import torch
from torch.utils.data import Dataset

class EntityFramingDataset(Dataset):
    def __init__(self, encodings, main_labels, fine_labels):
        # Convert lists to tensors with appropriate dimensions
        self.encodings = encodings
        self.main_labels = torch.tensor(main_labels, dtype=torch.float32)
        self.fine_labels = torch.tensor(fine_labels, dtype=torch.float32)

    def __len__(self):
        return len(self.main_labels)

    def __getitem__(self, idx):
        # Prepare the input encoding and labels for the given index
        item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
        item["main_labels"] = self.main_labels[idx]
        item["fine_labels"] = self.fine_labels[idx]
        return item



# Prepare encodings
encodings = tokenizer(
    list(annotations["context"]),
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Prepare labels (ensure tensors are float)
main_labels = torch.tensor(main_role_encoded, dtype=torch.float32)
fine_labels = torch.tensor(fine_grained_encoded, dtype=torch.float32)

# Create the dataset
dataset = EntityFramingDataset(encodings, main_labels, fine_labels)



  self.main_labels = torch.tensor(main_labels, dtype=torch.float32)
  self.fine_labels = torch.tensor(fine_labels, dtype=torch.float32)


<h3>Set Up the Trainer and Model</h3>

In [99]:
from transformers import XLMRobertaForSequenceClassification, Trainer, TrainingArguments

# Load the XLM-R model for multi-label classification
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    cache_dir="./models",  # Store/download the model locally
    num_labels=len(main_role_encoder.classes_) + len(fine_grained_role_encoder.classes_),
    problem_type="multi_label_classification"
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<h3>Define Training Arguments</h3>

In [100]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",            # Directory to save checkpoints
    evaluation_strategy="epoch",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save the model at the end of each epoch
    logging_dir="./logs",              # Directory for logs
    logging_steps=10,                  # Log every 10 steps
    per_device_train_batch_size=8,     # Batch size for training
    per_device_eval_batch_size=8,      # Batch size for evaluation
    num_train_epochs=3,                # Number of epochs
    load_best_model_at_end=True,       # Load best model at the end
    metric_for_best_model="loss",      # Use loss to select the best model
    greater_is_better=False,           # Lower loss is better
)




In [102]:
from sklearn.model_selection import train_test_split

# Split dataset into 80% train and 20% validation
train_indices, val_indices = train_test_split(
    list(range(len(encodings['input_ids']))), test_size=0.2, random_state=42
)

# Create encodings for train and validation sets
train_encodings = {key: val[train_indices] for key, val in encodings.items()}
val_encodings = {key: val[val_indices] for key, val in encodings.items()}

# Create labels for train and validation sets
train_main_labels = [main_labels[i] for i in train_indices]
val_main_labels = [main_labels[i] for i in val_indices]

train_fine_labels = [fine_labels[i] for i in train_indices]
val_fine_labels = [fine_labels[i] for i in val_indices]

# Convert labels to tensors
# Ensure the labels are lists of numerical values, not nested lists
train_main_labels_tensor = torch.tensor([label for label in train_main_labels], dtype=torch.float32)
train_fine_labels_tensor = torch.tensor([label for label in train_fine_labels], dtype=torch.float32)

val_main_labels_tensor = torch.tensor([label for label in val_main_labels], dtype=torch.float32)
val_fine_labels_tensor = torch.tensor([label for label in val_fine_labels], dtype=torch.float32)


# Create datasets using the custom EntityFramingDataset class
train_dataset = EntityFramingDataset(train_encodings, train_main_labels_tensor, train_fine_labels_tensor)
eval_dataset = EntityFramingDataset(val_encodings, val_main_labels_tensor, val_fine_labels_tensor)


ValueError: only one element tensors can be converted to Python scalars

<h3>Define Custom Metrics</h3>

In [92]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"f1": f1}

<h3>Initialize the Trainer</h3>

In [93]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics  # Attach custom metric
)

NameError: name 'train_dataset' is not defined

<h3>Train the Model</h3>

In [71]:
trainer.train()

  0%|          | 0/156 [10:53<?, ?it/s]
  6%|▋         | 10/156 [09:27<2:17:59, 56.71s/it]
  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
  6%|▋         | 10/156 [00:47<10:34,  4.35s/it]

{'loss': 0.5866, 'grad_norm': 1.6705063581466675, 'learning_rate': 4.67948717948718e-05, 'epoch': 0.19}


 13%|█▎        | 20/156 [01:34<10:20,  4.56s/it]

{'loss': 0.4103, 'grad_norm': 1.0101654529571533, 'learning_rate': 4.358974358974359e-05, 'epoch': 0.38}


 19%|█▉        | 30/156 [02:11<07:38,  3.64s/it]

{'loss': 0.3242, 'grad_norm': 1.0920099020004272, 'learning_rate': 4.038461538461539e-05, 'epoch': 0.58}


 26%|██▌       | 40/156 [02:47<06:52,  3.56s/it]

{'loss': 0.2715, 'grad_norm': 0.4572836458683014, 'learning_rate': 3.717948717948718e-05, 'epoch': 0.77}


 32%|███▏      | 50/156 [03:25<06:48,  3.85s/it]

{'loss': 0.2564, 'grad_norm': 0.45132243633270264, 'learning_rate': 3.397435897435898e-05, 'epoch': 0.96}


  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
 38%|███▊      | 60/156 [04:11<06:21,  3.97s/it]

{'loss': 0.2341, 'grad_norm': 0.4390738904476166, 'learning_rate': 3.0769230769230774e-05, 'epoch': 1.15}


 45%|████▍     | 70/156 [04:50<05:22,  3.74s/it]

{'loss': 0.2342, 'grad_norm': 0.48095059394836426, 'learning_rate': 2.756410256410257e-05, 'epoch': 1.35}


 51%|█████▏    | 80/156 [05:27<04:56,  3.90s/it]

{'loss': 0.2385, 'grad_norm': 0.460703581571579, 'learning_rate': 2.435897435897436e-05, 'epoch': 1.54}


 58%|█████▊    | 90/156 [06:06<04:02,  3.67s/it]

{'loss': 0.2264, 'grad_norm': 0.4588775932788849, 'learning_rate': 2.1153846153846154e-05, 'epoch': 1.73}


 64%|██████▍   | 100/156 [06:43<03:43,  3.99s/it]

{'loss': 0.2322, 'grad_norm': 0.3887184262275696, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.92}


  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
 71%|███████   | 110/156 [07:28<02:55,  3.81s/it]

{'loss': 0.2267, 'grad_norm': 0.4514479339122772, 'learning_rate': 1.4743589743589745e-05, 'epoch': 2.12}


 77%|███████▋  | 120/156 [08:06<02:12,  3.69s/it]

{'loss': 0.2257, 'grad_norm': 0.331670880317688, 'learning_rate': 1.153846153846154e-05, 'epoch': 2.31}


 83%|████████▎ | 130/156 [08:44<01:42,  3.94s/it]

{'loss': 0.2302, 'grad_norm': 0.32617467641830444, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


 90%|████████▉ | 140/156 [09:23<01:00,  3.79s/it]

{'loss': 0.2329, 'grad_norm': 0.4229016900062561, 'learning_rate': 5.128205128205128e-06, 'epoch': 2.69}


 96%|█████████▌| 150/156 [10:06<00:25,  4.18s/it]

{'loss': 0.2208, 'grad_norm': 0.44514214992523193, 'learning_rate': 1.9230769230769234e-06, 'epoch': 2.88}


  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
100%|██████████| 156/156 [10:45<00:00,  4.14s/it]

{'train_runtime': 645.7079, 'train_samples_per_second': 1.923, 'train_steps_per_second': 0.242, 'train_loss': 0.2746143692579025, 'epoch': 3.0}





TrainOutput(global_step=156, training_loss=0.2746143692579025, metrics={'train_runtime': 645.7079, 'train_samples_per_second': 1.923, 'train_steps_per_second': 0.242, 'total_flos': 24258503394792.0, 'train_loss': 0.2746143692579025, 'epoch': 3.0})

<h3>Save Model and Results</h3>

In [72]:
model.save_pretrained("./entity-framing-model")
tokenizer.save_pretrained("./entity-framing-model")

('./entity-framing-model\\tokenizer_config.json',
 './entity-framing-model\\special_tokens_map.json',
 './entity-framing-model\\sentencepiece.bpe.model',
 './entity-framing-model\\added_tokens.json')

<h3>Evaluate the Model</h3>

In [74]:
from sklearn.model_selection import train_test_split

# Split dataset: 80% for training, 20% for evaluation
train_indices, val_indices = train_test_split(
    range(len(main_labels)), test_size=0.2, random_state=42
)

# Create training and evaluation datasets
train_encodings = {key: val[train_indices] for key, val in encodings.items()}
val_encodings = {key: val[val_indices] for key, val in encodings.items()}

train_main_labels = [main_labels[i] for i in train_indices]
val_main_labels = [main_labels[i] for i in val_indices]

train_fine_labels = [fine_labels[i] for i in train_indices]
val_fine_labels = [fine_labels[i] for i in val_indices]

train_dataset = EntityFramingDataset(train_encodings, train_main_labels, train_fine_labels)
eval_dataset = EntityFramingDataset(val_encodings, val_main_labels, val_fine_labels)


ValueError: only one element tensors can be converted to Python scalars

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add the evaluation dataset
)


In [73]:
# Evaluate the model
results = trainer.evaluate()
print(results)


ValueError: Trainer: evaluation requires an eval_dataset.