Step 1: Install Dependencies

In [1]:
# !pip install transformers datasets scikit-learn pandas --upgrade

Step 2: Load and Prepare the Data

In [2]:
import pandas as pd

# Custom function to handle rows with multiple fine-grained roles
def load_annotations(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        rows = []
        for line in f:
            parts = line.strip().split("\t")
            article_id, entity_mention, start_offset, end_offset, main_role = parts[:5]
            fine_grained_roles = parts[5:]  # Handle multiple fine-grained roles
            rows.append({
                "article_id": article_id,
                "entity_mention": entity_mention,
                "start_offset": int(start_offset),
                "end_offset": int(end_offset),
                "main_role": main_role,
                "fine_grained_roles": fine_grained_roles,
            })
        return pd.DataFrame(rows)

# Load the annotations file
annotations_path = "ENN/subtask-1-annotations.txt"  # Replace with your file path
annotations = load_annotations(annotations_path)

# Display the first few rows
print(annotations.head())

         article_id   entity_mention  start_offset  end_offset    main_role  \
0  EN_CC_100013.txt       Bill Gates            93         102   Antagonist   
1  EN_CC_100013.txt              BBC          1860        1862   Antagonist   
2  EN_CC_100013.txt  Jeffrey Epstein          2005        2019   Antagonist   
3  EN_UA_300009.txt     Fail Alsynov           176         187  Protagonist   
4  EN_UA_300009.txt   Bashkir people          1616        1629     Innocent   

    fine_grained_roles  
0  [Deceiver, Corrupt]  
1           [Deceiver]  
2            [Corrupt]  
3      [Rebel, Martyr]  
4             [Victim]  


Step 3: Map Raw Documents

In [3]:
import os

# Load raw documents
raw_documents_path = "ENN/raw-documents"  # Replace with your path
raw_documents = {}
for filename in os.listdir(raw_documents_path):
    if os.path.isfile(os.path.join(raw_documents_path, filename)):
        with open(os.path.join(raw_documents_path, filename), "r", encoding="utf-8") as f:
            raw_documents[filename] = f.read()

# Map article_text to annotations
annotations["article_text"] = annotations["article_id"].map(raw_documents)

# Verify if the column is added
print(annotations.head())

         article_id   entity_mention  start_offset  end_offset    main_role  \
0  EN_CC_100013.txt       Bill Gates            93         102   Antagonist   
1  EN_CC_100013.txt              BBC          1860        1862   Antagonist   
2  EN_CC_100013.txt  Jeffrey Epstein          2005        2019   Antagonist   
3  EN_UA_300009.txt     Fail Alsynov           176         187  Protagonist   
4  EN_UA_300009.txt   Bashkir people          1616        1629     Innocent   

    fine_grained_roles                                       article_text  
0  [Deceiver, Corrupt]  Bill Gates Says He Is ‘The Solution’ To Climat...  
1           [Deceiver]  Bill Gates Says He Is ‘The Solution’ To Climat...  
2            [Corrupt]  Bill Gates Says He Is ‘The Solution’ To Climat...  
3      [Rebel, Martyr]  Russia: Clashes erupt in Bashkortostan as righ...  
4             [Victim]  Russia: Clashes erupt in Bashkortostan as righ...  


Step 4: Split Data

In [4]:
from sklearn.model_selection import train_test_split

# Taxonomy for validation
taxonomy = {
    "Protagonist": ["Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous"],
    "Antagonist": [
        "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor", "Spy",
        "Saboteur", "Corrupt", "Incompetent", "Terrorist", "Deceiver", "Bigot"
    ],
    "Innocent": ["Forgotten", "Exploited", "Victim", "Scapegoat"]
}

# Function to validate roles
def validate_roles(row):
    main_role = row["main_role"]
    valid_fine_roles = taxonomy.get(main_role, [])
    fine_roles = row["fine_grained_roles"]
    invalid_roles = [role for role in fine_roles if role not in valid_fine_roles]
    if invalid_roles:
        raise ValueError(f"Invalid roles {invalid_roles} for main role {main_role}")
    return True

# Validate roles in the dataset
annotations.apply(validate_roles, axis=1)

# Split the data
train_data, test_data = train_test_split(annotations, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")

Train size: 548, Validation size: 69, Test size: 69


Step 5: Tokenize Data

In [5]:
from transformers import DistilBertTokenizer
from transformers import BertTokenizer, BertForSequenceClassification

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_data(data):
    inputs = []
    labels = []

    for _, row in data.iterrows():
        context_window = f"Entity: {row['entity_mention']} Context: {row['article_text'][:400]}"
        inputs.append(context_window)
        labels.append(row["main_role"])

    encodings = tokenizer(inputs, padding="max_length", truncation=True, max_length=400, return_tensors="pt")
    return encodings, labels


# Tokenize datasets
train_encodings, train_labels = tokenize_data(train_data)
val_encodings, val_labels = tokenize_data(val_data)
test_encodings, test_labels = tokenize_data(test_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Step 6: Prepare the Dataset

In [6]:
import torch

class EntityRoleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, label_mapping):
        self.encodings = encodings
        self.labels = [label_mapping[label] for label in labels]  # Map labels to integers

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Map labels to integers
label_mapping = {"Protagonist": 0, "Antagonist": 1, "Innocent": 2}

# Create datasets
train_dataset = EntityRoleDataset(train_encodings, train_labels, label_mapping)
val_dataset = EntityRoleDataset(val_encodings, val_labels, label_mapping)
test_dataset = EntityRoleDataset(test_encodings, test_labels, label_mapping)

Step 7: Define Training and Train the Model

In [7]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

# Initialize DistilBERT for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_mapping))
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.83471,0.710145,0.589781
2,No log,0.800475,0.710145,0.589781
3,No log,0.792021,0.681159,0.605639
4,No log,0.799292,0.637681,0.597733
5,No log,0.824718,0.710145,0.672827
6,No log,0.824672,0.666667,0.678986
7,No log,0.806488,0.695652,0.682651
8,No log,0.759366,0.753623,0.755087
9,No log,0.851175,0.73913,0.72149
10,No log,0.939547,0.724638,0.723212


TrainOutput(global_step=360, training_loss=0.2768935309516059, metrics={'train_runtime': 575.6347, 'train_samples_per_second': 19.04, 'train_steps_per_second': 0.625, 'total_flos': 1134272328768000.0, 'train_loss': 0.2768935309516059, 'epoch': 20.0})

Step 8: Evaluate the Model

In [8]:
from sklearn.metrics import classification_report

# Evaluate on test data
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Get predictions
import numpy as np

predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Convert test labels to numeric
test_labels_numeric = [label_mapping[label] for label in test_labels]

# Classification report
print(classification_report(test_labels_numeric, predicted_labels, target_names=label_mapping.keys()))
print(predicted_labels)


Test Results: {'eval_loss': 1.6920788288116455, 'eval_accuracy': 0.6521739130434783, 'eval_f1': 0.6327381038446523, 'eval_runtime': 0.8457, 'eval_samples_per_second': 81.587, 'eval_steps_per_second': 10.642, 'epoch': 20.0}
              precision    recall  f1-score   support

 Protagonist       0.54      0.39      0.45        18
  Antagonist       0.71      0.82      0.76        45
    Innocent       0.25      0.17      0.20         6

    accuracy                           0.65        69
   macro avg       0.50      0.46      0.47        69
weighted avg       0.63      0.65      0.63        69

[1 1 0 0 0 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 0 1 1
 1 1 1 0 2 1 1 1 1 0 1 1 1 1 1 0 2 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1]


<h1>Fine-Grained Role Classification</h1>

Step 1: Prepare the Fine-Grained Role Classification Dataset

In [9]:
fine_grained_taxonomy = {
    "Protagonist": ["Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous"],
    "Antagonist": [
        "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor", "Spy", "Saboteur",
        "Corrupt", "Incompetent", "Terrorist", "Deceiver", "Bigot"
    ],
    "Innocent": ["Forgotten", "Exploited", "Victim", "Scapegoat"]
}


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

# Prepare dataset for fine-grained classification
def prepare_fine_grained_data(annotations, main_role_predictions):
    fine_grained_data = annotations.copy()
    fine_grained_data['predicted_main_role'] = main_role_predictions
    return fine_grained_data

# Use main role predictions from DistilBERT
fine_grained_data = prepare_fine_grained_data(test_data, predicted_labels)

# Add main role predictions to test_data
test_data['predicted_main_role'] = fine_grained_data['predicted_main_role']


# Ensure 'fine_grained_roles' is a list
fine_grained_data['fine_grained_roles'] = fine_grained_data['fine_grained_roles'].apply(
    lambda x: x if isinstance(x, list) else []
)

# Binarize the fine-grained roles using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
fine_grained_labels = mlb.fit_transform(fine_grained_data['fine_grained_roles'])

# Verify the transformation
print("Classes for Fine-Grained Roles:", mlb.classes_)
fine_grained_data['fine_grained_label'] = list(fine_grained_labels)


Classes for Fine-Grained Roles: ['Conspirator' 'Corrupt' 'Deceiver' 'Exploited' 'Foreign Adversary'
 'Guardian' 'Incompetent' 'Instigator' 'Martyr' 'Peacemaker' 'Rebel'
 'Saboteur' 'Scapegoat' 'Terrorist' 'Traitor' 'Tyrant' 'Victim' 'Virtuous']


Step 2: Define Features for ML Models

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# Generate TF-IDF features for text
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 5))
text_features = tfidf_vectorizer.fit_transform(fine_grained_data['article_text'])

# One-hot encode the predicted main roles
role_encoder = OneHotEncoder()
role_features = role_encoder.fit_transform(fine_grained_data['predicted_main_role'].values.reshape(-1, 1))

print(fine_grained_data.head())  # Ensure the 'fine_grained_roles' column exists and has values
print("-----")
print(fine_grained_data['fine_grained_roles'].head())  # Inspect the values in 'fine_grained_roles'

X_fine_grained = hstack([text_features, role_features])
y_fine_grained = fine_grained_data['fine_grained_roles']


           article_id    entity_mention  start_offset  end_offset  \
69   EN_CC_100076.txt           Reuters           129         135   
616  EN_CC_200116.txt  Pierre Poilievre           648         663   
651  EN_UA_011260.txt           Ukraine          1225        1231   
250  EN_UA_010091.txt            Europe          1015        1020   
603  EN_UA_028520.txt      Viktor Orbán           656         667   

       main_role fine_grained_roles  \
69    Antagonist      [Incompetent]   
616  Protagonist         [Guardian]   
651   Antagonist      [Conspirator]   
250  Protagonist         [Guardian]   
603   Antagonist          [Corrupt]   

                                          article_text  predicted_main_role  \
69   Reuters Joins BBC in Failed ‘Fact Check’ of Da...                    1   
616  Canadian Conservative Leader Poilievre Scorche...                    1   
651  Russia has a clear plan to resolve the conflic...                    0   
250  European leader calls on worl

Step 3: Train-Test Split

In [12]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler

X_train_fine, X_test_fine, y_train_fine, y_test_fine = train_test_split(
    X_fine_grained, fine_grained_labels, test_size=0.2, random_state=42
)
# # First handle cases with very few samples using RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# X_train_fine, y_train_fine = ros.fit_resample(X_train_fine, y_train_fine)

# # Apply SMOTE for remaining imbalances
# smote_tomek = SMOTETomek(random_state=42)
# X_train_fine, y_train_fine = smote_tomek.fit_resample(X_train_fine, y_train_fine)

Step 4: Train ML Models for Fine-Grained Role Prediction

In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score, recall_score, f1_score

# Wrap models in OneVsRestClassifier
multi_label_models = {
    "SVM": OneVsRestClassifier(SVC(kernel="linear", probability=True)),
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=2000, random_state=42)),
    "Random Forest": OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42)),
    "MLP": OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=(256, 512, 256), max_iter=1000, random_state=42)),
}

# Train and evaluate each model
for model_name, model in multi_label_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_fine, y_train_fine)
    y_pred = model.predict(X_test_fine)

    # Hamming Loss
    print(f"{model_name} Hamming Loss: {hamming_loss(y_test_fine, y_pred):.4f}")

    # Exact Match Ratio
    exact_match_ratio = accuracy_score(y_test_fine, y_pred)
    print(f"{model_name} Exact Match Ratio (Subset Accuracy): {exact_match_ratio:.4f}")

    # Precision, Recall, F1-Score (Macro-Averaged)
    precision = precision_score(y_test_fine, y_pred, average='macro')
    recall = recall_score(y_test_fine, y_pred, average='macro')
    f1 = f1_score(y_test_fine, y_pred, average='macro')
    print(f"{model_name} Macro Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Multi-Label Classification Report
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test_fine, y_pred, target_names=mlb.classes_))



Training SVM...




SVM Hamming Loss: 0.0595
SVM Exact Match Ratio (Subset Accuracy): 0.0000
SVM Macro Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
SVM Classification Report:
                   precision    recall  f1-score   support

      Conspirator       0.00      0.00      0.00         3
          Corrupt       0.00      0.00      0.00         1
         Deceiver       0.00      0.00      0.00         0
        Exploited       0.00      0.00      0.00         0
Foreign Adversary       0.00      0.00      0.00         0
         Guardian       0.00      0.00      0.00         0
      Incompetent       0.00      0.00      0.00         2
       Instigator       0.00      0.00      0.00         0
           Martyr       0.00      0.00      0.00         1
       Peacemaker       0.00      0.00      0.00         1
            Rebel       0.00      0.00      0.00         0
         Saboteur       0.00      0.00      0.00         1
        Scapegoat       0.00      0.00      0.00         1
        Ter

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Hamming Loss: 0.0595
Logistic Regression Exact Match Ratio (Subset Accuracy): 0.0000
Logistic Regression Macro Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Logistic Regression Classification Report:
                   precision    recall  f1-score   support

      Conspirator       0.00      0.00      0.00         3
          Corrupt       0.00      0.00      0.00         1
         Deceiver       0.00      0.00      0.00         0
        Exploited       0.00      0.00      0.00         0
Foreign Adversary       0.00      0.00      0.00         0
         Guardian       0.00      0.00      0.00         0
      Incompetent       0.00      0.00      0.00         2
       Instigator       0.00      0.00      0.00         0
           Martyr       0.00      0.00      0.00         1
       Peacemaker       0.00      0.00      0.00         1
            Rebel       0.00      0.00      0.00         0
         Saboteur       0.00      0.00      0.00         1
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Hamming Loss: 0.0595
Random Forest Exact Match Ratio (Subset Accuracy): 0.0714
Random Forest Macro Precision: 0.0556, Recall: 0.0278, F1-Score: 0.0370
Random Forest Classification Report:
                   precision    recall  f1-score   support

      Conspirator       0.00      0.00      0.00         3
          Corrupt       0.00      0.00      0.00         1
         Deceiver       0.00      0.00      0.00         0
        Exploited       0.00      0.00      0.00         0
Foreign Adversary       0.00      0.00      0.00         0
         Guardian       0.00      0.00      0.00         0
      Incompetent       0.00      0.00      0.00         2
       Instigator       0.00      0.00      0.00         0
           Martyr       0.00      0.00      0.00         1
       Peacemaker       0.00      0.00      0.00         1
            Rebel       0.00      0.00      0.00         0
         Saboteur       0.00      0.00      0.00         1
        Scapegoat       0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MLP Hamming Loss: 0.0595
MLP Exact Match Ratio (Subset Accuracy): 0.0714
MLP Macro Precision: 0.0556, Recall: 0.0278, F1-Score: 0.0370
MLP Classification Report:
                   precision    recall  f1-score   support

      Conspirator       0.00      0.00      0.00         3
          Corrupt       0.00      0.00      0.00         1
         Deceiver       0.00      0.00      0.00         0
        Exploited       0.00      0.00      0.00         0
Foreign Adversary       0.00      0.00      0.00         0
         Guardian       0.00      0.00      0.00         0
      Incompetent       0.00      0.00      0.00         2
       Instigator       0.00      0.00      0.00         0
           Martyr       0.00      0.00      0.00         1
       Peacemaker       0.00      0.00      0.00         1
            Rebel       0.00      0.00      0.00         0
         Saboteur       0.00      0.00      0.00         1
        Scapegoat       0.00      0.00      0.00         1
        Ter

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Step 5: Use Models for Prediction

In [14]:
def predict_fine_grained_with_taxonomy(model, new_data, tfidf_vectorizer, role_encoder, mlb, fine_grained_taxonomy, label_mapping):
    # Ensure required columns exist
    required_columns = ['article_text', 'predicted_main_role']
    missing_columns = [col for col in required_columns if col not in new_data.columns]
    if missing_columns:
        raise ValueError(f"The following columns are missing from new_data: {missing_columns}")

    # Generate features
    text_features = tfidf_vectorizer.transform(new_data['article_text'])
    role_features = role_encoder.transform(new_data['predicted_main_role'].values.reshape(-1, 1))
    features = hstack([text_features, role_features])

    # Predict fine-grained roles
    fine_grained_binary_preds = model.predict(features)
    fine_grained_preds = mlb.inverse_transform(fine_grained_binary_preds)  # Binary to readable roles

    # Enforce taxonomy constraints
    filtered_preds = []
    for i, main_role in enumerate(new_data['predicted_main_role']):
        # Convert integer label to string label
        main_role_str = {v: k for k, v in label_mapping.items()}[main_role]
        valid_roles = fine_grained_taxonomy[main_role_str]  # Get valid roles for this main role
        filtered_roles = [role for role in fine_grained_preds[i] if role in valid_roles]
        filtered_preds.append(filtered_roles)

    return filtered_preds



# Example usage with taxonomy enforcement
best_model = multi_label_models["MLP"]  # Replace with your best-performing model
multi_label_preds = predict_fine_grained_with_taxonomy(
    best_model,
    test_data,
    tfidf_vectorizer,
    role_encoder,
    mlb,
    fine_grained_taxonomy,
    label_mapping  # Pass the label mapping dictionary
)

# Add filtered predictions to test data
test_data["predicted_fine_grained_roles"] = multi_label_preds
print(test_data[["article_id", "entity_mention", "predicted_main_role", "predicted_fine_grained_roles"]])

           article_id                        entity_mention  \
69   EN_CC_100076.txt                               Reuters   
616  EN_CC_200116.txt                      Pierre Poilievre   
651  EN_UA_011260.txt                               Ukraine   
250  EN_UA_010091.txt                                Europe   
603  EN_UA_028520.txt                          Viktor Orbán   
..                ...                                   ...   
210  EN_UA_016012.txt                             Zinchenko   
310  EN_CC_300064.txt  'Letzte Generation' (Last Generation   
10   EN_UA_014637.txt                     Yevgeny Prigozhin   
211  EN_UA_016012.txt                    Wladimir Klitschko   
120  EN_UA_102990.txt                                Russia   

     predicted_main_role predicted_fine_grained_roles  
69                     1                           []  
616                    1                           []  
651                    0                           []  
250                

Step 6: Evaluate Multi-Label Predictions

In [15]:
from sklearn.metrics import hamming_loss, accuracy_score, precision_score, recall_score, f1_score

# Evaluate multi-label predictions
print("Hamming Loss:", hamming_loss(y_test_fine, y_pred))
print("Exact Match Ratio (Subset Accuracy):", accuracy_score(y_test_fine, y_pred))

# Macro Precision, Recall, F1-Score
precision = precision_score(y_test_fine, y_pred, average='macro')
recall = recall_score(y_test_fine, y_pred, average='macro')
f1 = f1_score(y_test_fine, y_pred, average='macro')
print(f"Macro Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


Hamming Loss: 0.05952380952380952
Exact Match Ratio (Subset Accuracy): 0.07142857142857142
Macro Precision: 0.0556, Recall: 0.0278, F1-Score: 0.0370


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
