In [None]:
pip install transformers datasets scikit-learn torch evaluate

In [None]:
import os

def load_annotations(annotations_file):
    """Read mention annotations from a tab-separated text file.

    Every line must contain at least four tab-separated fields. Field 0 is
    kept as the document reference and fields 2 and 3 are kept, as strings,
    as the ``[start, end]`` position of the mention. Any further fields are
    ignored.

    Args:
        annotations_file: Path of the tab-separated annotations file.

    Returns:
        A ``(position, document_references)`` tuple of two parallel lists with
        one entry per line of the file.

    Raises:
        ValueError: If a line has fewer than four tab-separated fields.
    """
    position = []
    document_references = []
    # Explicit UTF-8 (as used for the other files in this notebook) so that
    # reading does not depend on the platform's default encoding.
    with open(annotations_file, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            parts = line.strip().split('\t')
            if len(parts) < 4:
                raise ValueError(
                    f"{annotations_file}: line {line_number} has {len(parts)} "
                    f"tab-separated field(s), expected at least 4"
                )
            document_references.append(parts[0])
            position.append([parts[2], parts[3]])
    # Report how many annotations were read.
    print(len(position))

    return position, document_references

def load_documents(raw_documents_folder):
    """Read every regular file in a folder into memory as UTF-8 text.

    Args:
        raw_documents_folder: Path of the directory holding the documents.

    Returns:
        A dict mapping each file name (not the full path) to the file's text,
        inserted in sorted file-name order.
    """
    documents = {}
    for filename in sorted(os.listdir(raw_documents_folder)):
        file_path = os.path.join(raw_documents_folder, filename)
        # os.listdir also returns sub-directories (for example the
        # .ipynb_checkpoints folder Jupyter creates); opening one would raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[filename] = file.read()
    return documents

def process_documents(annotations_file, raw_documents_folder):
    """Pair each annotation with the text of the document it refers to.

    Args:
        annotations_file: Path of the tab-separated annotations file, read
            with ``load_annotations``.
        raw_documents_folder: Directory of document files, read with
            ``load_documents``.

    Returns:
        A ``(result_array, position_array)`` tuple of two lists of equal
        length: the document text and the position pair of every annotation
        whose document reference matches a file name in the folder.
        Annotations whose document is missing are dropped from both lists.
    """
    import warnings

    position_array, document_references = load_annotations(annotations_file)
    documents_dict = load_documents(raw_documents_folder)

    # Filter texts and positions together: dropping only the text of a missing
    # document would pair every later text with the wrong position.
    result_array = []
    kept_positions = []
    for doc, position in zip(document_references, position_array):
        if doc in documents_dict:
            result_array.append(documents_dict[doc])
            kept_positions.append(position)

    dropped = len(document_references) - len(result_array)
    if dropped:
        warnings.warn(
            f"{dropped} annotation(s) reference a document that is not in "
            f"{raw_documents_folder!r} and were skipped; the results no longer "
            f"correspond one-to-one to the lines of {annotations_file!r}."
        )

    return result_array, kept_positions

# Input locations, given relative to the working directory.
annotations_file = 'subtask-1-entity-mentions.txt'
raw_documents_folder = 'subtask-1-documents'
# result_array: text of the referenced document for each annotation whose document was found.
# position_array: the [field 2, field 3] pair read from the annotation lines.
result_array, position_array = process_documents(annotations_file, raw_documents_folder)



235


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# The tokenizer and the classifier are loaded from the same checkpoint
# identifier. The argmax of this model's logits is used further down as the
# main-category index.
MAIN_CATEGORY_CHECKPOINT = "VSPuzzler/SemEval2025FinalModel"

tokenizer = AutoTokenizer.from_pretrained(MAIN_CATEGORY_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MAIN_CATEGORY_CHECKPOINT)
# Evaluation mode: the model is only used for prediction in this notebook.
model.eval()

def mark_target_word(text, position_range, start_token="[TARGET]", end_token="[/TARGET]", max_tokens=512):
    """Wrap the target span of ``text`` in marker strings, trimming long inputs.

    Args:
        text: Full document text.
        position_range: Pair ``(start, end)`` of character offsets. Each value
            is passed through ``int()`` and used as a Python slice bound.
        start_token: String inserted immediately before the span.
        end_token: String inserted immediately after the span.
        max_tokens: Token count above which the marked text is cut down to a
            window of tokens around the span.

    Returns:
        The marked text. When it tokenizes (with the notebook-level
        ``tokenizer``) to more than ``max_tokens`` tokens, the string rebuilt by
        ``convert_tokens_to_string`` from a token window around the span is
        returned instead.
    """
    raw_start, raw_end = position_range
    begin, end = int(raw_start), int(raw_end)

    prefix = text[:begin]
    target = text[begin:end]
    suffix = text[end:]
    marked = "".join((prefix, start_token, target, end_token, suffix))

    pieces = tokenizer.tokenize(marked)
    if len(pieces) <= max_tokens:
        return marked

    # NOTE(review): the span's token indices are measured on the unmarked text
    # but the slice below is applied to the tokens of the marked text, so the
    # marker tokens are not accounted for in these indices.
    first_target_piece = len(tokenizer.tokenize(prefix))
    target_piece_count = len(tokenizer.tokenize(target))

    # Split the remaining token budget evenly between both sides of the span.
    half_context = (max_tokens - target_piece_count) // 2

    window_start = max(0, first_target_piece - half_context)
    window_stop = min(len(pieces), first_target_piece + target_piece_count + half_context)

    return tokenizer.convert_tokens_to_string(pieces[window_start:window_stop])

def tokenize_function(text, tokenizer=None, max_length=128):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    This notebook defines ``tokenize_function`` in two cells; this signature
    accepts both the one-argument and the two-argument call form.

    Args:
        text: The string to tokenize.
        tokenizer: Callable tokenizer to use. When ``None``, the notebook-level
            ``tokenizer`` variable is used.
        max_length: Length the input is padded and truncated to.

    Returns:
        Whatever the tokenizer returns when called with
        ``padding="max_length"``, ``truncation=True`` and
        ``return_tensors="pt"``.
    """
    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        tokenizer = globals()["tokenizer"]
    return tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict(model, tokenizer, result_array, position_array):
    """Predict one class index per document/position pair.

    Args:
        model: Classifier called with ``input_ids`` and ``attention_mask``;
            its output must expose ``.logits``.
        tokenizer: Tokenizer used to encode each marked text.
        result_array: Document texts.
        position_array: Position pairs; ``position_array[i]`` belongs to
            ``result_array[i]``.

    Returns:
        A list with, for every entry of ``result_array``, the index of the
        largest logit as a Python int.

    Note:
        ``mark_target_word`` measures text length with the notebook-level
        ``tokenizer``, not with the ``tokenizer`` argument.
    """
    predictions = []

    for i, text in enumerate(result_array):
        marked_text = mark_target_word(text, position_array[i])

        # Encode with the tokenizer that was passed in. Calling it directly
        # keeps this function independent of which ``tokenize_function``
        # definition is currently live in the kernel.
        tokenized_text = tokenizer(
            marked_text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(
                input_ids=tokenized_text['input_ids'],
                attention_mask=tokenized_text['attention_mask']
            )
        predictions.append(torch.argmax(outputs.logits, dim=1).item())

    return predictions

# One class index (argmax over the main model's logits) per entry of result_array.
predictions = predict(model, tokenizer, result_array, position_array)
print("Predictions:", predictions)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Checkpoint identifier of each sub-category classifier, keyed by the
# main-category index that predict_from_models uses to select it.
_SUBCATEGORY_CHECKPOINTS = {
    0: "VSPuzzler/ProtagonistClassificationModel",
    1: "VSPuzzler/AntagonistClassificationModel",
    2: "VSPuzzler/InnocentClassificationModel",
}

# For every checkpoint the tokenizer is loaded first, then the model.
models = {
    category: {
        "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
        "model": AutoModelForSequenceClassification.from_pretrained(checkpoint),
    }
    for category, checkpoint in _SUBCATEGORY_CHECKPOINTS.items()
}

# Put every classifier into evaluation mode; they are only used for prediction.
for entry in models.values():
    entry["model"].eval()

def tokenize_function(text, tokenizer=None, max_length=128):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    This definition replaces the one-argument ``tokenize_function`` from an
    earlier cell. ``tokenizer`` is optional so that code written against the
    earlier one-argument form keeps working after this cell has run.

    Args:
        text: The string to tokenize.
        tokenizer: Callable tokenizer to use. When ``None``, the notebook-level
            ``tokenizer`` variable is used.
        max_length: Length the input is padded and truncated to.

    Returns:
        Whatever the tokenizer returns when called with
        ``padding="max_length"``, ``truncation=True`` and
        ``return_tensors="pt"``.
    """
    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        tokenizer = globals()["tokenizer"]
    return tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def predict_from_models(predictions, result_array, position_array, threshold=0.175):
    """Predict sub-category indices with the classifier of each main category.

    Args:
        predictions: Main-category index per item; each value must be a key
            of the notebook-level ``models`` dict.
        result_array: Document texts, parallel to ``predictions``.
        position_array: Position pairs, parallel to ``predictions``.
        threshold: A label is selected when its sigmoid probability is
            strictly greater than this value.

    Returns:
        One list of label indices per item. When no probability exceeds
        ``threshold``, the list holds the single index of the largest logit.

    Raises:
        ValueError: If a main-category index has no entry in ``models``.
    """
    final_predictions = []

    for i, category in enumerate(predictions):
        text = result_array[i]
        position_range = position_array[i]

        model_info = models.get(category)
        if model_info is None:
            raise ValueError(
                f"No sub-category model is registered for main category {category!r}"
            )

        tokenizer = model_info["tokenizer"]
        model = model_info["model"]

        marked_text = mark_target_word(text, position_range)
        tokenized_text = tokenize_function(marked_text, tokenizer)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(
                input_ids=tokenized_text['input_ids'],
                attention_mask=tokenized_text['attention_mask']
            )
        logits = outputs.logits

        # Only one text is encoded per call, so drop just the batch dimension.
        # A bare squeeze() would also collapse a single-label head to a scalar,
        # which cannot be enumerated.
        probabilities = torch.sigmoid(logits).squeeze(0).tolist()
        predicted_labels = [idx for idx, prob in enumerate(probabilities) if prob > threshold]
        if not predicted_labels:
            # Always return at least the most likely label.
            predicted_labels = [torch.argmax(logits).item()]

        final_predictions.append(predicted_labels)

    return final_predictions

# One list of sub-category indices per entry of predictions, produced by the
# classifier that the entry's main-category index selects.
multi_label_predictions = predict_from_models(predictions, result_array, position_array)
print("Final Multi-Label Predictions:", multi_label_predictions)


In [None]:
# Index -> label-name tables used by convert_predictions_to_text.
# Each index is the position of the name in its tuple, starting at 0.
main_category_labels = dict(enumerate(("Protagonist", "Antagonist", "Innocent")))

protagonist_labels = dict(enumerate((
    "Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous",
)))
antagonist_labels = dict(enumerate((
    "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor", "Spy",
    "Saboteur", "Corrupt", "Incompetent", "Terrorist", "Deceiver", "Bigot",
)))
innocent_labels = dict(enumerate(("Forgotten", "Exploited", "Victim", "Scapegoat")))

def convert_predictions_to_text(main_categories, subcategories):
    """Translate numeric predictions into their label names.

    Args:
        main_categories: Main-category indices (keys of
            ``main_category_labels``).
        subcategories: For each main category, an iterable of sub-category
            indices valid for that category's label table.

    Returns:
        A list of dicts, one per input pair, with the keys
        ``"Main Category"`` (a label name) and ``"Subcategories"`` (a list of
        label names).
    """
    # Sub-category label table for each main-category index.
    sub_label_tables = {
        0: protagonist_labels,
        1: antagonist_labels,
        2: innocent_labels,
    }

    text_results = []
    for main_category, subcategory_indices in zip(main_categories, subcategories):
        main_text = main_category_labels[main_category]
        sub_labels = sub_label_tables[main_category]
        sub_texts = [sub_labels[idx] for idx in subcategory_indices]
        text_results.append({"Main Category": main_text, "Subcategories": sub_texts})

    return text_results



converted_text = convert_predictions_to_text(predictions, multi_label_predictions)
print(converted_text)


In [None]:
import os

def append_categories_to_annotations(annotations_file, converted_text, output_file):
    """Copy the annotations file, appending the predicted labels to each line.

    Line ``i`` of the input is matched with ``converted_text[i]``. Its
    ``"Main Category"`` value and its tab-joined ``"Subcategories"`` values are
    appended as extra tab-separated fields. Lines beyond the end of
    ``converted_text`` get ``"Unknown"`` and an empty sub-category field.

    Args:
        annotations_file: Path of the tab-separated input file (UTF-8).
        converted_text: List of dicts with the keys ``"Main Category"`` and
            ``"Subcategories"``.
        output_file: Path of the file to write (UTF-8, overwritten).
    """
    with open(annotations_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for row_index, raw_line in enumerate(infile):
            fields = raw_line.strip().split('\t')

            if row_index < len(converted_text):
                entry = converted_text[row_index]
                main_category = entry["Main Category"]
                subcategories = "\t".join(entry["Subcategories"])
            else:
                # No prediction is available for this line.
                main_category = "Unknown"
                subcategories = ""

            fields.extend((main_category, subcategories))
            outfile.write("\t".join(fields) + "\n")

# Output path, given relative to the working directory.
output_file = 'output_file'
append_categories_to_annotations(annotations_file, converted_text, output_file)
print(f"Updated annotations saved to {output_file}")

Updated annotations saved to output_file
