In [2]:
import os
def load_annotations(annotations_file):
    """Parse the tab-separated annotation file.

    Each usable line must have at least four tab-separated fields and contain
    one of the role labels "Protagonist", "Antagonist" or "Innocent".
    Fields 2 and 3 are returned as the entity's character offsets.

    Args:
        annotations_file: Path to the annotation file (read as UTF-8).

    Returns:
        A tuple ``(annotations, position)`` of equal length:
        * ``annotations[i]`` is every field up to and including the role,
          followed by one list holding all fields after the role.
        * ``position[i]`` is ``[parts[2], parts[3]]`` (strings) for that same line.

        Lines that are blank, too short, or carry no known role are skipped in
        BOTH lists so that index ``i`` always refers to the same annotation.
    """
    roles = ("Protagonist", "Antagonist", "Innocent")
    annotations = []
    position = []
    with open(annotations_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            # Blank/short lines have no offsets; skipping avoids an IndexError.
            if len(parts) < 4:
                continue
            # Find the index where "Protagonist", "Antagonist", or "Innocent" appears
            for i, part in enumerate(parts):
                if part in roles:
                    main_info = parts[:i + 1]  # Everything up to and including the found role
                    grouped_info = parts[i + 1:]  # Everything after the role
                    annotations.append(main_info + [grouped_info])  # Grouped as a subarray
                    # Record the offsets only for lines that yield an annotation,
                    # keeping the two lists index-aligned.
                    position.append([parts[2], parts[3]])
                    break
    return annotations, position

def load_documents(raw_documents_folder):
    """Read every regular file in a folder into memory.

    Args:
        raw_documents_folder: Directory containing the raw text documents.

    Returns:
        Dict mapping each file name (not the full path) to its UTF-8 decoded
        content. Sub-directories (for example Jupyter's ``.ipynb_checkpoints``)
        are skipped instead of crashing ``open``.
    """
    documents = {}
    # sorted() makes the load order deterministic across platforms.
    for filename in sorted(os.listdir(raw_documents_folder)):
        file_path = os.path.join(raw_documents_folder, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[filename] = file.read()
    return documents

def process_documents(annotations_file, raw_documents_folder):
    """Join each annotation with the text of the document it refers to.

    Args:
        annotations_file: Path passed to ``load_annotations``.
        raw_documents_folder: Directory passed to ``load_documents``.

    Returns:
        A tuple ``(result, positions)`` of equal length:
        * ``result[i]`` is ``[document_text] + annotation``.
        * ``positions[i]`` is the offset pair belonging to that same annotation.

        Annotations whose file name (``annotation[0]``) is not among the loaded
        documents are dropped together with their offsets, so both lists stay
        index-aligned. Annotations and offsets are paired positionally, i.e.
        ``load_annotations`` is expected to return one offset pair per annotation.
    """
    annotations, position_array = load_annotations(annotations_file)
    documents = load_documents(raw_documents_folder)

    result = []
    kept_positions = []

    for annotation, position in zip(annotations, position_array):
        filename = annotation[0]
        if filename in documents:
            result.append([documents[filename]] + annotation)
            # Keep the offsets only for annotations that survive the filter.
            kept_positions.append(position)

    return result, kept_positions


# Input locations, given relative to the notebook's working directory.
annotations_file = 'subtask-1-annotations.txt'
raw_documents_folder = 'raw-documents'

# Build the document+annotation rows and the matching entity offsets.
result_array, position_array = process_documents(
    annotations_file=annotations_file,
    raw_documents_folder=raw_documents_folder,
)

In [None]:
pip install transformers datasets scikit-learn torch evaluate

In [3]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the push_to_hub calls further down need an authenticated session.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch
# Switch off Weights & Biases reporting for the Trainer runs below.
os.environ["WANDB_DISABLED"] = "true"


# Map roles to integers
label_mapping = {"Protagonist": 0, "Antagonist": 1, "Innocent": 2}

# Each result_array entry is [document_text, <fields up to the role>, [fields after the role]],
# so entry[0] is the text and entry[-2] is the role string.
processed_data = [
    {"text": entry[0], "label": label_mapping[entry[-2]]} for entry in result_array
]

# Show a compact summary instead of printing every full document text.
print(f"{len(processed_data)} examples")
print({
    role: sum(1 for item in processed_data if item["label"] == index)
    for role, index in label_mapping.items()
})



In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import evaluate
from transformers import AutoTokenizer

# Load the cased BERT tokenizer used to encode every document in this cell.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

# Register the entity markers as additional special tokens of the tokenizer.
special_tokens = {"additional_special_tokens": ["[TARGET]", "[/TARGET]"]}
tokenizer.add_special_tokens(special_tokens)

# NOTE(review): the two markers enlarge the tokenizer vocabulary, but the
# resize_token_embeddings call below is commented out — confirm that the model
# loaded later in this cell accounts for the extra tokens.
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
#model.resize_token_embeddings(len(tokenizer))

def mark_target_word(text, position_range, start_token="[TARGET]", end_token="[/TARGET]", max_tokens=512, tok=None):
    """Wrap the target span of ``text`` in marker tokens and window long inputs around it.

    Args:
        text: Full document text.
        position_range: Two-item sequence ``(start_char, end_char)``; the items may
            be ints or numeric strings and are used as Python slice bounds.
        start_token: Marker inserted directly before the span.
        end_token: Marker inserted directly after the span.
        max_tokens: Maximum number of tokens (markers included) to keep when the
            marked text tokenizes to more than this.
        tok: Optional tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_string``. Defaults to the module-level ``tokenizer``.

    Returns:
        The marked text unchanged when it fits into ``max_tokens`` tokens.
        Otherwise the string rebuilt from a window of ``max_tokens`` tokens that
        contains the marked span (the span itself is cut if it alone is longer).
    """
    active_tokenizer = tok if tok is not None else tokenizer

    start_char, end_char = position_range
    start_char, end_char = int(start_char), int(end_char)
    # NOTE(review): end_char is used as an exclusive slice bound — confirm this
    # matches the offset convention of the annotation file.
    marked_text = text[:start_char] + start_token + text[start_char:end_char] + end_token + text[end_char:]

    tokens = active_tokenizer.tokenize(marked_text)
    if len(tokens) <= max_tokens:
        return marked_text

    try:
        # Locate the span via the markers inside the token list that is actually
        # sliced; counting tokens of the unmarked prefix ignores the marker
        # tokens and is therefore shifted.
        target_start_index = tokens.index(start_token)
        target_end_index = tokens.index(end_token, target_start_index) + 1
    except ValueError:
        # Markers were not kept as single tokens: fall back to an estimate based
        # on the token counts of the unmarked prefix and span.
        target_start_index = min(len(active_tokenizer.tokenize(text[:start_char])), len(tokens))
        span_estimate = len(active_tokenizer.tokenize(text[start_char:end_char]))
        target_end_index = min(target_start_index + span_estimate, len(tokens))

    span_length = target_end_index - target_start_index
    if span_length >= max_tokens:
        # The span alone fills the budget; keep its beginning.
        window = tokens[target_start_index:target_start_index + max_tokens]
    else:
        left_context = (max_tokens - span_length) // 2
        start_index = max(0, target_start_index - left_context)
        end_index = min(len(tokens), start_index + max_tokens)
        # If the window hit the end of the document, spend the unused budget on
        # the left side instead of returning a shorter window.
        start_index = max(0, end_index - max_tokens)
        window = tokens[start_index:end_index]

    return active_tokenizer.convert_tokens_to_string(window)
def tokenize_function(text, max_length=128):
    """Encode ``text`` with the module-level ``tokenizer``.

    Args:
        text: String to encode.
        max_length: Fixed output length; shorter inputs are padded and longer
            ones truncated to exactly this many ids. Defaults to 128.

    Returns:
        The tokenizer's encoding (the callers read ``input_ids`` and
        ``attention_mask`` from it).
    """
    return tokenizer(text, padding="max_length", truncation=True, max_length=max_length)

# tokenize_function keeps 128 ids per example. Two of those are presumably the
# [CLS]/[SEP] ids the BERT tokenizer adds, so the marked text is windowed to the
# remaining budget. With the default of 512 a target located after the first
# ~128 tokens would be truncated away before the model ever sees it.
MAX_CONTENT_TOKENS = 128 - 2

# Offsets are looked up by index, so a length mismatch would silently mark the
# wrong spans; fail loudly instead.
if len(processed_data) != len(position_array):
    raise ValueError(
        f"processed_data ({len(processed_data)}) and position_array "
        f"({len(position_array)}) are not aligned"
    )

tokenized_data = []

for example, position_range in zip(processed_data, position_array):
    marked_text = mark_target_word(example['text'], position_range, max_tokens=MAX_CONTENT_TOKENS)
    tokenized_text = tokenize_function(marked_text)
    tokenized_data.append({
        "input_ids": tokenized_text["input_ids"],
        "attention_mask": tokenized_text["attention_mask"],
        "label": example['label'],
    })

class TokenizedDataset(Dataset):
    """Map-style dataset over a list of already tokenized examples.

    Every example is a dict with the keys "input_ids", "attention_mask" and
    "label"; items are returned as long tensors with the label exposed under
    the key "labels".
    """

    # (key in the returned item, key in the stored example)
    _FIELDS = (
        ("input_ids", "input_ids"),
        ("attention_mask", "attention_mask"),
        ("labels", "label"),
    )

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        return {
            out_key: torch.tensor(example[in_key], dtype=torch.long)
            for out_key, in_key in self._FIELDS
        }

# Hold out 20% of the examples for validation; random_state pins the shuffle.
data_train, data_val = train_test_split(tokenized_data, test_size=0.2, random_state=42)

train_dataset = TokenizedDataset(data_train)
val_dataset = TokenizedDataset(data_val)

# Candidate checkpoints considered for this task (not used by the code below).
models = {"bert-large-uncased", "roberta-base", "distilbert-base-uncased", "microsoft/deberta-v3-base", "google/electra-base-discriminator"}

# Load the checkpoint the tokenizer came from. The previous call put a
# DistilBERT checkpoint into the BERT class (different architecture, so the
# encoder weights cannot be loaded) and paired an uncased model with the cased
# tokenizer's ids.
model = AutoModelForSequenceClassification.from_pretrained(tokenizer.name_or_path, num_labels=3)
# The tokenizer was extended with marker tokens; grow the embedding matrix so
# their ids have rows to look up.
model.resize_token_embeddings(len(tokenizer))

# Evaluate and checkpoint once per epoch so the best checkpoint can be restored
# at the end of training.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # Explicitly disable logging integrations; the WANDB_DISABLED environment
    # variable used earlier in this notebook is reported as deprecated.
    report_to="none",
)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn the Trainer's (logits, labels) pair into the accuracy metric dict."""
    logits, labels = eval_pred
    # Highest-scoring class along axis 1 is the prediction.
    predicted_classes = torch.tensor(logits).argmax(dim=1)
    return metric.compute(references=labels, predictions=predicted_classes)

# Wire the model, training arguments, datasets and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the model according to training_args.
trainer.train()

# Evaluate on val_dataset and report the accuracy returned by compute_metrics.
results = trainer.evaluate()
print(f"Accuracy: {results['eval_accuracy']:.4f}")


In [None]:

# Write model and tokenizer into the same directory so they can be reloaded together.
model.save_pretrained("./text_classification_model")
tokenizer.save_pretrained("./text_classification_model")

In [None]:
# ----------------------------------- OLD CODE ----------------------------------------------------------------

In [None]:
from huggingface_hub import HfApi, HfFolder
from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo_name = "SemEval2025FinalModel"

# Keep a local copy next to the upload.
model.save_pretrained("./text_classification_model")
tokenizer.save_pretrained("./text_classification_model")

# Upload model and tokenizer to the same Hub repository.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

# A bare repository name is created under the logged-in account, so the public
# URL contains that account name as a namespace.
hub_user = HfApi().whoami()["name"]
print(f"Model successfully pushed to: https://huggingface.co/{hub_user}/{repo_name}")


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model successfully pushed to: https://huggingface.co/SemEval2025FinalModel


In [None]:
from transformers import AutoTokenizer

# Initialize tokenizer
# NOTE(review): this reload does not register "[TARGET]" / "[/TARGET]" as
# special tokens, although mark_target_word below still inserts them — confirm
# that this is intended.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def mark_target_word(text, position_range, start_token="[TARGET]", end_token="[/TARGET]"):
    """Return ``text`` with the span given by ``position_range`` wrapped in markers.

    ``position_range`` holds two values (ints or numeric strings) that are used
    as the start and end slice bounds of the span.
    """
    raw_start, raw_end = position_range
    begin, finish = int(raw_start), int(raw_end)
    pieces = (text[:begin], start_token, text[begin:finish], end_token, text[finish:])
    return "".join(pieces)

def tokenize_function(text):
    """Encode ``text`` with the module-level tokenizer, padded/truncated to 128 ids."""
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(text, **encoding_options)

tokenized_data = []

# Walk the examples with their index so the matching offsets can be looked up.
for x, example in enumerate(processed_data):
    text, label = example['text'], example['label']
    position_range = position_array[x]

    # Mark the target phrase first, then encode the marked string.
    marked_text = mark_target_word(text, position_range)
    tokenized_text = tokenize_function(marked_text)

    # Store the encoding together with its label.
    tokenized_data.append((tokenized_text, label))

# Print tokenized data to verify
for encoding, gold_label in tokenized_data:
    print("Tokenized Input IDs:", encoding['input_ids'])
    print("Label:", gold_label)


In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of tokenized_data for validation; random_state pins the shuffle.
train_data, val_data = train_test_split(tokenized_data, test_size=0.2, random_state=42)

In [None]:
# Report the size of each split, one number per line.
print(len(train_data), len(val_data), sep="\n")

548
138


In [None]:
from transformers import AutoModelForSequenceClassification

# Load cased BERT with a 3-class head; per the recorded output the classifier weights start newly initialized.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Only the output directory is set here; all other training options are left unspecified.
training_args = TrainingArguments(output_dir="test_trainer")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair handed over by the Trainer."""
    logits, labels = eval_pred
    # The index of the largest logit on the last axis is the predicted class.
    return metric.compute(references=labels, predictions=np.argmax(logits, axis=-1))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(text):
    """Encode ``text`` to exactly 128 ids (padded or truncated as needed)."""
    return tokenizer(text, max_length=128, truncation=True, padding="max_length")


# Encode the plain (unmarked) text of every example and keep it with its label.
tokenized_data = []

for example in processed_data:
    tokenized_data.append((tokenize_function(example['text']), example['label']))


In [None]:
from transformers import DataCollatorWithPadding

# The model and Trainer in this notebook are PyTorch objects, so the collator
# must emit PyTorch tensors ("pt"); "tf" would produce TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): the recorded output of this cell is
# "NameError: name 'TrainingArguments' is not defined", i.e. it was executed
# without the transformers import in scope.
# Evaluate and save once per epoch, restore the best checkpoint at the end and
# push the result to the Hub.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): train_data / val_data appear to be lists of (encoding, label)
# tuples produced by train_test_split — verify the Trainer can consume them.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

NameError: name 'TrainingArguments' is not defined

In [None]:
import evaluate

# Load the accuracy metric; the evaluation loop below accumulates batches into it.
metric = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import torch

# Manual evaluation loop: accumulate predictions batch by batch, then compute accuracy.
# NOTE(review): eval_loader and device are not defined in any cell visible here —
# confirm where they come from.
model.eval()
for batch in eval_loader:
    # Move every tensor of the batch to the target device.
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # Predicted class = index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate all accumulated batches into the final score.
eval_metric = metric.compute()
print(f"Accuracy: {eval_metric['accuracy']:.4f}")


Accuracy: 0.5091
