In [1]:
import spacy
import pandas as pd
import json
import torch
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset

# Read the CSV file containing annotated data with labels
df = pd.read_csv("annotated_animals_with_labels_fix.csv")

# Function to convert entity string into a list of entities
def parse_entities(entity_str):
    try:
        # Replace single quotes with double quotes and parse the string as JSON
        return json.loads(entity_str.replace("'", '"'))
    except json.JSONDecodeError:
        # If there's an error during JSON parsing, return an empty list
        return []

# Apply the entity parsing function to the "entities" column in the dataframe
df["entities"] = df["entities"].apply(parse_entities)

# Load the pre-trained tokenizer for BERT model
model_name = "bert-base-cased"  # Using BERT base model (cased)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function for tokenizing text and aligning labels to the tokenized output
def tokenize_and_align_labels(examples):
    # Tokenize the text with padding and truncation to ensure the sequence length is fixed (max length of 512 tokens)
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # Initialize an empty list to hold the labels for each example
    labels = []
    
    # Iterate over the text and associated entities in the dataset
    for text, entity_list in zip(examples["text"], examples["entities"]):
        # Tokenize the text to get tokenized versions of the words
        tokenized_text = tokenizer.tokenize(text)

        # Start with a list of 'O' labels for every token in the text (indicating no entity)
        word_labels = ['O'] * len(tokenized_text)

        # Align entity labels with the tokenized words
        for word, label in entity_list:
            # Tokenize the word and find the matching tokens in the text
            tokenized_word = tokenizer.tokenize(word)
            try:
                # Find the start and end index of the word in the tokenized text
                word_start_idx = tokenized_text.index(tokenized_word[0])
                word_end_idx = word_start_idx + len(tokenized_word) - 1

                # Assign the entity label to all tokens corresponding to this word
                for idx in range(word_start_idx, word_end_idx + 1):
                    word_labels[idx] = label
            except ValueError:
                # If the word is not found in the tokenized text, skip it
                continue

        # Ensure the label list is truncated or padded to 512 tokens (BERT's max sequence length)
        word_labels = word_labels[:512]  # Trim if it's too long
        labels.append(word_labels + ['O'] * (512 - len(word_labels)))  # Pad with 'O' if it's too short

    # Add the label list to the tokenized inputs dictionary
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert the dataframe to a Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Apply the tokenization and label alignment function across the entire dataset
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Mapping label names to numerical values for model training (e.g., 'O' -> 0, 'ANIMAL' -> 1)
label_map = {'O': 0, 'ANIMAL': 1}

# Function to apply the label mapping
def apply_label_map(examples):
    return {'labels': [[label_map[label] for label in label_list] for label_list in examples['labels']]}

# Apply the label mapping function to the dataset
dataset = dataset.map(apply_label_map, batched=True)

# Load the pre-trained BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_map)  # Number of labels (O and ANIMAL in this case)
)

# Define training arguments for the Trainer class
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the results
    evaluation_strategy="epoch",  # evaluation  data
    save_strategy="epoch",  # Save the model at the end of each epoch
    num_train_epochs=3,  # Number of epochs for training
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    logging_dir="./logs",  # Directory for storing logs
)

# Custom callback to log the end of each epoch
class EpochLoggingCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} is complete!")


dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Initialize the Trainer with the model, training arguments, and dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Используем тренировочный датасет
    eval_dataset=eval_dataset,  # Добавляем валидационный датасет
    callbacks=[EpochLoggingCallback]  # Добавляем callback
)

# Start training the model
trainer.train()

# Save the trained model and tokenizer to disk
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# Print a message indicating the training is complete
print("🎯 Training completed! Model saved to './trained_model'")





Map:   0%|          | 0/12236 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1426 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/12236 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 