In [None]:
#import libraries

import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from seqeval.metrics import classification_report, f1_score


ModuleNotFoundError: No module named 'torch'

In [None]:
# load dataset 
def read_conll(filepath):
    """Parse a two-column CoNLL file into one row per sentence.

    Every non-blank line is expected to contain exactly two
    whitespace-separated fields, ``token tag``. A blank line closes the
    current sentence. Lines with any other number of fields are skipped.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL file.

    Returns:
        pandas.DataFrame with columns ``tokens`` and ``ner_tags``; each cell
        holds a list of strings, and the two lists of a row are aligned.
    """
    sentences, labels = [], []
    tokens, tags = [], []

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line: sentence boundary.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
            elif len(fields) == 2:
                token, tag = fields
                tokens.append(token)
                tags.append(tag)

    # A file that does not end with a blank line still has a pending
    # sentence here; without this flush the last sentence would be lost.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return pd.DataFrame({'tokens': sentences, 'ner_tags': labels})

# Location of the CoNLL training file. Defaults to the original hard-coded
# path; set the NER_CONLL_PATH environment variable to point somewhere else
# without editing the notebook.
conll_path = os.environ.get("NER_CONLL_PATH", "/data/ner_labels.conll")
df = read_conll(conll_path)
df.head()


In [2]:
# Collect every distinct tag that occurs in the data and sort the result so
# the tag -> id assignment is the same on every run.
unique_tags = sorted(set(tag for doc in df["ner_tags"] for tag in doc))

# Forward and reverse lookups; ids are the positions in the sorted tag list.
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = dict(enumerate(unique_tags))
print(tag2id)


NameError: name 'df' is not defined

In [None]:
# Load tokenizer and model
model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
    # The checkpoint is an NER model that already carries a classification
    # head for its own label set. If this dataset has a different number of
    # tags, loading would fail on the head's shape; this flag re-initialises
    # the mismatched head weights instead so they can be fine-tuned.
    ignore_mismatched_sizes=True,
)


In [None]:
# Tokenize and align labels

def tokenize_and_align_labels(example, label_all_tokens=False):
    """Tokenize one pre-split sentence and build per-sub-token label ids.

    The tokenizer may split a word into several sub-tokens, so the word-level
    tags in ``example['ner_tags']`` are mapped onto sub-tokens via
    ``word_ids()``.

    Args:
        example: Mapping with ``tokens`` (list of words) and ``ner_tags``
            (list of tag strings aligned with ``tokens``).
        label_all_tokens: When False (default) only the first sub-token of
            each word carries the word's tag and the remaining sub-tokens get
            -100. When True every sub-token repeats the word's tag, which was
            the previous behaviour of this function.

    Returns:
        The tokenizer output with an added ``labels`` list, one entry per
        sub-token. Positions that do not belong to a word are set to -100.

    Raises:
        KeyError: If a tag in ``example['ner_tags']`` is missing from
            ``tag2id``.
    """
    tokenized_inputs = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    word_tags = example['ner_tags']

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens have no source word.
            labels.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            labels.append(tag2id[word_tags[word_idx]])
        else:
            # Continuation piece of the same word: repeating the tag here
            # (e.g. a second "B-..." label) would make one entity look like
            # several when scoring, so it is masked out with -100 instead.
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Wrap the DataFrame in a datasets.Dataset, then run the tokenizer/label
# alignment over it (no batched=True is passed, so examples are handled
# one at a time).
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(function=tokenize_and_align_labels)


In [None]:
# Split the tokenized dataset: 20% held out as "test", seed fixed at 42.
# NOTE(review): this statement is repeated verbatim in the following cell;
# one of the two cells looks redundant.
dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
# Hold out 20% of the tokenized examples as the "test" split; the fixed
# seed keeps the split identical between runs.

dataset = tokenized_dataset.train_test_split(
    seed=42,
    test_size=0.2,
)


In [None]:
# Define Training Arguments

# Settings that are spelled the same way in every transformers release.
_common_training_kwargs = dict(
    output_dir="./ner_model",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. An unknown keyword raises
# TypeError when the arguments are bound, so try the current name first and
# fall back to the legacy one for older installations.
try:
    args = TrainingArguments(eval_strategy="epoch", **_common_training_kwargs)
except TypeError:
    args = TrainingArguments(evaluation_strategy="epoch", **_common_training_kwargs)


In [None]:
# Build the Trainer and fine-tune the model
# The token-classification collator is given the same tokenizer that
# produced the inputs.
# NOTE(review): newer transformers releases may prefer `processing_class=`
# over `tokenizer=` on Trainer; confirm against the installed version.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()


In [None]:
#Evaluate Model

predictions, labels, _ = trainer.predict(dataset["test"])
# Highest-scoring tag id per position. `.tolist()` turns the result into
# plain Python ints: a 0-dim torch tensor hashes by identity, so using one
# directly as a key into `id2tag` raises KeyError.
preds = torch.argmax(torch.tensor(predictions), dim=-1).tolist()

true_labels, true_preds = [], []

for label_row, pred_row in zip(labels, preds):
    # Keep only positions with a real label (-100 marks ignored positions),
    # pairing each gold id with the prediction at the same position.
    kept = [(int(l), p) for l, p in zip(label_row, pred_row) if l != -100]
    true_labels.append([id2tag[l] for l, _p in kept])
    true_preds.append([id2tag[p] for _l, p in kept])

print(classification_report(true_labels, true_preds))
print("F1 Score:", f1_score(true_labels, true_preds))


# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with from_pretrained("ner_model").
model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")
