In [7]:
# pip install transformers
# pip install torch

In [8]:
# !pip install --upgrade accelerate -U



In [9]:
# !pip install --upgrade transformers[torch]



In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset

model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

label_list = ["O", "B-MISC", "I-MISC", "B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, max_len, label2id):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        tags = self.tags[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        labels = [-100] * self.max_len  # Initialize labels with -100 (ignore token)
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        word_ids = encoding.word_ids()  # Get the word IDs to align the labels

        previous_word_idx = None
        for idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            elif word_idx != previous_word_idx:
                labels[idx] = self.label2id[tags[word_idx]]
            previous_word_idx = word_idx

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [12]:
texts = [
    "We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.",
    "The ideal candidate should be proficient in SQL and have knowledge of NLP."
]
tags = [
    ["O", "O", "O", "O", "O", "B-MISC", "I-MISC", "O", "O", "O", "B-MISC", "O", "B-MISC", "I-MISC", "O", "O", "B-MISC", "I-MISC", "O"],  # Example tags
    ["O", "O", "O", "O", "O", "O", "O", "B-MISC", "O", "O", "O", "O", "B-MISC", "O"]  # Example tags
]

# Create dataset
dataset = NERDataset(texts, tags, tokenizer, max_len=32, label2id=label2id)

model.config.id2label = id2label
model.config.label2id = label2id

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Increase the number of epochs
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,  # Save model every 500 steps
    eval_strategy="steps",  # Evaluate during training
    eval_steps=500  # Evaluate every 500 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=10, training_loss=1.0571948051452638, metrics={'train_runtime': 261.1745, 'train_samples_per_second': 0.077, 'train_steps_per_second': 0.038, 'total_flos': 1160911292160.0, 'train_loss': 1.0571948051452638, 'epoch': 10.0})

In [13]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [model.config.id2label[label_id.item()] for label_id in predictions[0]]
    return list(zip(tokens, predicted_labels))

# Test the model
test_text = "We need someone skilled in Java and Python"
predictions = predict(test_text, model, tokenizer)
print(predictions)


[('[CLS]', 'O'), ('We', 'O'), ('need', 'O'), ('someone', 'O'), ('skilled', 'O'), ('in', 'O'), ('Java', 'I-MISC'), ('and', 'O'), ('Python', 'I-MISC'), ('[SEP]', 'O')]
