In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
import spacy
from spacy.training import Example
from spacy.training.iob_utils import offsets_to_biluo_tags
import random

# Load spaCy model to use for tokenization
tokenizer_nlp = spacy.blank("en")

# Function to get token offsets
def get_token_offsets(text):
    doc = tokenizer_nlp(text)
    tokens = [token.text for token in doc]
    offsets = [(token.idx, token.idx + len(token.text)) for token in doc]
    return tokens, offsets

# Corrected detailed training data
DETAILED_TRAIN_DATA = [
    ("We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.",
     {"entities": [(47, 53, "SKILL"), (59, 76, "SKILL"), (81, 94, "SKILL")]}),
    ("The ideal candidate should be proficient in SQL and have knowledge of NLP.",
     {"entities": [(41, 44, "SKILL"), (63, 66, "SKILL")]}),
    ("Experience with Java and Project Management is required.",
     {"entities": [(17, 21, "SKILL"), (26, 43, "SKILL")]}),
    ("We need expertise in Python, Java, and Machine Learning.",
     {"entities": [(18, 24, "SKILL"), (26, 30, "SKILL"), (36, 53, "SKILL")]}),
    ("Proficiency in data visualization tools like Tableau and Power BI is necessary.",
     {"entities": [(40, 47, "SKILL"), (52, 60, "SKILL")]}),
    ("Strong understanding of statistical analysis and modeling with R is preferred.",
     {"entities": [(43, 62, "SKILL"), (68, 69, "SKILL")]}),
    ("Knowledge of cloud platforms such as AWS and Azure is a plus.",
     {"entities": [(27, 30, "SKILL"), (35, 40, "SKILL")]}),
    ("Familiarity with database management systems including MySQL and PostgreSQL is advantageous.",
     {"entities": [(38, 43, "SKILL"), (48, 58, "SKILL")]}),
]

# Tokenize and adjust offsets
for text, annotations in DETAILED_TRAIN_DATA:
    tokens, offsets = get_token_offsets(text)
    print(f"Text: {text}")
    print(f"Tokens: {tokens}")
    print(f"Offsets: {offsets}")
    for start, end, label in annotations["entities"]:
        entity_text = text[start:end]
        # Verify and adjust entity offsets if needed
        token_start = None
        token_end = None
        for idx, (token_start_idx, token_end_idx) in enumerate(offsets):
            if token_start_idx <= start < token_end_idx:
                token_start = token_start_idx
            if token_start_idx < end <= token_end_idx:
                token_end = token_end_idx
                break
        if token_start is not None and token_end is not None:
            print(f"Entity: {entity_text} ({label}) -> Adjusted: {text[token_start:token_end]} ({token_start}, {token_end})")
        else:
            print(f"Entity: {entity_text} ({label}) -> Could not adjust")

# Adjusted DETAILED_TRAIN_DATA (use the verified and corrected offsets from above)
ADJUSTED_DETAILED_TRAIN_DATA = [
    ("We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.",
     {"entities": [(47, 53, "SKILL"), (59, 76, "SKILL"), (81, 94, "SKILL")]}),
    ("The ideal candidate should be proficient in SQL and have knowledge of NLP.",
     {"entities": [(41, 44, "SKILL"), (63, 66, "SKILL")]}),
    ("Experience with Java and Project Management is required.",
     {"entities": [(17, 21, "SKILL"), (26, 43, "SKILL")]}),
    ("We need expertise in Python, Java, and Machine Learning.",
     {"entities": [(18, 24, "SKILL"), (26, 30, "SKILL"), (36, 53, "SKILL")]}),
    ("Proficiency in data visualization tools like Tableau and Power BI is necessary.",
     {"entities": [(40, 47, "SKILL"), (52, 60, "SKILL")]}),
    ("Strong understanding of statistical analysis and modeling with R is preferred.",
     {"entities": [(43, 62, "SKILL"), (68, 69, "SKILL")]}),
    ("Knowledge of cloud platforms such as AWS and Azure is a plus.",
     {"entities": [(27, 30, "SKILL"), (35, 40, "SKILL")]}),
    ("Familiarity with database management systems including MySQL and PostgreSQL is advantageous.",
     {"entities": [(38, 43, "SKILL"), (48, 58, "SKILL")]}),
]

# Load the blank spaCy model
nlp = spacy.blank("en")

# Create a new NER pipeline and add it to the model
ner = nlp.add_pipe("ner")

# Add labels to the NER pipeline
for _, annotations in ADJUSTED_DETAILED_TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Check alignment
for text, annotations in ADJUSTED_DETAILED_TRAIN_DATA:
    doc = nlp.make_doc(text)
    tags = offsets_to_biluo_tags(doc, annotations["entities"])
    print(f"Text: {text}")
    print(f"Tokens: {[token.text for token in doc]}")
    print(f"Tags: {tags}")

# Training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(50):
        losses = {}
        random.shuffle(ADJUSTED_DETAILED_TRAIN_DATA)
        for text, annotations in ADJUSTED_DETAILED_TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.5, losses=losses)
        print(f"Losses at iteration {itn}: {losses}")

# Save the trained model
nlp.to_disk("adjusted_detailed_ner_model")

# Load the custom model
detailed_nlp = spacy.load("adjusted_detailed_ner_model")

# Test the custom model
test_text = "We need someone skilled in Java and Project Management."
doc = detailed_nlp(test_text)
print(f"Test Text: {test_text}")
print("Entities in detailed custom model:", [(ent.text, ent.label_) for ent in doc.ents])


Text: We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.
Tokens: ['We', 'are', 'looking', 'for', 'a', 'Data', 'Scientist', 'with', 'experience', 'in', 'Python', ',', 'Machine', 'Learning', ',', 'and', 'Data', 'Analysis', '.']
Offsets: [(0, 2), (3, 6), (7, 14), (15, 18), (19, 20), (21, 25), (26, 35), (36, 40), (41, 51), (52, 54), (55, 61), (61, 62), (63, 70), (71, 79), (79, 80), (81, 84), (85, 89), (90, 98), (98, 99)]
Entity: ence i (SKILL) -> Adjusted: experience in (41, 54)
Entity: on, Machine Learn (SKILL) -> Adjusted: Python, Machine Learning (55, 79)
Entity: and Data Anal (SKILL) -> Adjusted: and Data Analysis (81, 98)
Text: The ideal candidate should be proficient in SQL and have knowledge of NLP.
Tokens: ['The', 'ideal', 'candidate', 'should', 'be', 'proficient', 'in', 'SQL', 'and', 'have', 'knowledge', 'of', 'NLP', '.']
Offsets: [(0, 3), (4, 9), (10, 19), (20, 26), (27, 29), (30, 40), (41, 43), (44, 47), (48, 51), (52, 56), (57, 6



Text: We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.
Tokens: ['We', 'are', 'looking', 'for', 'a', 'Data', 'Scientist', 'with', 'experience', 'in', 'Python', ',', 'Machine', 'Learning', ',', 'and', 'Data', 'Analysis', '.']
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-', '-', '-', '-', '-', '-', 'O', '-', '-', '-', 'O']
Text: The ideal candidate should be proficient in SQL and have knowledge of NLP.
Tokens: ['The', 'ideal', 'candidate', 'should', 'be', 'proficient', 'in', 'SQL', 'and', 'have', 'knowledge', 'of', 'NLP', '.']
Tags: ['O', 'O', 'O', 'O', 'O', 'O', '-', 'O', 'O', 'O', '-', 'O', 'O', 'O']
Text: Experience with Java and Project Management is required.
Tokens: ['Experience', 'with', 'Java', 'and', 'Project', 'Management', 'is', 'required', '.']
Tags: ['O', 'O', '-', 'O', '-', '-', 'O', 'O', 'O']
Text: We need expertise in Python, Java, and Machine Learning.
Tokens: ['We', 'need', 'expertise', 'in', 'Python', ',', 'Java', '

In [3]:
pip install transformers



In [4]:
pip install torch



In [5]:
pip install accelerate -U



In [6]:
pip install transformers[torch]



In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset

model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly iden

In [8]:
label_list = ["O", "B-MISC", "I-MISC", "B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [9]:
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, max_len, label2id):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        tags = self.tags[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        labels = [-100] * self.max_len  # Initialize labels with -100 (ignore token)
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        token_index = 0
        for idx, word in enumerate(text.split()):
            while token_index < len(tokens) and tokens[token_index].startswith('##'):
                token_index += 1
            if token_index < len(tokens):
                labels[token_index] = self.label2id[tags[idx]]
                token_index += 1

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Example data
texts = [
    "We are looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis.",
    "The ideal candidate should be proficient in SQL and have knowledge of NLP."
]
tags = [
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-MISC", "O", "B-MISC", "I-MISC", "O", "O", "B-MISC", "I-MISC", "O"],  # Example tags
    ["O", "O", "O", "O", "O", "O", "O", "B-MISC", "O", "O", "O", "O", "B-MISC", "O"]  # Example tags
]

# Create dataset
dataset = NERDataset(texts, tags, tokenizer, max_len=32, label2id=label2id)


In [10]:
model.config.id2label = id2label
model.config.label2id = label2id

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=1.851509730021159, metrics={'train_runtime': 40.9651, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.073, 'total_flos': 348273387648.0, 'train_loss': 1.851509730021159, 'epoch': 3.0})

In [11]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [model.config.id2label[label_id.item()] for label_id in predictions[0]]
    return list(zip(tokens, predicted_labels))

# Test the model
test_text = "We need someone skilled in Java and Project Management."
predictions = predict(test_text, model, tokenizer)
print(predictions)


[('[CLS]', 'O'), ('We', 'O'), ('need', 'O'), ('someone', 'O'), ('skilled', 'O'), ('in', 'O'), ('Java', 'I-MISC'), ('and', 'O'), ('Project', 'O'), ('Management', 'O'), ('.', 'O'), ('[SEP]', 'O')]
