In [1]:
# Load the CoNLL-2003 corpus (train / validation / test splits).
from datasets import load_dataset

dataset = load_dataset(
    'conll2003',
    trust_remote_code=True,
)

In [2]:
# Inspect the dataset structure: the available splits, their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [3]:
# Inspect the first training sample: the word tokens plus the integer-encoded
# POS, chunk and NER tags that accompany them.
print(dataset['train'][0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [4]:
# Tokenization: load the tokenizer that belongs to the
# 'distilbert-base-uncased' checkpoint used for the model below.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_and_label_align(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: a batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of tag ids per sentence,
            indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence holding the word's tag on the first sub-token of each
        word and -100 everywhere else (positions without a word id, and the
        remaining sub-tokens of a word).
    """
    # Sentences are already split into words; pad/truncate every one to 128.
    encoded = tokenizer(
        examples["tokens"],
        padding="max_length",
        truncation=True,
        max_length=128,
        is_split_into_words=True,
    )

    label_rows = []
    for row in range(len(examples["tokens"])):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # A position gets a real tag only when it opens a new word;
            # positions with no word id and continuation pieces get -100.
            opens_word = word is not None and word != last_word
            row_labels.append(
                examples["ner_tags"][row][word] if opens_word else -100
            )
            last_word = word
        label_rows.append(row_labels)

    encoded["labels"] = label_rows
    return encoded

In [5]:
# Apply the tokenize-and-align function to every split, one batch at a time.
tokenized_dataset = dataset.map(
    tokenize_and_label_align,
    batched=True,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [6]:
# Read the NER tag names from the training split's schema and count them.
ner_tag_feature = dataset["train"].features["ner_tags"]
label_list = ner_tag_feature.feature.names
num_labels = len(label_list)

In [7]:
# Load the pretrained DistilBERT encoder with a token-classification head
# sized for the dataset's NER tag set.
from transformers import AutoModelForTokenClassification

# Record the tag names in the model config so that the checkpoint saved later
# reports the dataset's tag names rather than generic "LABEL_n" placeholders.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Data collator that pads and batches the tokenized inputs and their labels.
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [9]:
# Hyper-parameters and checkpointing policy for the fine-tuning run.
from transformers import TrainingArguments

training_args = TrainingArguments(
    # output locations
    output_dir='./ner_distilbert',
    logging_dir='./ner_distilbert_logs',
    # optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # checkpoint at the end of every epoch, capped at two checkpoints on disk
    save_strategy='epoch',
    save_total_limit=2,
)

In [11]:
pip install --upgrade accelerate

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Build the Trainer from the model, training arguments, data splits and collator.
from transformers import Trainer

# NOTE(review): training_args sets no evaluation strategy, so eval_dataset is
# presumably only used on an explicit trainer.evaluate() call — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

  trainer = Trainer(


In [None]:
# Fine-tune the pretrained model on the NER task.
trainer.train()



Step,Training Loss


In [None]:
# Save the model's weights and config to ./ner_distilbert_model.
model.save_pretrained('./ner_distilbert_model')

In [None]:
# Save the tokenizer files and settings next to the model so the directory can
# be reloaded as a complete checkpoint. The original cell called
# model.save_pretrained a second time and never wrote the tokenizer.
tokenizer.save_pretrained('./ner_distilbert_model')