# Downloading Modules

In [171]:
!pip install transformers



In [172]:
!pip install datasets



In [173]:
!pip install seqeval



In [174]:
!pip install transformers[torch]



# Importing Dataset

In [175]:
import datasets
from datasets import load_dataset
# Load the "conll2003" dataset; as the column listing below shows, it comes with
# train / validation / test splits that share the same five columns.
dataset = load_dataset("conll2003")

In [176]:
dataset.column_names

{'train': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 'validation': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 'test': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']}

In [177]:
dataset.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [178]:
# Bind each split to its own name for convenient access in the cells below.
train_dataset, validation_dataset, test_dataset = (
    dataset[split] for split in ('train', 'validation', 'test')
)

In [179]:
train_dataset[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [180]:
# Feature descriptor of the NER tag column; its repr (shown below) lists the
# class names in label-id order.
ner_feature = dataset["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [181]:
# Pull the word lists and the matching tag lists of the training split, then
# show the first five examples side by side.
sentences, ner_labels = train_dataset['tokens'], train_dataset['ner_tags']

for row in range(5):
    words, tags = sentences[row], ner_labels[row]
    print("Tokens:", words)
    print("NER Tags:", tags)

Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER Tags: [3, 0, 7, 0, 0, 0, 7, 0, 0]
Tokens: ['Peter', 'Blackburn']
NER Tags: [1, 2]
Tokens: ['BRUSSELS', '1996-08-22']
NER Tags: [5, 0]
Tokens: ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
NER Tags: [0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokens: ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
NER Tags: [5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Data Preprocessing

In [182]:
from transformers import AutoTokenizer, TFAlbertModel

In [183]:
# Checkpoint name shared by the tokenizer here and the token-classification
# model created later; the sample output below shows its tokens are lower-cased.
model_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [184]:
# Tokenize the first training example (already split into words) and print, in
# order: the words, their tags, the raw encoding, the tokens, and the word
# index each token maps back to.
sample_input = train_dataset[0]
tokenized_input = tokenizer(sample_input["tokens"], is_split_into_words=True)
for shown in (
    sample_input["tokens"],
    sample_input["ner_tags"],
    tokenized_input,
    tokenized_input.tokens(),
    tokenized_input.word_ids(),
):
    print(shown)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]
{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [185]:
# Align word-level labels with the tokens produced by the tokenizer. Tokenization
# adds special tokens and may break a word into several sub-words, so the label
# list has to be expanded to one label per token.
def data_preprocessing(labels, word_ids):
    """Expand word-level NER labels to one label per token.

    Args:
        labels: label ids, one per original word.
        word_ids: for every token, the index of the word it came from, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``: ``-100`` where the word id is ``None``
        (ignored while training), the word's label for the first token of a
        word, and for each following token of the same word the word's label
        with an odd id raised by one (turning a B- tag into its I- tag).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a source word: mask it out.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if word_id == previous_word and tag % 2 == 1:
            # Continuation piece of the same word: B-XXX becomes I-XXX.
            tag += 1
        previous_word = word_id
        aligned.append(tag)

    return aligned

In [186]:
# Sanity check on the first training example: compared with the word-level
# labels, the aligned list gains a -100 at the first and last (special-token)
# positions, as the output below shows.
labels = sample_input["ner_tags"]
word_ids = tokenized_input.word_ids()
print(labels)
print(data_preprocessing(labels,word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]


In [187]:
# mapping entire dataset
def tokenize_and_align_labels(inputs):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        inputs: a batch with a "tokens" entry (one word list per example) and
            a "ner_tags" entry (one label list per example).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry
        holding each example's labels aligned by ``data_preprocessing``.
    """
    tokenized_inputs = tokenizer(inputs["tokens"], truncation=True, is_split_into_words=True)
    tokenized_inputs["labels"] = [
        data_preprocessing(word_labels, tokenized_inputs.word_ids(batch_idx))
        for batch_idx, word_labels in enumerate(inputs["ner_tags"])
    ]
    return tokenized_inputs

# Run the tokenization/alignment over every split in batches. The original
# columns are dropped, so only the tokenizer outputs and "labels" remain
# (see the column listing in the next cell).
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns= dataset["train"].column_names,
)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [188]:
tokenized_dataset.column_names

{'train': ['input_ids', 'attention_mask', 'labels'],
 'validation': ['input_ids', 'attention_mask', 'labels'],
 'test': ['input_ids', 'attention_mask', 'labels']}

In [189]:
sample = tokenized_dataset["train"][0]
sample

{'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

# Creating Compute Metrics



In [190]:
from sklearn.metrics import classification_report
import numpy as np

In [191]:
# NOTE(review): the message printed below states that `trust_remote_code=True`
# will become mandatory for loading this metric from the next major `datasets`
# release; address this before upgrading `datasets`.
metric = datasets.load_metric("seqeval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [192]:
# Class names of the NER tags in id order: label_list[i] is the name of label id i.
label_list = dataset["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [193]:
def compute_metrics(eval_preds):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: a pair ``(logits, labels)``. The predicted class is the
            argmax over axis 2 of ``logits``; positions whose label is -100
            are left out of the comparison.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's "overall_*" results.
    """
    logits, gold_ids = eval_preds
    predicted_ids = np.argmax(logits, axis=2)

    predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label, then map ids to names.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training

In [194]:
from transformers import AutoModelForTokenClassification

# Size the classification head and fill the config's label maps from
# `label_list` instead of hard-coding the class count, so the model (and any
# copy saved from it) always matches the dataset's tag set.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: i for i, label in enumerate(label_list)},
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [195]:
!pip install accelerate -U



In [196]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer

# Batches the tokenized examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    # Checkpoints go to a directory whose name differs from the hub id. Using
    # `model_checkpoint` itself creates a local "distilbert-base-uncased"
    # folder, which `from_pretrained(model_checkpoint)` would then pick up
    # instead of the hub model when the notebook is re-run from the top.
    output_dir=f"{model_checkpoint}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Train on the tokenized training split and score the validation split with
# `compute_metrics` after every epoch.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [197]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2334,0.066354,0.895143,0.918041,0.906447,0.980762
2,0.048,0.055318,0.916186,0.936385,0.926176,0.9844
3,0.029,0.056039,0.922835,0.9379,0.930306,0.984829


TrainOutput(global_step=2634, training_loss=0.08046661422573081, metrics={'train_runtime': 269.5618, 'train_samples_per_second': 156.265, 'train_steps_per_second': 9.771, 'total_flos': 511057370840310.0, 'train_loss': 0.08046661422573081, 'epoch': 3.0})

# Saving the model

In [198]:
# Write the fine-tuned model (including its config.json) and the tokenizer
# files to two separate directories.
model.save_pretrained("./model1_directory")
tokenizer.save_pretrained("./tokenizer1_directory")

('./tokenizer1_directory/tokenizer_config.json',
 './tokenizer1_directory/special_tokens_map.json',
 './tokenizer1_directory/vocab.txt',
 './tokenizer1_directory/added_tokens.json',
 './tokenizer1_directory/tokenizer.json')

# Updating Config file
(for model evaluation)

In [199]:
# Label maps in the form they take inside a model's config.json. JSON object
# keys are always strings, so id2label is keyed by the stringified id; label2id
# maps each label name to its *integer* id (the original stored the ids as
# strings, which does not match the label -> int mapping the config expects).
id2label = {
    str(i): label for i, label in enumerate(label_list)
}
label2id = {
    label: i for i, label in enumerate(label_list)
}

In [200]:
import json

config_path = "./model1_directory/config.json"

# Read, patch and rewrite the saved config. The `with` blocks close both file
# handles deterministically, so the rewritten file is fully flushed to disk
# before the model is reloaded from this directory.
with open(config_path) as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open(config_path, "w") as config_file:
    json.dump(config, config_file)

# Identification

In [201]:
from transformers import AutoModelForTokenClassification
# Reload the saved model from disk — presumably so that the id2label/label2id
# entries written into its config.json above take effect.
BERT_fined_tuned = AutoModelForTokenClassification.from_pretrained("./model1_directory/")

In [202]:
from transformers import pipeline
# NER pipeline over the reloaded model. It reuses the in-memory `tokenizer`
# rather than the copy saved in ./tokenizer1_directory. As the output below
# shows, entities are reported per sub-word token (e.g. 'zu', '##cker', '##berg').
NLP = pipeline("ner",model=BERT_fined_tuned,tokenizer=tokenizer)
text = '''Zuckerberg attended Harvard University, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes. Originally launched in only select college campuses, the site expanded rapidly and eventually beyond colleges, reaching one billion users in 2012. Zuckerberg took the company public in May 2012 with majority shares. In 2007, at age 23, he became the world's youngest self-made billionaire. He has since used his funds to organize multiple philanthropic endeavors, including the establishment of the Chan Zuckerberg Initiative.'''
result = NLP(text)
for i in result:
  print(i)

{'entity': 'B-PER', 'score': 0.992896, 'index': 1, 'word': 'zu', 'start': 0, 'end': 2}
{'entity': 'I-PER', 'score': 0.99786144, 'index': 2, 'word': '##cker', 'start': 2, 'end': 6}
{'entity': 'I-PER', 'score': 0.9977314, 'index': 3, 'word': '##berg', 'start': 6, 'end': 10}
{'entity': 'B-ORG', 'score': 0.78169346, 'index': 5, 'word': 'harvard', 'start': 20, 'end': 27}
{'entity': 'I-ORG', 'score': 0.86094534, 'index': 6, 'word': 'university', 'start': 28, 'end': 38}
{'entity': 'B-ORG', 'score': 0.7829534, 'index': 11, 'word': 'facebook', 'start': 58, 'end': 66}
{'entity': 'B-PER', 'score': 0.9977779, 'index': 19, 'word': 'eduardo', 'start': 103, 'end': 110}
{'entity': 'I-PER', 'score': 0.9982369, 'index': 20, 'word': 'save', 'start': 111, 'end': 115}
{'entity': 'I-PER', 'score': 0.9982331, 'index': 21, 'word': '##rin', 'start': 115, 'end': 118}
{'entity': 'B-PER', 'score': 0.99743086, 'index': 23, 'word': 'andrew', 'start': 120, 'end': 126}
{'entity': 'I-PER', 'score': 0.9978181, 'index':

# Testing

In [203]:
# Score the tokenized test split with the trainer; the cells below read the
# resulting metrics under "eval_"-prefixed keys.
test_results = trainer.evaluate(tokenized_dataset["test"])

# Calculating evaluation metrics

### Accuracy

In [204]:
print("Accuracy : ", test_results['eval_accuracy'])

Accuracy :  0.9759857744630895


### Precision

In [205]:
print("Precision : ", test_results['eval_precision'])

Precision :  0.869431117203564


### Recall

In [206]:
print("Recall : ", test_results['eval_recall'])

Recall :  0.898371104815864


### F1 score

In [207]:
print("F1 score : ", test_results['eval_f1'])

F1 score :  0.8836642284918147
