In [None]:
! pip install transformers



In [None]:
# This NER fine-tuning notebook is one of the Hugging Face examples, converted to a Colab notebook.

# Ref : https://huggingface.co/transformers/custom_datasets.html

In [None]:
!wget http://noisy-text.github.io/2017/files/wnut17train.conll # Lets get the data

--2020-11-16 03:47:18--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll.1’


2020-11-16 03:47:18 (17.3 MB/s) - ‘wnut17train.conll.1’ saved [493781/493781]



In this case, we’ll just download the train set, which is a single text file. Each line of the file contains either (1) a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let’s write a function to read this in. We’ll take in the file path and return token_docs which is a list of lists of token strings, and token_tags which is a list of lists of tag strings.

In [None]:
from pathlib import Path
import re

def read_wnut(file_path):
    """Read a W-NUT style CoNLL file into token and tag sequences.

    Every non-blank line holds a token and its tag separated by a tab.
    Blank lines (optionally containing a single tab) separate documents.

    Args:
        file_path: location of the ``.conll`` file (``str`` or ``Path``).

    Returns:
        ``(token_docs, tag_docs)``: two parallel lists with one inner list
        per document, holding the token strings and the tag strings.
        An empty file yields ``([], [])``.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    file_path = Path(file_path)

    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere and the data may contain non-ASCII text.
    raw_text = file_path.read_text(encoding='utf-8').strip()

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # Runs of several blank lines leave empty fragments behind the
            # separator regex; they carry no token, so skip them.
            if not line.strip():
                continue
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        # An empty file (or a stray separator) produces a document without tokens.
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

# Parse the downloaded training file into parallel lists of token and tag sequences.
texts, tags = read_wnut('wnut17train.conll')

In [None]:
print(texts[0][10:17], tags[0][10:17], sep='\n')

['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']


In [None]:
from sklearn.model_selection import train_test_split
# Split into train and validation (80/20); the fixed random_state makes the split reproducible across runs.
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, random_state=42)

In [None]:
# The tags read above need to be converted into integer ids.
# Sort them first: the iteration order of a set of strings is not stable between
# interpreter sessions, so an unsorted mapping could assign different ids after a
# kernel restart and no longer match a model saved with the earlier mapping.

unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()} # just the inverse mapping

To encode the tokens, we’ll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we’re dealing with ready-split tokens rather than full sentence strings by passing is_split_into_words=True. We’ll also pass padding=True and truncation=True to pad the sequences to be the same length. Lastly, we can tell the model to return information about the tokens which are split by the wordpiece tokenization process, which we will need in a moment.

In [None]:
id2tag

{0: 'I-product',
 1: 'I-location',
 2: 'B-corporation',
 3: 'B-group',
 4: 'I-creative-work',
 5: 'B-person',
 6: 'B-creative-work',
 7: 'I-corporation',
 8: 'I-group',
 9: 'I-person',
 10: 'O',
 11: 'B-product',
 12: 'B-location'}

In [None]:
from transformers import DistilBertTokenizerFast
# Tokenizer matching the 'distilbert-base-cased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# The inputs are already split into words; pad/truncate each split and keep
# the sub-token offsets, which are used to align the labels.
train_encodings = tokenizer(
    train_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    val_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)

Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens in the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.

One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set the labels of ['@', 'hugging', '##face'] to [3, -100, -100].

Let’s write a function to do this. This is where we will use the offset_mapping from the tokenizer as mentioned above. For each sub-token returned by the tokenizer, the offset mapping gives us a tuple indicating the sub-token’s start position and end position relative to the original token it was split from. That means that if the first position in the tuple is anything other than 0, we will set its corresponding label to -100. While we’re at it, we can also set labels to -100 if the second position of the offset mapping is 0, since this means it must be a special token like [PAD] or [CLS].

In [None]:
import numpy as np

def encode_tags(tags, encodings, label_map=None):
    """Align word-level tags with the sub-token positions of ``encodings``.

    Only positions whose offset pair starts at 0 and ends at a non-zero value
    receive a label id; every other position is set to -100.

    Args:
        tags: list of documents, each a list of tag strings (one per word).
        encodings: object exposing ``offset_mapping``, one sequence of
            ``(start, end)`` pairs per document.
        label_map: optional ``{tag: id}`` mapping. When omitted, the
            module-level ``tag2id`` is used.

    Returns:
        A list with one list of ints per document, as long as that
        document's offset mapping.

    Raises:
        KeyError: if a tag is missing from the mapping.
        ValueError: if the number of labelled positions in a document differs
            from its number of tags (e.g. the sequence was truncated or a word
            produced no sub-token). Such a mismatch is never broadcast
            silently.
    """
    mapping = tag2id if label_map is None else label_map
    labels = [[mapping[tag] for tag in doc] for doc in tags]

    encoded_labels = []
    for doc_index, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):
        # reshape keeps the array two-dimensional even for an empty document
        arr_offset = np.asarray(doc_offset, dtype=int).reshape(-1, 2)

        # start with every position ignored (-100)
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)

        # positions whose first offset is 0 and whose second offset is not 0
        labelled = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_slots = int(labelled.sum())
        if n_slots != len(doc_labels):
            raise ValueError(
                f"document {doc_index}: {len(doc_labels)} tags but {n_slots} labelled "
                "sub-token positions (was the sequence truncated, or did a word "
                "produce no sub-token?)"
            )

        doc_enc_labels[labelled] = np.asarray(doc_labels, dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Align the word-level tags of each split with that split's sub-token encodings.
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-position label ids.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a per-example sequence of label lists. Indexing returns a
    dict of tensors with one entry per encoding field plus ``'labels'``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # one example per label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# We don't want to pass the offset mappings to the model, so drop them.
# The None default keeps this cell re-runnable: without it a second run raises KeyError
# because the key has already been removed.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [None]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # report the training loss every 10 steps
)
# Load the pre-trained checkpoint with a token-classification head:
# the number of output labels equals the number of distinct tags.
# NOTE(review): no label names are passed to the model here, so predictions come back
# as 'LABEL_<id>' and have to be decoded with id2tag.
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Run the fine-tuning loop.
trainer.train()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

Step,Training Loss
10,2.345685
20,2.253268
30,2.082476
40,1.786913
50,1.417712
60,0.834326
70,0.445556
80,0.321534
90,0.303329
100,0.223299


TrainOutput(global_step=510, training_loss=0.3489577648686428)

In [None]:
trainer.evaluate()

{'epoch': 3.0, 'eval_loss': 0.1142309382557869}

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Save the fine-tuned model to Google Drive. Run this cell only if you want to keep the weights; Drive must be mounted first.

In [None]:

# Folder on the mounted Google Drive that receives the model files.
model_dir = '/content/gdrive/My Drive/Colab Notebooks/'
# NOTE(review): the folder name says "uncased" although the checkpoint used in this
# notebook is 'distilbert-base-cased' — confirm the name is intended.
path = model_dir + 'wnut_ner_pt_distilbert_uncased/model'
trainer.save_model(path)


Load the fine-tuned model from your saved weights on Google Drive, together with the same tokenizer that was used for training, then wrap both in a pipeline for easy use. Don't forget to mount Drive first.

In [None]:
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
# Same Drive location the model was saved to.
model_dir = '/content/gdrive/My Drive/Colab Notebooks/'
path = model_dir + 'wnut_ner_pt_distilbert_uncased/model'
# Restore the fine-tuned weights from Drive; the tokenizer is loaded by its original checkpoint name.
model = DistilBertForTokenClassification.from_pretrained(path)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
# Bundle model and tokenizer into an 'ner' pipeline.
nlp = pipeline('ner',model= model,tokenizer=tokenizer)


In [None]:
id2tag

{0: 'I-product',
 1: 'I-location',
 2: 'B-corporation',
 3: 'B-group',
 4: 'I-creative-work',
 5: 'B-person',
 6: 'B-creative-work',
 7: 'I-corporation',
 8: 'I-group',
 9: 'I-person',
 10: 'O',
 11: 'B-product',
 12: 'B-location'}

In [None]:
# Sample sentence; the pipeline returns one dict per sub-token with 'word', 'score', 'entity' and 'index'.
text ="There's a lot of people showing off their iPhones on facebook today , so &lt; so is at such a place , it 's really not that interesting ;o )"
print(nlp(text))

[{'word': 'There', 'score': 0.9995560050010681, 'entity': 'LABEL_10', 'index': 1}, {'word': "'", 'score': 0.9996503591537476, 'entity': 'LABEL_10', 'index': 2}, {'word': 's', 'score': 0.9996324181556702, 'entity': 'LABEL_10', 'index': 3}, {'word': 'a', 'score': 0.99964839220047, 'entity': 'LABEL_10', 'index': 4}, {'word': 'lot', 'score': 0.999610960483551, 'entity': 'LABEL_10', 'index': 5}, {'word': 'of', 'score': 0.9996151328086853, 'entity': 'LABEL_10', 'index': 6}, {'word': 'people', 'score': 0.9996602535247803, 'entity': 'LABEL_10', 'index': 7}, {'word': 'showing', 'score': 0.9995064735412598, 'entity': 'LABEL_10', 'index': 8}, {'word': 'off', 'score': 0.9992566704750061, 'entity': 'LABEL_10', 'index': 9}, {'word': 'their', 'score': 0.9982964396476746, 'entity': 'LABEL_10', 'index': 10}, {'word': 'iPhone', 'score': 0.8602146506309509, 'entity': 'LABEL_11', 'index': 11}, {'word': '##s', 'score': 0.8072671890258789, 'entity': 'LABEL_10', 'index': 12}, {'word': 'on', 'score': 0.997908

In [None]:
# Translate the pipeline's generic 'LABEL_<id>' names back into tag strings via id2tag.
for prediction in nlp(text):
    label_id = int(prediction['entity'].split("_")[1])
    print(prediction['word'], id2tag[label_id])

There O
' O
s O
a O
lot O
of O
people O
showing O
off O
their O
iPhone B-product
##s O
on O
face B-corporation
##book O
today O
, O
so O
& O
l O
##t O
; O
so O
is O
at O
such O
a O
place O
, O
it O
' O
s O
really O
not O
that O
interesting O
; O
o O
) O
