In [30]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [32]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

print("Number of samples:", len(tagged_sentences))

Number of samples: 3914


In [33]:
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [34]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [35]:
sentences[0]

('Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.')

In [36]:
sentence_tags[0]

('NNP',
 'NNP',
 ',',
 'CD',
 'NNS',
 'JJ',
 ',',
 'MD',
 'VB',
 'DT',
 'NN',
 'IN',
 'DT',
 'JJ',
 'NN',
 'NNP',
 'CD',
 '.')

In [37]:
len(tokenizer)

119547

In [38]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [39]:
from sklearn.model_selection import train_test_split
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3
)

valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5
)

In [40]:
len(train_sentences), len(valid_sentences), len(test_sentences)

(2739, 587, 588)

In [41]:
import torch
from typing import List
import numpy as np

In [42]:
MAX_LEN = max([len(sentence) for sentence in sentences])

In [43]:
MAX_LEN

271

In [44]:
from collections import defaultdict

label2id = defaultdict(int, model.config.label2id)
id2label = {id: label for label, id in label2id.items()}

In [45]:
label2id

defaultdict(int,
            {'#': 7,
             '$': 6,
             "''": 5,
             ',': 2,
             '-LRB-': 17,
             '-RRB-': 32,
             '.': 4,
             ':': 3,
             'CC': 8,
             'CD': 9,
             'DT': 10,
             'EX': 11,
             'FW': 12,
             'IN': 13,
             'JJ': 14,
             'JJR': 15,
             'JJS': 16,
             'LS': 18,
             'MD': 19,
             'NN': 20,
             'NNP': 21,
             'NNPS': 22,
             'NNS': 23,
             'O': 0,
             'PDT': 24,
             'POS': 25,
             'PRP': 26,
             'PRP$': 27,
             'RB': 28,
             'RBR': 29,
             'RBS': 30,
             'RP': 31,
             'SYM': 33,
             'TO': 34,
             'UH': 35,
             'VB': 36,
             'VBD': 37,
             'VBG': 38,
             'VBN': 39,
             'VBP': 40,
             'VBZ': 41,
             'WDT': 42,
      

In [46]:
id2label

{7: '#',
 6: '$',
 5: "''",
 2: ',',
 17: '-LRB-',
 32: '-RRB-',
 4: '.',
 3: ':',
 8: 'CC',
 9: 'CD',
 10: 'DT',
 11: 'EX',
 12: 'FW',
 13: 'IN',
 14: 'JJ',
 15: 'JJR',
 16: 'JJS',
 18: 'LS',
 19: 'MD',
 20: 'NN',
 21: 'NNP',
 22: 'NNPS',
 23: 'NNS',
 0: 'O',
 24: 'PDT',
 25: 'POS',
 26: 'PRP',
 27: 'PRP$',
 28: 'RB',
 29: 'RBR',
 30: 'RBS',
 31: 'RP',
 33: 'SYM',
 34: 'TO',
 35: 'UH',
 36: 'VB',
 37: 'VBD',
 38: 'VBG',
 39: 'VBN',
 40: 'VBP',
 41: 'VBZ',
 42: 'WDT',
 43: 'WP',
 44: 'WP$',
 45: 'WRB',
 1: '``'}

In [47]:
from torch.utils.data import Dataset

class PosTagging_Dataset(Dataset):
    def __init__(self,
                 sentences: List[List[str]],
                 tags: List[List[str]],
                 tokenizer,
                 label2id,
                 max_len=MAX_LEN
                 ):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        input_token = self.sentences[idx]
        label_token = self.tags[idx]

        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)
        labels = [self.label2id[token] for token in label_token]

        return {
            "input_ids": self.pad_and_truncate(input_token, pad_id=self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(labels, pad_id=self.label2id["0"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]
        return torch.as_tensor(padded_inputs)

In [48]:
train_dataset = PosTagging_Dataset(train_sentences, train_tags, tokenizer, label2id)
val_dataset = PosTagging_Dataset(valid_sentences, valid_tags, tokenizer, label2id)
test_dataset = PosTagging_Dataset(test_sentences, test_tags, tokenizer, label2id)

In [49]:
next(iter(train_dataset))

{'input_ids': tensor([10357, 53306, 10105, 17936, 10114, 15767,   119,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [50]:
!pip install evaluate



In [51]:
import evaluate
accuracy = evaluate.load("accuracy")

ignore_label = len(label2id)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

In [52]:
from transformers import TrainingArguments,Trainer

training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [53]:
import os
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.036446,0.990011
2,No log,0.030142,0.991457
3,0.140800,0.026635,0.992375
4,0.140800,0.024801,0.99279
5,0.140800,0.023979,0.993098
6,0.025100,0.023291,0.993192
7,0.025100,0.022991,0.993305
8,0.025100,0.022854,0.993337
9,0.019900,0.022744,0.993443
10,0.019900,0.022718,0.993425


TrainOutput(global_step=1720, training_loss=0.056396062984022985, metrics={'train_runtime': 1765.6443, 'train_samples_per_second': 15.513, 'train_steps_per_second': 0.974, 'total_flos': 3789641345256360.0, 'train_loss': 0.056396062984022985, 'epoch': 10.0})