In [7]:
!pip install datasets



In [8]:
from datasets import load_dataset

raw_ds = load_dataset("conll2003")

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
raw_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [10]:
#Let's reduce the number of rows in these splits, I don't have a strong enough GPU to train from this much data
raw_ds['validation'] = raw_ds['validation'].select(range(2000))
raw_ds['test'] = raw_ds['test'].select(range(2000))

In [11]:
raw_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 2000
    })
})

In [12]:
raw_ds['train'].features["pos_tags"]

Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None)

In [13]:
label_names = raw_ds['train'].features["pos_tags"].feature.names
print(label_names)


['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']


In [19]:
words = raw_ds["train"][0]["tokens"]
labels = raw_ds["train"][0]["pos_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU  rejects German call to boycott British lamb . 
NNP VBZ     JJ     NN   TO VB      JJ      NN   . 


In [20]:
#we are gonna finetune bert on this data to make a pos tagger model

from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [21]:
raw_ds["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [22]:
tokenizer("Make me fall")

{'input_ids': [101, 7102, 1143, 2303, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [23]:
def align_labels_with_tokens(labels, word_ids):
  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id != current_word:
      current_word = word_id
      if word_id is None:
        label = -100
      else:
        label = labels[word_id]
      new_labels.append(label)

    elif word_id is None:
      new_labels.append(-100)

  return new_labels

In [24]:
inputs = tokenizer(raw_ds["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [25]:
labels = raw_ds["train"][0]["pos_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[22, 42, 16, 21, 35, 37, 16, 21, 7]
[-100, 22, 42, 16, 21, 35, 37, 16, 21, 7, -100]


In [26]:
def tokenize_and_align(examples):

  tokenized_inputs = tokenizer(
      examples['tokens'], truncation = True, is_split_into_words = True
  )

  all_labels = examples['pos_tags']
  new_labels = []
  for i, labels in enumerate(all_labels):
    word_ids = tokenized_inputs.word_ids(i)
    new_labels.append(align_labels_with_tokens(labels,word_ids))

  tokenized_inputs['labels'] = new_labels

  return tokenized_inputs

In [27]:
tokenized_ds = raw_ds.map(
    tokenize_and_align,
    batched = True,
    remove_columns = raw_ds['train'].column_names
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [28]:
tokenized_ds['train'][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 22, 42, 16, 21, 35, 37, 16, 21, 7, -100]}

In [29]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer= tokenizer, return_tensors = "tf")


In [30]:
train_ds = tokenized_ds['train'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

eval_ds = tokenized_ds["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [34]:
#for inference object to work(hopefuly, you never know with huggingface)

id2label = {id:label for id, label in enumerate(label_names)}
label2id = {label:id for id,label in id2label.items()}


In [35]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label,
    label2id = label2id
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
model.summary()

Model: "tf_bert_for_token_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 dropout_75 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  36143     
                                                                 
Total params: 107755823 (411.06 MB)
Trainable params: 107755823 (411.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
model.config.num_labels, len(label_names)

(47, 47)

In [38]:
#Now Finetuning the model
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
from transformers import create_optimizer
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 3
num_train_steps = len(train_ds)*num_epochs

optimizer, schedule = create_optimizer(
    init_lr = 2e-5,
    num_warmup_steps = 1000,
    num_train_steps = num_train_steps,
    weight_decay_rate = 0.01
)

model.compile(optimizer = optimizer)

In [40]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir = "bert-pos-tagging",tokenizer = tokenizer)

model.fit(
    train_ds,
    validation_data = eval_ds,
    callbacks = [callback],
    epochs = num_epochs
)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Sanyam52/bert-pos-tagging into local empty directory.


Download file tf_model.h5:   0%|          | 15.6k/411M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/411M [00:00<?, ?B/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x79d6204db6d0>

In [None]:
#Now using my model

In [41]:
from transformers import pipeline

pipe = pipeline("token-classification", model="Sanyam52/bert-pos-tagging")

print(pipe("My name is Sanyam and more huggingface projects are coming"))

tf_model.h5:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some layers from the model checkpoint at Sanyam52/bert-pos-tagging were not used when initializing TFBertForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at Sanyam52/bert-pos-tagging.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `dev

[{'entity': 'PRP$', 'score': 0.973, 'index': 1, 'word': 'My', 'start': 0, 'end': 2}, {'entity': 'NN', 'score': 0.995, 'index': 2, 'word': 'name', 'start': 3, 'end': 7}, {'entity': 'VBZ', 'score': 0.993, 'index': 3, 'word': 'is', 'start': 8, 'end': 10}, {'entity': 'NNP', 'score': 0.976, 'index': 4, 'word': 'San', 'start': 11, 'end': 14}, {'entity': 'CC', 'score': 0.99, 'index': 5, 'word': '##yam', 'start': 14, 'end': 17}, {'entity': 'RB', 'score': 0.582, 'index': 6, 'word': 'and', 'start': 18, 'end': 21}, {'entity': 'VBG', 'score': 0.8413, 'index': 7, 'word': 'more', 'start': 22, 'end': 26}, {'entity': 'NN', 'score': 0.8584, 'index': 8, 'word': 'hugging', 'start': 27, 'end': 34}, {'entity': 'VBP', 'score': 0.564, 'index': 9, 'word': '##face', 'start': 34, 'end': 38}, {'entity': 'VBG', 'score': 0.956, 'index': 10, 'word': 'projects', 'start': 39, 'end': 47}, {'entity': 'VBG', 'score': 0.9854, 'index': 11, 'word': 'are', 'start': 48, 'end': 51}, {'entity': 'VBG', 'score': 0.983, 'index': 

In [42]:
#Now metrics

In [43]:
!pip install seqeval




In [44]:
!pip install evaluate



In [45]:
import evaluate

metric = evaluate.load("seqeval")

In [46]:
labels = raw_ds["train"][0]["pos_tags"]
labels = [label_names[i] for i in labels]
labels

['NNP', 'VBZ', 'JJ', 'NN', 'TO', 'VB', 'JJ', 'NN', '.']

In [47]:
predictions = labels.copy()
predictions[3] = "."
metric.compute(predictions=[predictions], references=[labels])



{'B': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'BZ': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'J': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'N': {'precision': 1.0, 'recall': 0.5, 'f1': 0.6666666666666666, 'number': 2},
 'NP': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'O': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.875,
 'overall_f1': 0.9333333333333333,
 'overall_accuracy': 0.8888888888888888}

In [49]:
import numpy as np

all_predictions = []
all_labels = []
for batch in eval_ds:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{"'": {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 5},
 'B': {'precision': 0.7593406593406593,
  'recall': 0.5998263888888888,
  'f1': 0.6702230843840931,
  'number': 1152},
 'BD': {'precision': 0.8158123370981755,
  'recall': 0.746422893481717,
  'f1': 0.7795765877957659,
  'number': 1258},
 'BG': {'precision': 0.7151335311572701,
  'recall': 0.5906862745098039,
  'f1': 0.6469798657718121,
  'number': 408},
 'BN': {'precision': 0.7292161520190024,
  'recall': 0.5770676691729323,
  'f1': 0.6442812172088143,
  'number': 532},
 'BP': {'precision': 0.8290155440414507,
  'recall': 0.7339449541284404,
  'f1': 0.7785888077858882,
  'number': 218},
 'BR': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 29},
 'BS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 15},
 'BZ': {'precision': 0.8766666666666667,
  'recall': 0.7827380952380952,
  'f1': 0.8270440251572327,
  'number': 336},
 'C': {'precision': 0.7824267782426778,
  'recall': 0.6875,
  'f1': 0.7318982387475538,