In [1]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from sklearn.model_selection import train_test_split
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')

# https://github.com/rohan-paul/MachineLearning-DeepLearning-Code-for-my-YouTube-Channel/blob/master/NLP/YT_Fine_tuning_BERT_NER_v1.ipynb

  from .autonotebook import tqdm as notebook_tqdm
2023-01-31 22:46:19.476016: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# This code block reads the data.txt file and outputs a list of lists with the tokens and
# a list of list of the semantic tags

full_sen = []
with open('train.en.txt') as fh:
    # Skip initial comments that starts with #
    while True:
        line = fh.readline()
        # break while statement if it is not a comment line
        # i.e. does not startwith #
        if not line.startswith('#'):
            full_sen.append(line) 
        if not line:
            break    

tokens = []
tags = []
train_tags = []
train_token =[]
train_data = []
train =[]
for i in range (0, len(full_sen)):
    string = full_sen[i].split("\t")
    if not len(full_sen[i]) == 0: 
        if string[0] == '\n':
            train_token.append(tokens) 
            tokens = []
            train_tags.append(tags)
            tags = []
            train_data.append(train)
            train = []
        else:
            tokens.append(string[0])
            tags.append(string[3])
            train.append((string[0],string[3]))
            
print("The training data set has",len(train_data), "sentences." )

The training data set has 7745 sentences.


In [3]:
full_sen = []
with open('test.en.txt') as fh:
    # Skip initial comments that starts with #
    while True:
        line = fh.readline()
        # break while statement if it is not a comment line
        # i.e. does not startwith #
        if not line.startswith('#'):
            full_sen.append(line) 
        if not line:
            break   
tokens = []
tags = []
test_token = []
test_tags =[]
test_data = []
test =[]
for i in range (0, len(full_sen)):
    string = full_sen[i].split("\t")
    if not len(full_sen[i]) == 0: 
        if string[0] == '\n':
            test_token.append(tokens) 
            tokens = []
            test_tags.append(tags)
            tags = []
            test_data.append(train)
            test = []
        else:
            tokens.append(string[0])
            tags.append(string[3])
            test.append((string[0],string[3]))

print("The testing data set has",len(test_data), "sentences.")

The testing data set has 1052 sentences.


In [4]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 


In [5]:
mydict = np.load('universal_dict.npy',allow_pickle='TRUE').item()

In [6]:
def transform_into_ints(data,mydict):
    for sentences in range(0,len(data)):
        sent = data[sentences]
        for i in range(0,len(sent)):
            word = sent[i]
            transformation = mydict[word]
            sent[i] = transformation
    return data

In [7]:
train_tags_transformed = train_tags
test_tags_transformed = test_tags

transformed_input_train = transform_into_ints(train_tags_transformed,mydict)
transformed_input_test = transform_into_ints(test_tags_transformed,mydict)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
         train_token, transformed_input_train, test_size = 0.2, random_state=18)

In [9]:
from collections import Counter

D = y_train

# Flatten the nested list
flattened_list = [item for sublist in D for item in sublist]

# Count the frequencies of each value
counter = Counter(flattened_list)

# Print the frequencies of each value
print(counter)

Counter({13: 6203, 1: 5221, 9: 3242, 21: 2996, 18: 2244, 23: 2081, 2: 1944, 5: 1916, 12: 1409, 6: 1368, 0: 1281, 4: 1128, 11: 1039, 34: 920, 7: 781, 20: 752, 3: 618, 19: 551, 26: 519, 31: 435, 24: 341, 14: 273, 39: 249, 16: 186, 45: 164, 44: 156, 29: 146, 15: 136, 49: 135, 38: 124, 43: 121, 37: 111, 22: 106, 25: 105, 53: 97, 8: 94, 41: 93, 17: 89, 10: 88, 42: 80, 28: 77, 33: 73, 50: 66, 35: 66, 51: 65, 47: 46, 27: 45, 52: 43, 40: 41, 46: 39, 30: 35, 48: 33, 36: 31, 32: 26, 61: 25, 58: 21, 55: 19, 62: 17, 54: 15, 57: 11, 59: 11, 60: 11, 67: 6, 63: 5, 64: 3, 65: 1, 66: 1, 56: 1})


In [10]:
def get_ids(tokens, tags):
    ids = []
    token = []
    ner_tags = []
    for i in range(0, len(tokens)):
        ids.append(i)
        token.append(tokens[i])
        ner_tags.append(tags[i])
    return ids, token, ner_tags

In [11]:
# To get it into the correct form: https://huggingface.co/docs/datasets/v1.1.1/loading_datasets.html

ids_train, tokens_train, ner_tags_train = get_ids(X_train, y_train)
ids_val, tokens_val, ner_tags_val = get_ids(X_val, y_val)

ids_test, tokens_test, ner_tags_test = get_ids(test_token, transformed_input_test)

train = {'input_ids': ids_train,
            'tokens': tokens_train,
            'ner_tags': ner_tags_train}

validation = {'input_ids': ids_val,
            'tokens': tokens_val,
            'ner_tags': ner_tags_val}

test = {'input_ids': ids_test,
            'tokens': tokens_test,
            'ner_tags': ner_tags_test}

train = Dataset.from_dict(train) 
validation = Dataset.from_dict(validation)
test = Dataset.from_dict(test)

data = {'train': train,
            'validation': validation,
            'test': test}

In [12]:
conll2003 = datasets.load_dataset("conll2003") 
conll2003

conll2003['train'] = data['train']
conll2003['validation'] = data['validation']
conll2003['test'] = data['test']

Found cached dataset conll2003 (/Users/julianbehrendt/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 163.04it/s]


In [13]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 6196
    })
    validation: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 1549
    })
    test: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 1052
    })
})

In [14]:
def tokenize_and_align_labels(examples, label_all_tokens=True): 

    
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 

    
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs 

In [15]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

100%|██████████| 7/7 [00:00<00:00, 14.05ba/s]
100%|██████████| 2/2 [00:00<00:00, 21.90ba/s]
100%|██████████| 2/2 [00:00<00:00, 32.54ba/s]


In [16]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels= 69)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [17]:
from transformers import TrainingArguments, Trainer 
args = TrainingArguments( 
    "test-ner",
    evaluation_strategy = "epoch", 
    learning_rate=2e-5, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16, 
    num_train_epochs=3, 
    weight_decay=0.01, 
    eval_steps = 100,  
    save_total_limit = 2
) 

In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [19]:
metric = datasets.load_metric("seqeval") 

In [20]:
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    print(pred_logits)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    predictions = []
    true_labels = []
    for i in range(len(pred_logits)):
        pred_seq = []
        true_seq = []
        for j in range(len(pred_logits[i])):
            if labels[i][j] != -100:
                pred_seq.append(pred_logits[i][j])
                true_seq.append(labels[i][j])
        predictions.append(pred_seq)
        true_labels.append(true_seq)
    
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

In [21]:
trainer = Trainer( 
    model, 
    args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

In [22]:
trainer.train() 

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6196
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1164
  Number of trainable parameters = 108944709
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.27195,0.934412,0.930821,0.932613,0.943089
2,0.835400,0.155077,0.961286,0.963551,0.962417,0.969448
3,0.174300,0.137509,0.967034,0.967394,0.967214,0.972786


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1549
  Batch size = 16


[[13  9 20 ...  0  0  0]
 [13 23  1 ...  0  0  0]
 [13  2  9 ...  0  0  0]
 ...
 [13  5 20 ...  0  0  0]
 [13 34  2 ...  0  0  0]
 [13  9 18 ...  0  0  0]]


Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1549
  Batch size = 16


[[13  9 20 ...  0  0  0]
 [13 23  1 ...  0  0  0]
 [13  2  9 ...  0  0  0]
 ...
 [13  5 20 ...  0  0  0]
 [13 34  2 ...  0  0  0]
 [13  9 18 ...  0  0  0]]


Saving model checkpoint to test-ner/checkpoint-1000
Configuration saved in test-ner/checkpoint-1000/config.json
Model weights saved in test-ner/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [test-ner/checkpoint-1500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1549
  Batch size = 16


[[13  9 20 ...  0  0  0]
 [13 23  1 ...  0  0  0]
 [13  2  9 ...  0  0  0]
 ...
 [13  5 20 ...  0  0  0]
 [13 34  2 ...  0  0  0]
 [13  9 18 ...  0  0  0]]




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1164, training_loss=0.45140825514121563, metrics={'train_runtime': 1328.8351, 'train_samples_per_second': 13.988, 'train_steps_per_second': 0.876, 'total_flos': 149864800716216.0, 'train_loss': 0.45140825514121563, 'epoch': 3.0})

In [23]:
model.save_pretrained("bert_english")

Configuration saved in bert_english/config.json
Model weights saved in bert_english/pytorch_model.bin


In [24]:
model_fine_tuned_english = AutoModelForTokenClassification.from_pretrained("bert_english")

loading configuration file bert_english/config.json
Model config BertConfig {
  "_name_or_path": "bert_english",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LAB

In [25]:
from transformers import pipeline
import re
len_test = 1052

In [26]:
nlp = pipeline("ner", model= model_fine_tuned_english, tokenizer=tokenizer)

all_true_labels = []
all_prediction_labels = []
for i in range(0,len_test):
    test_data = conll2003['test'][i] 
    
    true_labels = test_data['ner_tags']
    all_true_labels.append(true_labels)
    
    tokens = test_data['tokens']
    ner_predictions = nlp(tokens)
    
    prediction_labels = []
    for i in range(0, len(ner_predictions)):
        x = ner_predictions[i]
        s = x[0]
        string = s['entity']
        label = int(re.search(r'\d+', string).group())
        prediction_labels.append(label)
    
    
    all_prediction_labels.append(prediction_labels)

Disabling tokenizer parallelism, we're using DataLoader multithreading already


In [27]:
all_prediction_labels

[[0, 1, 6, 3, 21, 0, 1],
 [0, 1, 6, 3, 0, 1],
 [0, 1, 6, 3, 1],
 [0, 1, 6, 3, 0, 1],
 [0, 1, 6, 3, 23, 1],
 [0, 1, 6, 3, 17],
 [23, 1, 21, 16, 1, 6, 12, 3],
 [23, 1, 6, 3, 21, 23, 11],
 [23, 1, 6, 26, 3, 0, 1],
 [23, 1, 6, 26, 3, 21, 0, 1, 1],
 [19, 1, 6, 3, 21, 0, 1, 21, 23, 1],
 [19, 1, 6, 1, 39, 7, 6, 3],
 [5, 18, 9, 13],
 [23, 1, 6, 26, 12, 21, 9, 13],
 [5, 18, 1, 21, 5, 21, 22, 13],
 [5, 6, 21, 50, 13],
 [5, 18, 15, 12, 13],
 [23, 1, 6, 3, 13],
 [6, 9, 6, 0, 1, 34],
 [9, 18, 23, 1, 21, 19, 24, 13],
 [4, 25, 7, 6, 19, 24, 25, 13, 9, 13],
 [6, 9, 11, 1, 34],
 [9, 18, 23, 1, 0, 1, 13],
 [9, 12, 46, 21, 1, 13],
 [9, 1, 39, 1, 13],
 [9, 18, 11, 21, 22, 47, 22, 21, 31, 13],
 [34, 9, 18, 23, 1, 13, 9, 18, 3, 23, 1, 13],
 [12, 13],
 [7, 5, 9, 1, 18, 0, 16, 1, 13],
 [9, 11, 1, 13],
 [34, 12, 21, 1, 6, 43, 34],
 [7, 11, 23, 1, 13],
 [9, 18, 21, 31, 13],
 [43, 1, 18, 26, 1, 13],
 [9, 9, 15, 12, 13],
 [6, 9, 11, 7, 5, 34],
 [23, 1, 1, 6, 19, 13],
 [9, 6, 21, 31, 13],
 [12, 38],
 [39, 5, 39, 5

In [28]:
all_true_labels

[[0, 1, 2, 3, 21, 0, 1],
 [0, 1, 2, 3, 0, 1],
 [0, 1, 2, 3, 1],
 [0, 1, 2, 3, 0, 1],
 [0, 1, 2, 3, 23, 1],
 [0, 1, 2, 3, 1],
 [23, 1, 21, 16, 1, 2, 12, 3],
 [23, 1, 2, 3, 21, 23, 1],
 [23, 1, 2, 26, 3, 0, 1],
 [23, 1, 2, 26, 3, 21, 0, 1, 1],
 [19, 1, 2, 3, 21, 0, 1, 21, 23, 1],
 [19, 1, 2, 3, 39, 1, 2, 3],
 [5, 18, 9, 13],
 [23, 1, 2, 26, 12, 21, 9, 13],
 [17, 18, 17, 21, 17, 21, 22, 13],
 [5, 6, 21, 50, 13],
 [5, 20, 15, 12, 13],
 [23, 1, 2, 3, 13],
 [2, 9, 11, 0, 1, 34],
 [9, 18, 23, 1, 21, 19, 24, 13],
 [4, 25, 7, 2, 19, 24, 25, 21, 9, 13],
 [2, 9, 11, 1, 34],
 [9, 18, 23, 1, 0, 1, 13],
 [9, 18, 46, 21, 1, 13],
 [9, 11, 39, 1, 13],
 [9, 20, 11, 21, 48, 47, 22, 21, 31, 13],
 [21, 9, 18, 23, 1, 13, 9, 20, 3, 23, 1, 13],
 [56, 13],
 [7, 5, 4, 1, 18, 0, 16, 1, 13],
 [9, 18, 59, 13],
 [34, 1, 21, 1, 6, 43, 34],
 [7, 18, 23, 1, 13],
 [9, 18, 21, 31, 13],
 [43, 1, 18, 26, 1, 13],
 [9, 2, 15, 12, 13],
 [2, 9, 11, 7, 5, 34],
 [23, 1, 1, 6, 19, 13],
 [9, 6, 21, 31, 13],
 [56, 38],
 [39, 5, 28

In [29]:
#results = metric.compute(predictions=all_prediction_labels, references=all_true_labels) 
# results

In [31]:
all_true_labels_as_array = []
for i in range(0, len(all_true_labels)):
    sentence_labels = all_true_labels[i]
    for j in range(0,len(sentence_labels)):
        all_true_labels_as_array.append(sentence_labels[j])

all_predicted_labels_as_array = []
for i in range(0, len(all_prediction_labels)):
    sentence_labels = all_prediction_labels[i]
    for j in range(0,len(sentence_labels)):
        all_predicted_labels_as_array.append(sentence_labels[j])
        
from sklearn.metrics import accuracy_score
accuracy_score(all_true_labels_as_array,all_predicted_labels_as_array)

0.8384683098591549

In [37]:
example_text = conll2003['train'][4]

tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

word_ids = tokenized_input.word_ids()

print(word_ids)

''' As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to None and all other tokens to their respective word. This way, we can align the labels with the processed input ids. '''

tokenized_input

[None, 0, 1, 2, 3, None]


{'input_ids': [101, 3419, 6476, 2984, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [38]:
example_text

{'input_ids': 4,
 'tokens': ['Tom', 'kicked', 'Mary', '.'],
 'ner_tags': [5, 18, 5, 13]}