In [None]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from sklearn.model_selection import train_test_split
from datasets import Dataset

# https://github.com/rohan-paul/MachineLearning-DeepLearning-Code-for-my-YouTube-Channel/blob/master/NLP/YT_Fine_tuning_BERT_NER_v1.ipynb

In [2]:
# This code block reads the data.txt file and outputs a list of lists with the tokens and
# a list of list of the semantic tags

full_sen = []
with open('train.conll.txt') as fh:
    # Skip initial comments that starts with #
    while True:
        line = fh.readline()
        # break while statement if it is not a comment line
        # i.e. does not startwith #
        if not line.startswith('#'):
            full_sen.append(line) 
        if not line:
            break    

tokens = []
tags = []
train_tags = []
train_token =[]
train_data = []
train =[]
for i in range (0, len(full_sen)):
    string = full_sen[i].split("\t")
    if not len(full_sen[i]) == 0: 
        if string[0] == '\n':
            train_token.append(tokens) 
            tokens = []
            train_tags.append(tags)
            tags = []
            train_data.append(train)
            train = []
        else:
            tokens.append(string[0])
            tags.append(string[3])
            train.append((string[0],string[3]))
            
print("The training data set has",len(train_data), "sentences." )

The training data set has 7745 sentences.


In [3]:
full_sen = []
with open('test.txt') as fh:
    # Skip initial comments that starts with #
    while True:
        line = fh.readline()
        # break while statement if it is not a comment line
        # i.e. does not startwith #
        if not line.startswith('#'):
            full_sen.append(line) 
        if not line:
            break   
tokens = []
tags = []
test_token = []
test_tags =[]
test_data = []
test =[]
for i in range (0, len(full_sen)):
    string = full_sen[i].split("\t")
    if not len(full_sen[i]) == 0: 
        if string[0] == '\n':
            test_token.append(tokens) 
            tokens = []
            test_tags.append(tags)
            tags = []
            test_data.append(train)
            test = []
        else:
            tokens.append(string[0])
            tags.append(string[3])
            test.append((string[0],string[3]))

print("The testing data set has",len(test_data), "sentences." )

The testing data set has 1052 sentences.


In [4]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 

In [5]:
# we need to transform the labels into integer so we need a mapping to keep track of the corresponding values
j = 0
mydict={}
for sentences in range(0,len(train_tags)):
    sent = train_tags[sentences]
    for i in range(0,len(sent)):
        word = sent[i]
        if(word in mydict):
            continue
        else:
            mydict[word] = j
            j = j+1

In [6]:
def transform_into_ints(data,mydict):
    for sentences in range(0,len(data)):
        sent = data[sentences]
        for i in range(0,len(sent)):
            word = sent[i]
            transformation = mydict[word]
            sent[i] = transformation
    return data

In [7]:
train_tags_transformed = train_tags
test_tags_transformed = test_tags

transformed_input_train = transform_into_ints(train_tags_transformed,mydict)
transformed_input_test = transform_into_ints(test_tags_transformed,mydict)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
         train_token, transformed_input_train, test_size=0.02, random_state=1)

In [9]:
from collections import Counter

D = y_train

# Flatten the nested list
flattened_list = [item for sublist in D for item in sublist]

# Count the frequencies of each value
counter = Counter(flattened_list)

# Print the frequencies of each value
print(counter)

Counter({13: 7590, 1: 6402, 9: 4002, 21: 3629, 18: 2721, 23: 2468, 2: 2365, 5: 2339, 6: 1731, 12: 1693, 0: 1565, 4: 1410, 11: 1281, 34: 1146, 7: 944, 20: 915, 3: 752, 19: 675, 26: 669, 31: 533, 24: 415, 14: 346, 39: 331, 16: 223, 45: 204, 44: 189, 49: 173, 29: 172, 15: 163, 38: 158, 43: 149, 37: 135, 22: 126, 25: 126, 8: 116, 17: 113, 41: 111, 53: 111, 10: 110, 42: 92, 33: 90, 28: 90, 51: 89, 50: 76, 35: 72, 27: 58, 52: 57, 40: 54, 47: 51, 46: 49, 30: 43, 48: 38, 32: 36, 36: 35, 58: 32, 61: 27, 62: 22, 55: 22, 54: 21, 59: 13, 60: 12, 57: 12, 63: 8, 67: 7, 64: 3, 65: 1, 66: 1, 56: 1, 68: 1})


In [10]:
def get_ids(tokens, tags):
    ids = []
    token = []
    ner_tags = []
    for i in range(0, len(tokens)):
        ids.append(i)
        token.append(tokens[i])
        ner_tags.append(tags[i])
    return ids, token, ner_tags

In [11]:
# To get it into the correct form: https://huggingface.co/docs/datasets/v1.1.1/loading_datasets.html

ids_train, tokens_train, ner_tags_train = get_ids(X_train, y_train)
ids_val, tokens_val, ner_tags_val = get_ids(X_val, y_val)

ids_test, tokens_test, ner_tags_test = get_ids(test_token, transformed_input_test)

train = {'input_ids': ids_train,
            'tokens': tokens_train,
            'ner_tags': ner_tags_train}

validation = {'input_ids': ids_val,
            'tokens': tokens_val,
            'ner_tags': ner_tags_val}

test = {'input_ids': ids_test,
            'tokens': tokens_test,
            'ner_tags': ner_tags_test}

train = Dataset.from_dict(train) 
validation = Dataset.from_dict(validation)
test = Dataset.from_dict(test)

data = {'train': train,
            'validation': validation,
            'test': test}

In [12]:
conll2003 = datasets.load_dataset("conll2003") 
conll2003

conll2003['train'] = data['train']
conll2003['validation'] = data['validation']
conll2003['test'] = data['test']

Found cached dataset conll2003 (/Users/julianbehrendt/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 135.20it/s]


In [13]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 7590
    })
    validation: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 155
    })
    test: Dataset({
        features: ['input_ids', 'tokens', 'ner_tags'],
        num_rows: 1052
    })
})

In [14]:
def tokenize_and_align_labels(examples, label_all_tokens=True): 
    print(examples["tokens"])
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 
    print(tokenized_inputs)               
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs 

In [15]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

 25%|██▌       | 2/8 [00:00<00:00, 16.38ba/s]

[['I', 'was', "n't", 'scared', 'at~all', '.'], ['Tom', 'lowered', 'his', 'voice', '.'], ['Tom', 'wants', 'to', 'drink', 'a', 'cup', 'of', 'coffee', '.'], ['The', 'police', 'charged', 'Sachiyo', 'with', 'the', 'murder', '.'], ['Eleven', 'is', 'a', 'prime~number', '.'], ['A', 'railroad', 'was', 'constructed', 'in', 'this', 'town', '.'], ['I', 'cleared', 'the', 'roof', 'of', 'snow', '.'], ['I', "'ve", 'seen', 'loads', 'of', 'concerts', '.'], ['He', 'is', 'dishonest', '.'], ['I', "'m", 'homosexual', '.'], ['He', 'named', 'his', 'dog', 'Popeye', '.'], ['How', 'long', 'is', 'the', 'Okavango~River', '?'], ['Tom', 'dyed', 'his', 'hair', '.'], ['He', 'has', 'a', 'son', 'whose', 'name', 'is', 'John', '.'], ['Is', 'this', 'book', 'Takeo', "'s", '?'], ['You', 'are', 'very', 'cute', 'boys', '.'], ['Maybe', 'you', "'ll", 'succeed', '.'], ['Tom', 'looked', 'at', 'the', 'moon', '.'], ['I', 'am', 'undressing', '.'], ['He', 'acted', 'like', 'a', 'madman', '.'], ['That', "'s", 'convenient', ',', 'is', "n

 50%|█████     | 4/8 [00:00<00:00, 17.17ba/s]

[['No', 'one', "'s", 'seen', 'Tom', '.'], ['I', 'apologize', 'again', '.'], ['Ken', 'heard', 'Tom', '.'], ['Tom', 'complained', 'about', 'the', 'excessive', 'noise', '.'], ['This', 'squirrel', 'has', 'a', 'nut', 'allergy', '.'], ['He', 'had', 'his', 'socks', 'on', 'inside~out', '.'], ['I', 'kicked', 'the', 'dog', '.'], ['Warren~Harding', 'was', 'an', 'honest', 'man', '.'], ['Bill', 'was', 'killed', 'with', 'a', 'gun', '.'], ['I', 'am', 'playing', 'a', 'game', 'with', 'my', 'sister', '.'], ['Tom', "'s", 'useless', '.'], ['My', 'brother', 'is', 'not', 'as', 'tall', 'as', 'Jiro', '.'], ['Her', 'son', 'is', 'a', 'jet', 'pilot', '.'], ['He', 'shaved', 'his', 'mustache', '.'], ['"', 'Everybody', 'knows', 'that', '.', '"', 'said', 'Peter', '.', '"'], ['What', 'sort', 'of', 'coalition', 'did', 'Avilov', 'mean', '?'], ['The', 'dog', 'chased', 'the', 'cat', 'up', 'a', 'tree', '.'], ['She', 'is', 'bored', 'with', 'this', 'novel', '.'], ['Tom', 'works', 'for', 'an', 'NGO', 'in', 'Africa', '.'], ['

100%|██████████| 8/8 [00:00<00:00, 13.46ba/s]


{'input_ids': [[101, 2008, 1005, 1055, 4895, 5714, 6442, 4630, 102], [101, 1045, 1005, 2310, 2196, 6052, 2014, 999, 102], [101, 3419, 20934, 14536, 2098, 9928, 1012, 102], [101, 1045, 6187, 1050, 1005, 1056, 2424, 1996, 3780, 1012, 102], [101, 1045, 4149, 1037, 4950, 2005, 2382, 6363, 1012, 102], [101, 5342, 7677, 1066, 2053, 16918, 2001, 1037, 2307, 2158, 1012, 102], [101, 2026, 10007, 13403, 1012, 102], [101, 2984, 4149, 1037, 8275, 3434, 1066, 24728, 12474, 2239, 2192, 16078, 1999, 4291, 1066, 4290, 1012, 102], [101, 1045, 2323, 1050, 1005, 1056, 4392, 1012, 102], [101, 2703, 2003, 2062, 21813, 2084, 7871, 1012, 102], [101, 10555, 2439, 2014, 7877, 1012, 102], [101, 3419, 4782, 2010, 5542, 1012, 102], [101, 1996, 3899, 8881, 2012, 2033, 1012, 102], [101, 3256, 1066, 3873, 2003, 2019, 10990, 4368, 1012, 102], [101, 2019, 9200, 2158, 6573, 2006, 2026, 2341, 1012, 102], [101, 1045, 24501, 4765, 2115, 26881, 12629, 1012, 102], [101, 2002, 2442, 2022, 2012, 1066, 2560, 3486, 2086, 2214, 

100%|██████████| 1/1 [00:00<00:00, 71.60ba/s]


[['Can', 'you', 'see', 'anyone', 'else', '?'], ['Somebody', 'wants', 'a', 'haircut', '.'], ['I', 'was', 'driving', 'at', '120', 'kilometers', 'per', 'hour', 'when', 'the', 'police', 'stopped', 'me', '.'], ['I', 'want', 'to', 'buy', 'this', 'toy', 'doll', '.'], ['Is', 'Tom', 'still', 'happy', '?'], ['She', 'is', 'anything~but', 'a', 'singer', '.'], ['My', 'mom', 'married', 'my', 'dad', 'in', 'the', '90s', '.'], ['I', 'bought', 'an', 'Apple', 'mouse', '.'], ['Thirteen', 'people', 'were', 'arrested', '.'], ['The', 'Caribbean', 'abounds', 'with', 'islands', '.'], ['He', "'s", 'dark', 'and', 'handsome', '.'], ['I', 'am', 'married', 'to', 'a', 'Polish', 'woman', '.'], ['The', 'performance', 'began', 'at', '8:00', '.'], ['Tom', 'swallowed', 'a', 'chewing-gum', '.'], ['She', 'is', 'no', 'less', 'beautiful', 'than', 'her', 'sister', '.'], ['I', "'ll", 'call', 'again', '.'], ['Three', 'of', 'the', 'rooms', 'face', 'the', 'street', '.'], ['Tom', 'hid', 'behind', 'a', 'rock', '.'], ['How', 'many',

  0%|          | 0/2 [00:00<?, ?ba/s]

{'input_ids': [[101, 1037, 4111, 2003, 15400, 1999, 1037, 2492, 102], [101, 1037, 3899, 2003, 17033, 1037, 3336, 102], [101, 1037, 2158, 2003, 15116, 2075, 23126, 102], [101, 1037, 2158, 2003, 2652, 1037, 9368, 102], [101, 1037, 2450, 2003, 2652, 1996, 8928, 102], [101, 1037, 2450, 2003, 26514, 2000, 11263, 102], [101, 1996, 4111, 2007, 2502, 2159, 2003, 29536, 22648, 19426, 5983, 102], [101, 1996, 2158, 2003, 3868, 2005, 1996, 2599, 102], [101, 1996, 2273, 2024, 2025, 8218, 1037, 3137, 102], [101, 1996, 2450, 2003, 2025, 8130, 2039, 1037, 3336, 21652, 102], [101, 2048, 2336, 2024, 2652, 2007, 1037, 3608, 1999, 1996, 2380, 102], [101, 2048, 2111, 2024, 5926, 8758, 2075, 1998, 12405, 2024, 3666, 102], [101, 4116, 3631, 2009, 1012, 102], [101, 1996, 3902, 1066, 2644, 2003, 1050, 1005, 1056, 2521, 2013, 2149, 1012, 102], [101, 26680, 2853, 17617, 1066, 2161, 8613, 2000, 5423, 2669, 1999, 3069, 1012, 102], [101, 9267, 3268, 2006, 1041, 1012, 1066, 15538, 1066, 2395, 1012, 102], [101, 3960,

100%|██████████| 2/2 [00:00<00:00, 28.55ba/s]

[['My', 'yogurt', 'expires', 'in', '2014', '!'], ['My', 'luggage', 'is', 'in', 'the', 'boot', '.'], ['I', 'worked', 'from', 'six~PM', 'until', 'midnight', '.'], ['I', 'missed', 'the', "two~o'clock", 'plane', '.'], ['Tom', 'made', 'a', 'peanut~butter', 'and', 'jelly', 'sandwich', '.'], ['They', 'were', 'not', 'amused', '.'], ['The', 'music', 'lured', 'everyone', '.'], ['Is', 'money', 'important', 'to', 'you', '?'], ['We', "'re", 'dieting', '.'], ['He', 'was', 'christened', 'John', '.'], ['The', 'accident', 'took', 'place', 'at', 'that', 'corner', '.'], ['The', 'traffic~light', 'does', "n't", 'work', '.'], ["Fuckin'", 'earthquake', 'was', 'terrible', '!'], ['Tom', 'misunderstood', '.'], ['There', 'are', 'many', 'movie~theaters', 'in', 'this', 'city', '.'], ['This', 'laptop~computer', 'is', 'very', 'thin', '.'], ['You', 'spilled', 'some', 'ketchup', 'on', 'your', 'tie', '.'], ['Someone', 'entered', 'the', 'room', '.'], ['I', 'accompanied', 'her', 'on', 'the', 'piano', '.'], ['Tom', 'was',




In [16]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels= 69)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [17]:
from transformers import TrainingArguments, Trainer 
args = TrainingArguments( 
    "test-ner",
    evaluation_strategy = "epoch", 
    learning_rate=2e-5, 
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=32, 
    num_train_epochs=6, 
    weight_decay=0.01, 
    eval_steps = 100,  
    save_total_limit = 2
) 

In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [19]:
metric = datasets.load_metric("seqeval") 

  metric = datasets.load_metric("seqeval")


In [20]:
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    print(pred_logits)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    predictions = []
    true_labels = []
    for i in range(len(pred_logits)):
        pred_seq = []
        true_seq = []
        for j in range(len(pred_logits[i])):
            if labels[i][j] != -100:
                pred_seq.append(pred_logits[i][j])
                true_seq.append(labels[i][j])
        predictions.append(pred_seq)
        true_labels.append(true_seq)
    
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

In [21]:
trainer = Trainer( 
    model, 
    args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

In [None]:
trainer.train() 

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7590
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1428
  Number of trainable parameters = 108944709
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
import json

In [None]:
model.save_pretrained("bert_english")

In [None]:
model_fine_tuned_english = AutoModelForTokenClassification.from_pretrained("bert_english")

In [None]:
from transformers import pipeline
import re
len_test = 1052

In [None]:
nlp = pipeline("ner", model= model_fine_tuned_english, tokenizer=tokenizer)

all_true_labels = []
all_prediction_labels = []
for i in range(0,len_test):
    test_data = conll2003['test'][i] 
    
    true_labels = test_data['ner_tags']
    all_true_labels.append(true_labels)
    
    tokens = test_data['tokens']
    ner_predictions = nlp(tokens)
    
    prediction_labels = []
    for i in range(0, len(ner_predictions)):
        x = ner_predictions[i]
        s = x[0]
        string = s['entity']
        label = int(re.search(r'\d+', string).group())
        prediction_labels.append(label)
    
    
    all_prediction_labels.append(prediction_labels)

In [None]:
all_prediction_labels

In [None]:
all_true_labels

In [None]:
results = metric.compute(predictions=all_prediction_labels, references=all_true_labels) 


In [None]:
results