### GPU access and Upload libraries:

In [2]:
#GPU:

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import numpy as np
import tensorflow as tf

seed = 1

np.random.seed(1)
tf.random.set_seed(1)

In [3]:
import torch 
torch.cuda.is_available()

True

In [4]:
torch.cuda.current_device()

0

In [9]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from transformers import pipeline, set_seed
import textwrap
from pprint import pprint
import pickle
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

### Download the Dataset:(well, as you know, we only need the test set)

In [6]:
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_train.pkl
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_test.pkl

--2023-04-18 12:45:22--  https://lazyprogrammer.me/course_files/nlp/ner_train.pkl
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.64.80.1, 2606:4700:130:436c:6f75:6466:6c61:7265
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.64.80.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4757208 (4.5M) [application/octet-stream]
Saving to: ‘ner_train.pkl’


2023-04-18 12:45:24 (3.45 MB/s) - ‘ner_train.pkl’ saved [4757208/4757208]

--2023-04-18 12:45:24--  https://lazyprogrammer.me/course_files/nlp/ner_test.pkl
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.64.80.1, 2606:4700:130:436c:6f75:6466:6c61:7265
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.64.80.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1201978 (1.1M) [application/octet-stream]
Saving to: ‘ner_test.pkl’


2023-04-18 12:45:26 (1.40 MB/s) - ‘ner_test.pkl’ saved [1201978/1201978]



In [10]:
with open('ner_train.pkl', 'rb') as f1:
    corpus_train = pickle.load(f1)
with open ('ner_test.pkl', 'rb') as f2:
    corpus_test= pickle.load(f2)

In [11]:
corpus_train

[[('EU', 'B-ORG'),
  ('rejects', 'O'),
  ('German', 'B-MISC'),
  ('call', 'O'),
  ('to', 'O'),
  ('boycott', 'O'),
  ('British', 'B-MISC'),
  ('lamb', 'O'),
  ('.', 'O')],
 [('The', 'O'),
  ('European', 'B-ORG'),
  ('Commission', 'I-ORG'),
  ('said', 'O'),
  ('on', 'O'),
  ('Thursday', 'O'),
  ('it', 'O'),
  ('disagreed', 'O'),
  ('with', 'O'),
  ('German', 'B-MISC'),
  ('advice', 'O'),
  ('to', 'O'),
  ('consumers', 'O'),
  ('to', 'O'),
  ('shun', 'O'),
  ('British', 'B-MISC'),
  ('lamb', 'O'),
  ('until', 'O'),
  ('scientists', 'O'),
  ('determine', 'O'),
  ('whether', 'O'),
  ('mad', 'O'),
  ('cow', 'O'),
  ('disease', 'O'),
  ('can', 'O'),
  ('be', 'O'),
  ('transmitted', 'O'),
  ('to', 'O'),
  ('sheep', 'O'),
  ('.', 'O')],
 [('Germany', 'B-LOC'),
  ("'s", 'O'),
  ('representative', 'O'),
  ('to', 'O'),
  ('the', 'O'),
  ('European', 'B-ORG'),
  ('Union', 'I-ORG'),
  ("'s", 'O'),
  ('veterinary', 'O'),
  ('committee', 'O'),
  ('Werner', 'B-PER'),
  ('Zwingmann', 'I-PER'),
  ('said

In [12]:
corpus_test

[[('CRICKET', 'O'),
  ('-', 'O'),
  ('LEICESTERSHIRE', 'B-ORG'),
  ('TAKE', 'O'),
  ('OVER', 'O'),
  ('AT', 'O'),
  ('TOP', 'O'),
  ('AFTER', 'O'),
  ('INNINGS', 'O'),
  ('VICTORY', 'O'),
  ('.', 'O')],
 [('West', 'B-MISC'),
  ('Indian', 'I-MISC'),
  ('all-rounder', 'O'),
  ('Phil', 'B-PER'),
  ('Simmons', 'I-PER'),
  ('took', 'O'),
  ('four', 'O'),
  ('for', 'O'),
  ('38', 'O'),
  ('on', 'O'),
  ('Friday', 'O'),
  ('as', 'O'),
  ('Leicestershire', 'B-ORG'),
  ('beat', 'O'),
  ('Somerset', 'B-ORG'),
  ('by', 'O'),
  ('an', 'O'),
  ('innings', 'O'),
  ('and', 'O'),
  ('39', 'O'),
  ('runs', 'O'),
  ('in', 'O'),
  ('two', 'O'),
  ('days', 'O'),
  ('to', 'O'),
  ('take', 'O'),
  ('over', 'O'),
  ('at', 'O'),
  ('the', 'O'),
  ('head', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('county', 'O'),
  ('championship', 'O'),
  ('.', 'O')],
 [('Their', 'O'),
  ('stay', 'O'),
  ('on', 'O'),
  ('top', 'O'),
  (',', 'O'),
  ('though', 'O'),
  (',', 'O'),
  ('may', 'O'),
  ('be', 'O'),
  ('short-lived', 

### Some preprocessing:

In [13]:
len(corpus_test)

2970

In [23]:
inputs = []
targets = []

for sentence in corpus_test:
    tokens = []
    tags = []
    for token, tag in sentence:
        tokens.append(token)
        tags.append(tag)
    inputs.append(tokens)
    targets.append(tags)

In [24]:
inputs

[['CRICKET',
  '-',
  'LEICESTERSHIRE',
  'TAKE',
  'OVER',
  'AT',
  'TOP',
  'AFTER',
  'INNINGS',
  'VICTORY',
  '.'],
 ['West',
  'Indian',
  'all-rounder',
  'Phil',
  'Simmons',
  'took',
  'four',
  'for',
  '38',
  'on',
  'Friday',
  'as',
  'Leicestershire',
  'beat',
  'Somerset',
  'by',
  'an',
  'innings',
  'and',
  '39',
  'runs',
  'in',
  'two',
  'days',
  'to',
  'take',
  'over',
  'at',
  'the',
  'head',
  'of',
  'the',
  'county',
  'championship',
  '.'],
 ['Their',
  'stay',
  'on',
  'top',
  ',',
  'though',
  ',',
  'may',
  'be',
  'short-lived',
  'as',
  'title',
  'rivals',
  'Essex',
  ',',
  'Derbyshire',
  'and',
  'Surrey',
  'all',
  'closed',
  'in',
  'on',
  'victory',
  'while',
  'Kent',
  'made',
  'up',
  'for',
  'lost',
  'time',
  'in',
  'their',
  'rain-affected',
  'match',
  'against',
  'Nottinghamshire',
  '.'],
 ['After',
  'bowling',
  'Somerset',
  'out',
  'for',
  '83',
  'on',
  'the',
  'opening',
  'morning',
  'at',
  'Gra

In [25]:
targets

[['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-MISC',
  'I-MISC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O'],
 ['O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  '

In [29]:
#since the transformer model wastrained using sentences (not chunks of tokens that we have in ourdataset), 
# then we have to joing the tokens in teh dataset. We don't want to use the typical 'join' function in python 
#becuase it doesn't nkow that , nedds space after and not before, and also - doesn't need any space.
#Therefore, we use the 'detokenizer' module in the 'nltk' library since it knwows the spaceing rules and such. 
#So, it does a better job at joining the tokens into sentences. 

from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

In [32]:
# detokenizing a sample input: inputs[9]
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [30]:
inputs[9]

['He',
 'was',
 'well',
 'backed',
 'by',
 'England',
 'hopeful',
 'Mark',
 'Butcher',
 'who',
 'made',
 '70',
 'as',
 'Surrey',
 'closed',
 'on',
 '429',
 'for',
 'seven',
 ',',
 'a',
 'lead',
 'of',
 '234',
 '.']

In [31]:
targets[9]

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

### NER model:

In [33]:
ner = pipeline('ner', aggregation_strategy='simple', device=0)
#bert-large-cased-finetuned-conll03-english: so, the model was finetned on this exact dataset we are using here Cornell 2003.

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [43]:
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [35]:
# detokenizing a sample input: inputs[9]
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [41]:
np.array(inputs[9])

array(['He', 'was', 'well', 'backed', 'by', 'England', 'hopeful', 'Mark',
       'Butcher', 'who', 'made', '70', 'as', 'Surrey', 'closed', 'on',
       '429', 'for', 'seven', ',', 'a', 'lead', 'of', '234', '.'],
      dtype='<U7')

### Predictions function:

In [63]:
def compute_predictions(tokens, input_, ner_result):
    # goal: map hugging face ner result to list of tags for later performance assessment
    # tokens is the original tokenized sentence
    # input_ is the detokenized string
    
    predicted_tags = []
    state = 'O'
    current_index = 0 #start from index 0 
    for token in tokens:
        index = input_.find(token) #should be in the beggining 
        current_index += index # where we are currently pointing to
        
        #checking to see if this token is already in the ner_result:
        tag = 'O'
        for entity in ner_result:
            if current_index>=entity['start'] and current_index<entity['end']:
                #then the token already belongs to an entity in the ner_result
                if state == 'O':
                    state = 'B'
                else: #or if state == 'B':
                    state = 'I'
                tag = f"{state}-{entity['entity_group']}" #f-string
                break
                
        if tag == 'O':
            state = 'O' # reset the state
        predicted_tags.append(tag)
        current_index+= len(token) # update current_index
        input_ = input_[index+len(token):]
    assert(len(predicted_tags) == len(tokens))
    return predicted_tags

In [64]:
tokens = inputs[9]
input_ = detokenizer.detokenize(inputs[9])
ner_result = ner(input_)
predicted_tags = compute_predictions(tokens, input_, ner_result)

In [65]:
predicted_tags

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [66]:
accuracy_score(targets[9], predicted_tags)

1.0

In [68]:
for targs, preds in zip(targets[9], predicted_tags):
    print(targs, preds)

O O
O O
O O
O O
O O
B-LOC B-LOC
O O
B-PER B-PER
I-PER I-PER
O O
O O
O O
O O
B-ORG B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O


### Now, test it on all the tokens in the test file:

In [70]:
detok_inputs = [] #detok_inputs VS inputs
for tokens in inputs:
    text = detokenizer.detokenize(tokens)
    detok_inputs.append(text)

In [72]:
ner_results = ner(detok_inputs) 

In [76]:
preds_lst = []
for tokens, input_, ner_result in zip(inputs, detok_inputs, ner_results):
    preds = compute_predictions(tokens, input_, ner_result)
    preds_lst.append(preds)

In [94]:
def flattening(lst_of_lst):
    flat_lst = [val for sublst in lst_of_lst for val in sublst]
    return flat_lst

flat_preds = flattening(preds_lst)
flat_targets = flattening(targets)

In [96]:
accuracy_score(flat_targets, flat_preds)

0.9916563354782848

In [98]:
f1_score(flat_targets, flat_preds, average='macro')

0.95403328229255