<a href="https://colab.research.google.com/github/Shoaib7897/FIRST/blob/master/NER_ELMO_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tokenizers seqeval -q
!pip install tensorflow


In [None]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
conll2003 = datasets.load_dataset("conll2003") 

In [None]:
conll2003

In [7]:
conll2003.shape


{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [8]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [9]:
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [10]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
example_text = conll2003['train'][0]

tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

word_ids = tokenized_input.word_ids()

print(word_ids)

''' As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to None and all other tokens to their respective word. This way, we can align the labels with the processed input ids. '''

tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [13]:
def tokenize_and_align_labels(examples, label_all_tokens=True): 
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs 

In [14]:
q = tokenize_and_align_labels(conll2003['train'][4:5]) 
print(q) 

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [15]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [16]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)# as there are nine labels

In [18]:
from transformers import TrainingArguments, Trainer 
args = TrainingArguments( 
"test-ner",
evaluation_strategy = "epoch", 
learning_rate=2e-5, 
per_device_train_batch_size=16, 
per_device_eval_batch_size=16, 
num_train_epochs=3, 
weight_decay=0.01, 
) 

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = datasets.load_metric("seqeval") 

In [None]:
example = conll2003['train'][0]
label_list = conll2003["train"].features["ner_tags"].feature.names 

label_list

In [None]:
labels = [label_list[i] for i in example["ner_tags"]] 

metric.compute(predictions=[labels], references=[labels]) 

In [23]:
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    # We remove all the values where the label is -100
    predictions = [ 
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    
    true_labels = [ 
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

In [24]:
trainer = Trainer( 
    model, 
    args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

In [None]:
trainer.train() 

In [54]:
x=model.save_pretrained("ner_model")

Configuration saved in ner_model/config.json
Model weights saved in ner_model/pytorch_model.bin


In [57]:
model.save('my_model.h5')

AttributeError: ignored

In [None]:
tokenizer.save_pretrained("tokenizer")

In [None]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}
import json
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [29]:
from transformers import pipeline
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
example = "Kevin C. Chang is a Professor in Computer Science, University of Illinois at Urbana-Champaign. He received a BS from National Taiwan University and PhD from Stanford University, in Electrical Engineering. His research addresses large scale information access, for search, mining, and integration across structured and unstructured big data, with current focuses on Web search/mining and social media analytics. He received two Best Paper Selections in VLDB 2000 and 2013, an NSF CAREER Award in 2002, an NCSA Faculty Fellow Award in 2003, IBM Faculty Awards in 2004 and 2005, Academy for Entrepreneurial Leadership Faculty Fellow Award in 2008, and the Incomplete List of Excellent Teachers at University of Illinois in 2001, 2004, 2005, 2006, 2010, and 2011. He is passionate to bring research results to the real world and, with his students, co-founded Cazoodle, a startup from the University of Illinois, for deepening vertical search over the web."
ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.9976694, 'index': 1, 'word': 'kevin', 'start': 0, 'end': 5}, {'entity': 'I-PER', 'score': 0.9844226, 'index': 2, 'word': 'c', 'start': 6, 'end': 7}, {'entity': 'I-PER', 'score': 0.9908793, 'index': 3, 'word': '.', 'start': 7, 'end': 8}, {'entity': 'I-PER', 'score': 0.9965083, 'index': 4, 'word': 'chang', 'start': 9, 'end': 14}, {'entity': 'B-ORG', 'score': 0.97524136, 'index': 12, 'word': 'university', 'start': 51, 'end': 61}, {'entity': 'I-ORG', 'score': 0.95972896, 'index': 13, 'word': 'of', 'start': 62, 'end': 64}, {'entity': 'I-ORG', 'score': 0.9344367, 'index': 14, 'word': 'illinois', 'start': 65, 'end': 73}, {'entity': 'I-ORG', 'score': 0.91412055, 'index': 15, 'word': 'at', 'start': 74, 'end': 76}, {'entity': 'I-ORG', 'score': 0.9347133, 'index': 16, 'word': 'urbana', 'start': 77, 'end': 83}, {'entity': 'I-ORG', 'score': 0.92561316, 'index': 17, 'word': '-', 'start': 83, 'end': 84}, {'entity': 'I-ORG', 'score': 0.90242046, 'index': 18, 'word': 'ch

In [43]:
i=0
entity=""
name=''
while(i<len(ner_results)):
    tmp=ner_results[i]['entity'][0]
    new_entity=ner_results[i]['entity'][2:]
    new_name=ner_results[i]['word']
    if (not(new_entity == entity)) or tmp=="B":
      print(name+"     "+entity)
      entity=new_entity
      
      name=new_name
    else:
      name=name+" "+new_name
    #print(ner_results[i]['entity']+"  "+ner_results[i]['word'])
    i=i+1

     
kevin c . chang     PER
university of illinois at urbana - champaign     ORG
national taiwan university     ORG
stanford university     ORG
v     MISC
##ld     MISC
##b     MISC
ns     ORG
##f     ORG
nc     ORG
##sa     ORG
ibm     ORG
academy for entrepreneur ##ial leadership     ORG
university of illinois     ORG
ca     ORG
##zoo     ORG
##dle     ORG


In [None]:
from tensorflow.keras.models import load_model
saved_model.save('/content/saved_model')