## **1. Find the corresponding positive values for NER, POS, Chunk tags**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2. Data Preprocessing for BERT Model (Apply Hugging Face Data)**

### (1) Hugging Face Dataset Conll2003 Exploration

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 4.2 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 497 kB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 39.4 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 38.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 52.8 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86

In [None]:
from datasets import load_dataset
# dataset = load_dataset('conll2003')

### (2) Convert Data to BERT Input Style

In [None]:
!pip install transformers seqeval[gpu]

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.0 MB/s 
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.2 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel fo

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
MAX_LEN = 128     
TRAIN_BATCH_SIZE = 4
TEST_BATCH_SIZE = 2
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
data = pd.read_csv("/content/drive/MyDrive/NLP/New Dataset/Bank/ner_datasetreference.csv", encoding='unicode_escape')
data.head()

data.count()

Sentence #      47959
Word          1048575
POS           1048575
Tag           1048575
dtype: int64

In [None]:
'''
step 2a: process NE tags and POS tags
'''
# NE 
"""There are 8 category tags, each with a "beginning" and "inside" variant, and the "outside" tag. It is not really clear what these tags mean - "geo" probably stands for geographical entity, "gpe" for geopolitical entity, and so on. They do not seem to correspond with what the publisher says on Kaggle. Some tags seem to be underrepresented. Let's print them by frequency (highest to lowest): """

# tags = {}
# for tag, count in zip(frequencies_NE.index, frequencies_NE):
#     if tag != "O":
#         if tag[2:5] not in tags.keys():
#             tags[tag[2:5]] = count
#         else:
#             tags[tag[2:5]] += count
#     continue

# print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

"""Let's remove "art", "eve" and "nat" named entities, as performance on them will probably be not comparable to the other named entities. """

entities_to_remove = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]

# "Sentence #" is only populated on the first word of each sentence. Fill it
# down BEFORE dropping rows: otherwise a sentence whose first word carries one
# of the removed tags loses its marker row, and the later forward fill glues
# its remaining words onto the previous sentence.
data = data.assign(**{"Sentence #": data["Sentence #"].ffill()})
data = data[~data.Tag.isin(entities_to_remove)]
data.count()


Sentence #      47920
Word          1047063
POS           1047063
Tag           1047063
dtype: int64

In [None]:
"""We create 2 dictionaries for NE: one that maps individual tags to indices, and one that maps indices to their individual tags. This is necessary in order to create the labels (as computers work with numbers = indices, rather than words = tags) - see further in this notebook."""

# Tags are numbered in order of first appearance in the data.
_unique_tags = data.Tag.unique()
labels_to_ids = {tag: position for position, tag in enumerate(_unique_tags)}
ids_to_labels = dict(enumerate(_unique_tags))
print(labels_to_ids)
print(ids_to_labels)


{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'I-per': 8, 'I-gpe': 9, 'I-tim': 10}
{0: 'O', 1: 'B-geo', 2: 'B-gpe', 3: 'B-per', 4: 'I-geo', 5: 'B-org', 6: 'I-org', 7: 'B-tim', 8: 'I-per', 9: 'I-gpe', 10: 'I-tim'}


In [None]:
# Count the distinct NE tags that remain in the data
print("Number of NE tags: {}".format(len(data.Tag.unique()))) # 17 in the raw file; 11 once the art/eve/nat tags are removed
# Per-tag frequencies, most common first
frequencies_NE = data.Tag.value_counts()
frequencies_NE
# Ner pairs every tag with its integer id (order of first appearance)
Ner_Tag = list(data.Tag.unique())
Ner_Number = [i for i in range(len(Ner_Tag))]
Ner = list(zip(Ner_Tag,Ner_Number))
print(Ner)


Number of NE tags: 11
[('O', 0), ('B-geo', 1), ('B-gpe', 2), ('B-per', 3), ('I-geo', 4), ('B-org', 5), ('I-org', 6), ('B-tim', 7), ('I-per', 8), ('I-gpe', 9), ('I-tim', 10)]


In [None]:
# "Sentence #" is only set on the first row of each sentence; propagate the
# last seen value downwards so every row carries its sentence id.
# DataFrame.ffill() is the supported spelling of the deprecated
# fillna(method='ffill').
data = data.ffill()
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [None]:
# Attach to every row the whole sentence it belongs to (words joined by spaces)...
data['sentence'] = data.groupby('Sentence #')['Word'].transform(' '.join)
# ...and the matching comma-separated tag sequence of that sentence.
data['word_labels'] = data.groupby('Sentence #')['Tag'].transform(','.join)
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Thousands,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Sentence: 1,of,IN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
2,Sentence: 1,demonstrators,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
3,Sentence: 1,have,VBP,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
4,Sentence: 1,marched,VBN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."


In [None]:
"""Let's only keep the "sentence" and "word_labels" columns, and drop duplicates:"""

data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

len(data)
"""Let's verify that a random sentence and its corresponding tags are correct:"""

print(data.iloc[41].sentence)
print(data.iloc[41].word_labels)


Bedfordshire police said Tuesday that Omar Khayam was arrested in Bedford for breaching the conditions of his parole .
B-gpe,O,O,B-tim,O,B-per,I-per,O,O,O,B-geo,O,O,O,O,O,O,O,O


In [None]:
# Shuffle once with a fixed seed, then cut into 85% train / 7.5% validation /
# 7.5% test. Positional slicing yields the same three partitions as
# np.split(frame, [a, b]) without passing a DataFrame through NumPy, which
# newer pandas releases warn about.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.85 * len(shuffled))
validate_end = int(.925 * len(shuffled))

train_df = shuffled.iloc[:train_end]
validate_df = shuffled.iloc[train_end:validate_end]
test_df = shuffled.iloc[validate_end:]


In [None]:
# Give each split a fresh 0..n-1 index so rows can be fetched by position.
train_df, validate_df, test_df = [
    split_frame.reset_index(drop=True)
    for split_frame in (train_df, validate_df, test_df)
]

# Splits keyed by the usage name expected by the dataset class.
data_combine_dict = dict(train=train_df, validation=validate_df, test=test_df)


In [None]:
class Preprocess_Data(Dataset):
    """Torch dataset that turns one split of sentences into token-classification inputs.

    Each item is a dict of tensors: every field returned by the tokenizer
    (including ``offset_mapping``) plus ``labels``. In ``labels`` only the first
    word-piece of each word carries that word's label id; every other position
    (special tokens, continuation pieces, padding) is -100.
    """

    def __init__(self, dataset, tokenizer, max_len, usage, label_map=None): #usage -> train, validation, test
        """Select one split and remember how to encode it.

        Args:
            dataset: mapping from split name to a table exposing 'sentence'
                (space-joined words) and 'word_labels' (comma-joined tags).
            tokenizer: fast tokenizer called with ``is_split_into_words=True``;
                its result must provide ``word_ids()``.
            max_len: length every example is padded / truncated to.
            usage: key of the split to serve ('train', 'validation', 'test').
            label_map: optional tag -> id mapping. When omitted, the
                module-level ``labels_to_ids`` is looked up at item access time.
        """
        self.len = len(dataset[usage])
        self.data = dataset[usage]
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map

    def __getitem__(self, index):
        """Encode the sentence at ``index`` and align its word labels to tokens."""
        # step 1: get the sentence and word labels
        words = self.data['sentence'][index].strip().split()
        word_labels = self.data['word_labels'][index].split(",")
        label_map = labels_to_ids if self.label_map is None else self.label_map

        # step 2: encode the pre-split words (pads / truncates to max_len)
        encoding = self.tokenizer(words,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: label only the first word-piece of each word.
        # Alignment goes through word_ids() rather than counting tokens whose
        # offset starts at 0: a word that produces no word-pieces would
        # otherwise shift the labels of every following word.
        label_ids = [label_map[label] for label in word_labels]
        encoded_labels = [-100] * len(encoding["offset_mapping"])
        previous_word = None
        for position, word_index in enumerate(encoding.word_ids()):
            starts_new_word = word_index is not None and word_index != previous_word
            # A word without a matching label is left at -100 instead of raising.
            if starts_new_word and word_index < len(label_ids):
                encoded_labels[position] = label_ids[word_index]
            previous_word = word_index

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels, dtype=torch.long)

        return item

    def __len__(self):
        """Number of examples in the selected split."""
        return self.len

In [None]:
# One dataset per split, built in train / validation / test order.
training_set, validation_set, testing_set = [
    Preprocess_Data(data_combine_dict, tokenizer, MAX_LEN, split_name)
    for split_name in ('train', 'validation', 'test')
]
print(len(training_set), len(validation_set), len(testing_set))

40435 3568 3568


In [None]:
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,  1996, 11953,  6259,  3003,  2040, 12011,  2287,  1011,  2214,
          6481,  1997,  3521,  1998, 13986,  2038,  2908,  2152,  1011,  6627,
          1010,  5241,  1996,  3784, 24732,  2326,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [None]:
#Verify the encoding result
first_example = training_set[0]
first_tokens = tokenizer.convert_ids_to_tokens(first_example["input_ids"])
# Print each word-piece next to the label id assigned to it (-100 = ignored).
for token, label in zip(first_tokens, first_example["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
the         0
tibetan     2
spiritual   0
leader      0
who         0
teaches     0
age         0
-           -100
old         -100
principles  0
of          0
peace       0
and         0
tolerance   0
has         0
gone        0
high        0
-           -100
tech        -100
,           0
joining     0
the         0
online      0
messaging   0
service     0
.           0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       

In [None]:
# Define the Dataloader
# Only the training data is reshuffled every epoch. Validation and test batches
# keep a fixed order so the running losses logged during evaluation are
# repeatable from one run to the next.
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
validation_loader = DataLoader(validation_set, batch_size=TRAIN_BATCH_SIZE, shuffle=False, num_workers=0)
testing_loader = DataLoader(testing_set, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
print(len(training_loader),len(validation_loader),len(testing_loader))

10109 892 1784


# **3. Define the Model**

Ref:https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb
Ref:https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification

### 1) Train the Model

In [None]:
# IPython's "!" runs the shell command and captures its output lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word "failed" in the tool's output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec  9 06:14:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [None]:
# Define the model by just BertForTokenClassification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(Ner_Tag))
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:

def _token_accuracy(logits, labels):
    """Return the accuracy over the positions whose label is not -100."""
    flat_labels = labels.view(-1)  # shape (batch_size * seq_len,)
    flat_predictions = torch.argmax(logits.view(-1, logits.size(-1)), dim=1)
    active = flat_labels != -100  # -100 marks positions excluded from scoring
    gold = torch.masked_select(flat_labels, active)
    predicted = torch.masked_select(flat_predictions, active)
    return accuracy_score(gold.cpu().numpy(), predicted.cpu().numpy())


def train(model,epoch):
    """Run one training epoch, then one pass over the validation set.

    Uses the module-level ``training_loader``, ``validation_loader``,
    ``optimizer``, ``device`` and ``MAX_GRAD_NORM``. Progress and the epoch
    averages (mean of per-batch loss and per-batch accuracy) are printed;
    nothing is returned.

    Args:
        model: token-classification model whose call returns (loss, logits, ...).
        epoch: index of the current epoch; not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        tr_logits = outputs[1]

        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy on a detached copy so scoring never
        # touches the autograd graph
        tr_accuracy += _token_accuracy(tr_logits.detach(), labels)

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping must sit between backward() and step(): clipping
        # before backward() only rescales the previous step's gradients, which
        # zero_grad() then discards
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

    # --------------------------------------------------------------------------------------------------------------------
    # After the completion of each training epoch
    # measure our performance on validation set.

    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0

    with torch.no_grad():
        for idx, batch in enumerate(validation_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            eval_logits = outputs[1]

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            eval_accuracy += _token_accuracy(eval_logits, labels)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

In [None]:
len(training_loader),len(validation_loader),len(testing_loader)

(10109, 892, 1784)

In [None]:
import os

# torch.save does not create missing folders, so make sure the checkpoint
# directory exists before the first (long) epoch finishes.
CHECKPOINT_DIR = '/content/drive/MyDrive/baseline_checkpoint'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(model,epoch)
    # one checkpoint per epoch: model_weights_<epoch number>.pth
    torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, 'model_weights_'+str(epoch+1)+'.pth'))
    


Training epoch: 1
Training loss per 100 training steps: 2.6810076236724854
Training loss per 100 training steps: 0.9293056806125263
Training loss per 100 training steps: 0.6659221629153437
Training loss per 100 training steps: 0.5444063720389261
Training loss per 100 training steps: 0.46878840041316655
Training loss per 100 training steps: 0.4214830209915628
Training loss per 100 training steps: 0.38263830820876427
Training loss per 100 training steps: 0.3549394405469532
Training loss per 100 training steps: 0.3312296115115434
Training loss per 100 training steps: 0.3148143668498815
Training loss per 100 training steps: 0.30120351100126497


KeyboardInterrupt: ignored

### 2) Evaluate the Model

In [None]:
def _load_epoch_checkpoint(epoch_number):
    """Build a fresh token classifier and load the weights saved after the given epoch."""
    checkpoint_path = '/content/drive/MyDrive/baseline_checkpoint/model_weights_' + str(epoch_number) + '.pth'
    restored = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(Ner_Tag))
    # map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
    restored.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return restored.to(device)


model_1 = _load_epoch_checkpoint(1)
model_2 = _load_epoch_checkpoint(2)
model_3 = _load_epoch_checkpoint(3)
model_3

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
def _ensemble_vote(member_preds, confidence):
    """Combine three members' per-token predictions into one prediction per token.

    Args:
        member_preds: tensor (3, n_tokens) of predicted label ids.
        confidence: tensor (3, n_tokens) holding each member's largest logit.

    Returns:
        Tensor (n_tokens,): the label at least two members agree on, otherwise
        the label of the member with the largest logit.
    """
    first, second, third = member_preds
    first_has_support = (first == second) | (first == third)
    second_third_agree = second == third
    # No majority: follow the most confident member. Flipping before argmax
    # makes exact ties resolve toward the later member.
    most_confident = (member_preds.size(0) - 1) - torch.argmax(confidence.flip(0), dim=0)
    fallback = member_preds.gather(0, most_confident.unsqueeze(0)).squeeze(0)
    return torch.where(first_has_support, first,
                       torch.where(second_third_agree, second, fallback))


def _match_rate(predicted, gold):
    """Fraction of positions where predicted equals gold (0.0 for an empty batch)."""
    if gold.numel() == 0:
        return 0.0
    return (predicted == gold).double().mean().item()


def valid(model_1,model_2,model_3, testing_loader):
    """Evaluate three checkpoints and their voting ensemble on a data loader.

    Only positions whose label is not -100 are scored. The number of classes is
    taken from the logits and the device from ``model_1``'s parameters, so the
    function does not depend on any module-level model or device.

    Args:
        model_1, model_2, model_3: models whose call returns (loss, logits, ...).
        testing_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        (eval_labels, eval_preds): gold label ids as a list of 0-d tensors and
        ensemble predictions as a list of ints, both over all scored tokens.
        The printed "Validation Loss" is the mean of the three members' losses.
    """
    members = (model_1, model_2, model_3)
    # put models in evaluation mode
    for member in members:
        member.eval()
    run_device = next(model_1.parameters()).device

    eval_loss, eval_accuracy = 0, 0
    member_loss = [0, 0, 0]
    member_accuracy = [0, 0, 0]
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(run_device, dtype = torch.long)
            mask = batch['attention_mask'].to(run_device, dtype = torch.long)
            labels = batch['labels'].to(run_device, dtype = torch.long)

            step_logits = []
            step_loss = 0
            for slot, member in enumerate(members):
                outputs = member(input_ids=ids, attention_mask=mask, labels=labels)
                loss_value = outputs[0].item()
                member_loss[slot] += loss_value
                step_loss += loss_value
                step_logits.append(outputs[1])
            eval_loss += step_loss / len(members)

            nb_eval_steps += 1

            if idx % 100==0:
                for slot in range(len(members)):
                    loss_step = member_loss[slot]/nb_eval_steps
                    print(f"Validation loss per 100 evaluation steps for bert {slot + 1}: {loss_step}")

            # keep only the scored positions, for the labels AND the logits, so
            # predictions and confidences stay aligned token by token
            active = labels.view(-1) != -100
            gold = labels.view(-1)[active]
            active_logits = torch.stack(
                [logits.view(-1, logits.size(-1))[active] for logits in step_logits]
            )  # shape (3, n_active, num_labels)
            member_preds = torch.argmax(active_logits, dim=2)  # shape (3, n_active)
            confidence = active_logits.max(dim=2).values  # shape (3, n_active)

            #ensemble_Fusion of three Bert results
            predictions = _ensemble_vote(member_preds, confidence)

            eval_labels.extend(gold.cpu())
            eval_preds.extend(predictions.tolist())

            eval_accuracy += _match_rate(predictions, gold)
            for slot in range(len(members)):
                member_accuracy[slot] += _match_rate(member_preds[slot], gold)

    # an empty loader reports zeros instead of dividing by zero
    steps = max(nb_eval_steps, 1)
    eval_loss = eval_loss / steps
    eval_accuracy = eval_accuracy / steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    eval_accuracy_1 = member_accuracy[0] / steps
    eval_accuracy_2 = member_accuracy[1] / steps
    eval_accuracy_3 = member_accuracy[2] / steps

    print(f"Validation Accuracy of epoch 1: {eval_accuracy_1}")
    print(f"Validation Accuracy of epoch 2: {eval_accuracy_2}")
    print(f"Validation Accuracy of epoch 3: {eval_accuracy_3}")

    return eval_labels, eval_preds


In [None]:
labels, predictions = valid(model_1,model_2,model_3,testing_loader)

Validation loss per 100 evaluation steps for bert 1: 0.0005527503089979291
Validation loss per 100 evaluation steps for bert 2: 0.00018127560906577855
Validation loss per 100 evaluation steps for bert 3: 0.0001076681146514602
Validation loss per 100 evaluation steps for bert 1: 0.12272651051228756
Validation loss per 100 evaluation steps for bert 2: 0.14219734729005434
Validation loss per 100 evaluation steps for bert 3: 0.15099737038286945
Validation loss per 100 evaluation steps for bert 1: 0.13761617202805215
Validation loss per 100 evaluation steps for bert 2: 0.13794038675698528
Validation loss per 100 evaluation steps for bert 3: 0.17454892652298512
Validation loss per 100 evaluation steps for bert 1: 0.13316019640193535
Validation loss per 100 evaluation steps for bert 2: 0.13849709209910807
Validation loss per 100 evaluation steps for bert 3: 0.1677743196671864
Validation loss per 100 evaluation steps for bert 1: 0.15432354984828212
Validation loss per 100 evaluation steps for 

In [None]:
len(testing_loader)

In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/Final Project/Code/Baseline/Baseline Saved Model/pytorch_model.bin'))

In [None]:
 # valid() takes three models plus the loader; pass the single restored model in
 # all three slots so the vote reduces to that model's own predictions.
 labels, predictions = valid(model, model, model, testing_loader)

In [None]:
# Invert the (tag, id) pairs into an id -> tag lookup for decoding predictions.
New_NerDict = {tag_id: tag for tag, tag_id in Ner}
New_NerDict

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [None]:
from sklearn.metrics import classification_report

# Decode ids back to tag strings. The values stay wrapped in an outer list
# (one sequence) because the seqeval report in the following cell reads them
# in that nested form.
labels_value = [[New_NerDict[i.item()] for i in labels]]
pred_value = [[New_NerDict[i] for i in predictions]]

# scikit-learn's report works on flat per-token label lists; handing it the
# nested lists raises ValueError.
print(classification_report(labels_value[0], pred_value[0]))

ValueError: ignored

In [None]:
from seqeval.metrics import classification_report

print(classification_report(labels_value, pred_value))

### 3) Save Model

In [None]:
import os

directory = "./model"

# exist_ok makes this a no-op when the folder is already there
os.makedirs(directory, exist_ok=True)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')

In [None]:
#torch.save(model, 'model.pth')

#torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
