In [None]:
# Install the notebook's dependencies into the runtime. scikit-learn, scipy,
# seqeval and pytorch-crf are pinned; transformers, datasets and torch are not.
!pip install transformers datasets torch scikit-learn==1.2.2 scipy==1.10.1  seqeval==1.2.2 pytorch-crf==0.7.2

In [None]:
# Mount Google Drive at /content/drive (Colab-only; requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted Drive so that relative paths
# (e.g. the "./checkpoint/" output directory) resolve inside it.
%cd /content/drive/MyDrive/BERT-BiGRU

In [None]:
import os
import json
import torch
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from torchcrf import CRF
from datasets import load_dataset
from torch.utils.data import DataLoader
from seqeval.metrics import classification_report
from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, DataCollatorForTokenClassification

In [None]:
# BIO tag vocabulary: tag string -> integer class id.
tag_2_id = {
    'B-application': 0,
    'B-cve id': 1,
    'B-edition': 2,
    'B-file': 3,
    'B-function': 4,
    'B-hardware': 5,
    'B-language': 6,
    'B-method': 7,
    'B-os': 8,
    'B-parameter': 9,
    'B-programming language': 10,
    'B-relevant_term': 11,
    'B-update': 12,
    'B-vendor': 13,
    'B-version': 14,
    'I-application': 15,
    'I-edition': 16,
    'I-hardware': 17,
    'I-os': 18,
    'I-relevant_term': 19,
    'I-update': 20,
    'I-vendor': 21,
    'I-version': 22,
    'O': 23,
}
# Inverse lookup (class id -> tag string), kept in ascending id order.
id_2_tag = {tag_id: tag for tag, tag_id in tag_2_id.items()}

In [None]:
# Tag strings listed in the iteration order of id_2_tag.
label_names = list(id_2_tag.values())

In [None]:
class NerConfig:
    """Hyper-parameters, label maps and paths for the BERT-BiGRU tagger.

    Constructing a config has one side effect: ``output_dir`` is created if it
    does not exist yet.

    Args:
        bert_dir: Model identifier or local path handed to ``from_pretrained``.
        output_dir: Directory where the run arguments and checkpoints are written.
    """

    def __init__(self, bert_dir="thongnef/bert-finetuned-ner-cti", output_dir="./checkpoint/"):
        self.bert_dir = bert_dir

        self.output_dir = output_dir
        # makedirs(exist_ok=True) also creates missing parents and does not fail
        # when the directory already exists (os.mkdir did neither safely).
        os.makedirs(self.output_dir, exist_ok=True)

        # Label vocabulary, taken from the module-level tag tables.
        self.bio_labels = label_names
        self.num_labels = len(self.bio_labels)
        self.label2id = tag_2_id
        self.id2label = id_2_tag

        # Sequence length and training schedule.
        self.max_seq_len = 512
        self.epochs = 5
        self.train_batch_size = 8
        self.dev_batch_size = 8
        # Learning rate for the BERT encoder parameters.
        self.bert_learning_rate = 2e-5
        # Learning rate for every non-BERT parameter (the name is historical).
        self.crf_learning_rate = 3e-3
        self.adam_epsilon = 1e-8
        self.weight_decay = 0.01
        # Fraction of the total optimizer steps used for linear warm-up.
        self.warmup_proportion = 0.01
        # Checkpoint interval, in optimizer steps.
        self.save_step = 500

In [None]:
import torch.nn as nn

class ModelOutput:
    """Plain container bundling a forward pass's logits, labels and optional loss."""

    def __init__(self, logits, labels, loss=None):
        # Stored exactly as received; no copying or validation is performed.
        self.logits, self.labels, self.loss = logits, labels, loss

class BertNer(nn.Module):
    """BERT encoder followed by a bidirectional GRU and a per-token linear classifier.

    Args:
        args: Configuration object providing ``bert_dir``, ``max_seq_len`` and
            ``num_labels``.
    """

    def __init__(self, args):
        super(BertNer, self).__init__()
        self.bert = BertModel.from_pretrained(args.bert_dir)
        self.bert_config = BertConfig.from_pretrained(args.bert_dir)
        hidden_size = self.bert_config.hidden_size
        self.gru_hidden = 128
        # Kept for callers that read it; forward() no longer depends on it.
        self.max_seq_len = args.max_seq_len
        # Stored on the module because `args` is not visible inside forward().
        self.num_labels = args.num_labels
        # Single-layer GRU: the `dropout` argument only acts between stacked
        # layers, so passing it here had no effect and just raised a warning.
        self.bigru = nn.GRU(hidden_size, self.gru_hidden, 1, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(self.gru_hidden * 2, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute per-token logits and, when labels are given, the training loss.

        Args:
            input_ids: Token ids, shape (batch, seq_len).
            attention_mask: 1 for real tokens and 0 for padding, shape (batch, seq_len).
            labels: Optional class ids, shape (batch, seq_len).

        Returns:
            ModelOutput whose ``logits`` have shape (batch, seq_len, num_labels),
            with ``labels`` passed through and ``loss`` set to the mean
            cross-entropy (``None`` when no labels are supplied).
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_out = bert_output[0]  # (batch, seq_len, hidden_size)
        # The GRU and the linear layer both act on the last dimension, so no
        # reshape to a fixed sequence length is required.
        seq_out, _ = self.bigru(seq_out)  # (batch, seq_len, 2 * gru_hidden)
        seq_out = self.linear(seq_out)

        loss = None
        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            flat_labels = labels.reshape(-1)
            if attention_mask is not None:
                # Padding positions carry no supervision: map their labels to
                # the criterion's ignore_index so they do not enter the loss.
                ignored = torch.full_like(flat_labels, criterion.ignore_index)
                flat_labels = torch.where(attention_mask.reshape(-1) == 1, flat_labels, ignored)
            loss = criterion(seq_out.reshape(-1, self.num_labels), flat_labels)

        return ModelOutput(seq_out, labels, loss)


In [None]:
class Trainer:
    """Minimal training / evaluation loop for a token-classification model.

    The model is expected to be callable as ``model(input_ids, attention_mask,
    labels)`` and to return an object exposing ``loss`` and ``logits``
    (per-token class scores of shape (batch, seq_len, num_labels)).

    Args:
        output_dir: Directory where ``pytorch_model_ner.bin`` is written and read.
        model: The network to train or evaluate.
        train_loader: Iterable of training batches (mappings of tensors).
        save_step: Save a checkpoint every this many optimizer steps.
        dev_loader: Stored on the instance; not used by the methods below.
        test_loader: Iterable of batches consumed by ``test``.
        optimizer: Optimizer stepped once per batch.
        schedule: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        id2label: Mapping from integer class id to tag string.
    """

    def __init__(self,
                 output_dir=None,
                 model=None,
                 train_loader=None,
                 save_step=500,
                 dev_loader=None,
                 test_loader=None,
                 optimizer=None,
                 schedule=None,
                 epochs=1,
                 device="cpu",
                 id2label=None):
        self.output_dir = output_dir
        self.model = model
        self.train_loader = train_loader
        self.dev_loader = dev_loader
        self.test_loader = test_loader
        self.epochs = epochs
        self.device = device
        self.optimizer = optimizer
        self.schedule = schedule
        self.id2label = id2label
        self.save_step = save_step
        # A Trainer built only for evaluation has no train_loader to measure.
        self.total_step = len(self.train_loader) * self.epochs if self.train_loader is not None else 0

    def _checkpoint_path(self):
        """Return the path of the single checkpoint file this trainer uses."""
        return os.path.join(self.output_dir, "pytorch_model_ner.bin")

    def _to_device(self, batch_data):
        """Return a dict with every tensor of the batch moved to ``self.device``."""
        return {key: value.to(self.device) for key, value in batch_data.items()}

    def train(self):
        """Run the training loop and save the final weights to ``output_dir``."""
        global_step = 1
        for epoch in range(1, self.epochs + 1):
            for batch_data in self.train_loader:
                self.model.train()
                batch = self._to_device(batch_data)
                output = self.model(batch["input_ids"], batch["attention_mask"], batch["labels"])
                loss = output.loss
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.schedule.step()
                print(f"【train】{epoch}/{self.epochs} {global_step}/{self.total_step} loss:{loss.item()}")
                # Check before incrementing so a checkpoint is written after
                # exactly every `save_step` completed steps.
                if global_step % self.save_step == 0:
                    torch.save(self.model.state_dict(), self._checkpoint_path())
                global_step += 1

        torch.save(self.model.state_dict(), self._checkpoint_path())

    def test(self):
        """Reload the saved checkpoint and score the model on ``test_loader``.

        Returns:
            The seqeval ``classification_report`` computed over the decoded
            tag sequences.
        """
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
        state_dict = torch.load(self._checkpoint_path(), map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        preds = []
        trues = []
        with torch.no_grad():  # inference only: no autograd graph needed
            for batch_data in tqdm(self.test_loader):
                batch = self._to_device(batch_data)
                attention_mask = batch["attention_mask"]
                labels = batch["labels"]
                output = self.model(batch["input_ids"], attention_mask, labels)
                # Logits are per-class scores; take the arg-max to get class ids.
                pred_ids = output.logits.argmax(dim=-1).cpu().numpy()
                mask = attention_mask.cpu().numpy()
                gold_ids = labels.cpu().numpy()

                for row in range(len(gold_ids)):
                    length = int(mask[row].sum())
                    # Drop the first and last attended positions, assumed to be
                    # the [CLS] / [SEP] tokens added by a BERT tokenizer.
                    pred_tags = []
                    true_tags = []
                    for pred_id, gold_id in zip(pred_ids[row][1:length - 1], gold_ids[row][1:length - 1]):
                        if gold_id == -100:
                            # Position explicitly excluded from supervision.
                            continue
                        pred_tags.append(self.id2label[int(pred_id)])
                        true_tags.append(self.id2label[int(gold_id)])
                    preds.append(pred_tags)
                    trues.append(true_tags)

        report = classification_report(trues, preds, digits=7)
        return report

In [None]:
def build_optimizer_and_scheduler(args, model, t_total):
    """Create the AdamW optimizer and linear warm-up/decay schedule.

    Parameters are split into four groups: BERT vs. everything else (each with
    its own learning rate), and within each, parameters that receive weight
    decay vs. biases / LayerNorm weights that do not.

    Args:
        args: Config providing ``weight_decay``, ``bert_learning_rate``,
            ``crf_learning_rate``, ``adam_epsilon`` and ``warmup_proportion``.
        model: The model to optimize; a wrapper exposing ``.module`` is unwrapped.
        t_total: Total number of optimizer steps planned for the run.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    # Unwrap wrappers that expose the real network as `.module`.
    module = model.module if hasattr(model, "module") else model

    no_decay = ("bias", "LayerNorm.weight")

    def uses_decay(param_name):
        return not any(marker in param_name for marker in no_decay)

    bert_params = []
    other_params = []
    for name, param in module.named_parameters():
        # The first component of the dotted name identifies the sub-module.
        top_level = name.split('.')[0]
        if top_level in ("bert_module", "bert"):
            bert_params.append((name, param))
        else:
            other_params.append((name, param))

    optimizer_grouped_parameters = []
    for named_params, lr in ((bert_params, args.bert_learning_rate),
                             (other_params, args.crf_learning_rate)):
        optimizer_grouped_parameters.append(
            {"params": [p for n, p in named_params if uses_decay(n)],
             "weight_decay": args.weight_decay, 'lr': lr})
        optimizer_grouped_parameters.append(
            {"params": [p for n, p in named_params if not uses_decay(n)],
             "weight_decay": 0.0, 'lr': lr})

    # torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
    # Every group sets weight_decay explicitly, so the differing library
    # defaults for that option do not matter.
    optimizer = torch.optim.AdamW(
        optimizer_grouped_parameters, lr=args.bert_learning_rate, eps=args.adam_epsilon
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(args.warmup_proportion * t_total), num_training_steps=t_total
    )

    return optimizer, scheduler

In [None]:
#process data
def preprocess_data():
    """Load the raw dataset, tokenize it and align word-level tags to sub-tokens.

    Returns:
        Tuple ``(tokenized_datasets, data_collator)``: the tokenized dataset
        (with ``input_ids``, ``attention_mask`` and ``labels`` columns, padded
        to ``max_seq_len``) and a token-classification collator for batching.
    """
    args = NerConfig()
    raw_datasets = load_dataset("thongnef/dataset_dacn")
    tokenizer = AutoTokenizer.from_pretrained(args.bert_dir)

    # Map every "B-x" id to its "I-x" id, for tags that have an inside form.
    # Deriving it from the label table replaces a hand-written map that sent
    # "B-programming language" (10) to "I-relevant_term" (19) and left
    # "B-relevant_term" (11) without a mapping.
    begin_to_inside = {}
    for tag, tag_id in args.label2id.items():
        if tag.startswith("B-"):
            inside_id = args.label2id.get("I-" + tag[2:])
            if inside_id is not None:
                begin_to_inside[tag_id] = inside_id

    def align_labels_with_tokens(labels, word_ids):
        """Expand word-level label ids to one label id per sub-token."""
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id is None:
                # Special or padding token: no source word.
                current_word = None
                new_labels.append(-100)
            elif word_id != current_word:
                # First sub-token of a new word keeps the word's own label.
                current_word = word_id
                new_labels.append(labels[word_id])
            else:
                # Continuation sub-token: a B- tag becomes the matching I- tag.
                label = labels[word_id]
                new_labels.append(begin_to_inside.get(label, label))
        # NOTE(review): special/padding positions are relabelled 0, which is the
        # id of 'B-application', so any loss not masked by attention_mask will
        # treat them as real targets. Kept as-is because downstream decoding
        # looks labels up in id2label — confirm before switching to -100.
        return [0 if label == -100 else label for label in new_labels]

    def tokenize_and_align_labels(examples):
        """Tokenize a batch of pre-split sentences and attach aligned labels."""
        tokenized_inputs = tokenizer(
            examples["words"],
            truncation=True,
            is_split_into_words=True,
            padding="max_length",
            # Pin the padded length to the configured value instead of relying
            # on the tokenizer's own default maximum.
            max_length=args.max_seq_len,
        )
        tokenized_inputs["labels"] = [
            align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
            for i, labels in enumerate(examples["tag"])
        ]
        return tokenized_inputs

    tokenized_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )

    # The model's forward() does not accept token_type_ids.
    tokenized_datasets = tokenized_datasets.remove_columns("token_type_ids")
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    return tokenized_datasets, data_collator


In [None]:
def main():
    """Train the BERT-BiGRU tagger end to end and print its evaluation report."""
    config = NerConfig()

    # Persist the run configuration next to the checkpoints.
    args_path = os.path.join(config.output_dir, "ner_args.json")
    with open(args_path, "w") as fp:
        json.dump(vars(config), fp, ensure_ascii=False, indent=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenized_datasets, data_collator = preprocess_data()

    def make_loader(split, shuffle, batch_size):
        # Both loaders share the collator and worker count.
        return DataLoader(
            tokenized_datasets[split],
            shuffle=shuffle,
            batch_size=batch_size,
            num_workers=2,
            collate_fn=data_collator,
        )

    train_loader = make_loader("train", True, config.train_batch_size)
    # The "test" split serves as both the dev and the test set.
    dev_loader = make_loader("test", False, config.dev_batch_size)

    model = BertNer(config)
    model.to(device)

    total_steps = len(train_loader) * config.epochs
    optimizer, schedule = build_optimizer_and_scheduler(config, model, total_steps)

    trainer = Trainer(
        output_dir=config.output_dir,
        model=model,
        train_loader=train_loader,
        dev_loader=dev_loader,
        test_loader=dev_loader,
        optimizer=optimizer,
        schedule=schedule,
        epochs=config.epochs,
        device=device,
        id2label=config.id2label,
    )

    trainer.train()
    print(trainer.test())


In [None]:
# Run the full pipeline: preprocess, train, evaluate and print the report.
main()