In [1]:
import random
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup,BertConfig
from ark_nlp.model.ner.global_pointer_bert import Tokenizer
from ark_nlp.model.ner.global_pointer_bert import Dataset as Dt
from ark_nlp.factory.utils.conlleval import get_entity_bio
from model import GlobalPointer, GlobalPointerNERPredictor, GlobalPointerCrossEntropy

In [2]:
from config import parse_args

# Parse the run configuration once; later cells read fields such as
# args.seed, args.bert_dir, args.device and the ema / use_fgm / use_pgd switches.
args = parse_args()
def setup_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``, in that order.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
# Seed the RNGs before any data splitting or model construction happens.
setup_seed(args.seed)

In [3]:
# Load the tab-separated training set; per the info() output it holds a
# 'text' column and a 'tag' column.
data_path = '../dataset/train.csv'
df = pd.read_csv(data_path, delimiter="\t")
# Each 'tag' cell is read as a string and turned back into a Python object here.
# NOTE(review): eval() executes whatever expression the CSV cell contains. If the
# file is not fully trusted, ast.literal_eval would be the safer parser -- confirm
# the cells are plain list literals before switching.
df['tag'] = df['tag'].apply(lambda x: eval(x))
df.info()

df.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6000 non-null   object
 1   tag     6000 non-null   object
dtypes: object(2)
memory usage: 93.9+ KB


Unnamed: 0,text,tag
0,会安博物馆等，漫步会安古镇各精致的工艺品店、品尝路边的小吃摊，体验当地的风土民情。,[会安古镇]
1,贝蒂斯vs西班牙人,"[贝蒂斯, 西班牙人]"
2,最终橘子熊在特种部队项目以7：2，跑跑卡丁车项目以7：1痛击曜越太阳神，,[橘子熊]
3,2008年11月22日，北京的气温陡降到零下4度，但雍和宫星光现场里“beijing,[北京]
4,光谱代理《大战略PERFECT3》繁体版,[光谱]


In [4]:
# Build one character-level BIO tag sequence per row. There is a single entity
# type, so the tags are 'B-0' (first character), 'I-0' (continuation) and 'O'.
bio_list = []
for i in tqdm(range(len(df))):
    text = df['text'][i]
    tags = df['tag'][i]
    bios = ['O'] * len(text)
    for t in tags:
        # Only the first occurrence of the entity string in the text is labelled.
        idx = text.find(t) if t else -1
        if idx < 0:
            # str.find returns -1 for a missing entity. Indexing with -1 would
            # silently mark the LAST character as 'B-0' and the leading
            # characters as 'I-0', so empty or unmatched tags are skipped.
            continue
        bios[idx] = 'B-0'
        for j in range(idx + 1, idx + len(t)):
            bios[j] = 'I-0'
    bio_list.append(bios)

100%|██████████| 6000/6000 [00:00<00:00, 117641.29it/s]


In [5]:
# bio_list

In [6]:
# Attach the per-character BIO sequences (one list per row) as a new column.
df['BIO'] = bio_list

In [7]:
# Spot-check the first sentence.
df['text'][0]

'会安博物馆等，漫步会安古镇各精致的工艺品店、品尝路边的小吃摊，体验当地的风土民情。'

In [8]:
# Turn every row's BIO sequence into a list of entity-span dicts
# (inclusive start/end character offsets, type, and the matched substring).
all_entity_labels = []

for row_idx in range(len(df)):
    row_text = df['text'][row_idx]
    row_spans = get_entity_bio(df['BIO'][row_idx], id2label=None)
    all_entity_labels.append([
        {
            'start_idx': span_start,
            'end_idx': span_end,
            'type': span_type,
            # end_idx is inclusive, hence the +1 on the slice bound.
            'entity': row_text[span_start: span_end + 1],
        }
        for span_type, span_start, span_end in row_spans
    ])

In [9]:
# Store the per-row entity-span dict lists in a 'label' column.
df['label'] = all_entity_labels

In [10]:
# 'tag' is no longer needed once the 'BIO' and 'label' columns exist.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone. (axis is not needed
# when columns= is given.)
df = df.drop(columns=['tag'], errors='ignore')
df.head(5)

Unnamed: 0,text,BIO,label
0,会安博物馆等，漫步会安古镇各精致的工艺品店、品尝路边的小吃摊，体验当地的风土民情。,"[O, O, O, O, O, O, O, O, O, B-0, I-0, I-0, I-0...","[{'start_idx': 9, 'end_idx': 12, 'type': '0', ..."
1,贝蒂斯vs西班牙人,"[B-0, I-0, I-0, O, O, B-0, I-0, I-0, I-0]","[{'start_idx': 0, 'end_idx': 2, 'type': '0', '..."
2,最终橘子熊在特种部队项目以7：2，跑跑卡丁车项目以7：1痛击曜越太阳神，,"[O, O, B-0, I-0, I-0, O, O, O, O, O, O, O, O, ...","[{'start_idx': 2, 'end_idx': 4, 'type': '0', '..."
3,2008年11月22日，北京的气温陡降到零下4度，但雍和宫星光现场里“beijing,"[O, O, O, O, O, O, O, O, O, O, O, O, B-0, I-0,...","[{'start_idx': 12, 'end_idx': 13, 'type': '0',..."
4,光谱代理《大战略PERFECT3》繁体版,"[B-0, I-0, O, O, O, O, O, O, O, O, O, O, O, O,...","[{'start_idx': 0, 'end_idx': 1, 'type': '0', '..."


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the shuffle is seeded from args.seed.
train_data, valid_data = train_test_split(df, test_size=0.2, random_state=args.seed)
# reset_index(drop=True) returns a new, independent DataFrame with a fresh
# 0..n-1 index. Assigning columns on that object (rather than on the slice
# handed back by train_test_split) avoids pandas' SettingWithCopyWarning.
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
# Labels are stored in their string form from here on.
train_data['label'] = train_data['label'].apply(str)
valid_data['label'] = valid_data['label'].apply(str)
print('训练集大小：',len(train_data))
print('验证集大小：',len(valid_data))

训练集大小： 4800
验证集大小： 1200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:

# args.tag2idx = {'O':0, 'B-0':1, 'I-0':2}
# args.idx2tag = {0: 'O', 1: 'B-0', 2:'I-0'}

In [13]:
# Entity categories handed to both ark_nlp datasets.
label_list = ['0', 'O']
# Value passed as max_seq_len to the ark_nlp Tokenizer wrapper.
MAX_SEQ_LEN = 54

# Wrap the train / validation frames (train first, then validation).
train_dataset, dev_dataset = (
    Dt(split_df, categories=label_list) for split_df in (train_data, valid_data)
)

print('pretrained save_model use'+args.bert_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
ark_tokenizer = Tokenizer(vocab=tokenizer, max_seq_len=MAX_SEQ_LEN)

# Encode both splits with the shared tokenizer.
for _split_dataset in (train_dataset, dev_dataset):
    _split_dataset.convert_to_ids(ark_tokenizer)

# Category <-> id lookup tables exposed by the training dataset.
Ent2id, id2Ent = train_dataset.cat2id, train_dataset.id2cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].apply(lambda x: str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['text'] = data_df['text'].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: eval(x) if type(x) == str else x)


pretrained save_model use../hfl/chinese-roberta-wwm-ext


In [14]:
# Display the category -> id and id -> category mappings.
Ent2id, id2Ent

({'0': 0, 'O': 1}, {0: '0', 1: 'O'})

In [14]:
from data_helper import create_data_loader

# Collect the loader inputs by keyword so each argument is named once.
loader_kwargs = dict(
    train_data_df=train_data,
    dev_data_df=valid_data,
    ark_tokenizer=ark_tokenizer,
    args=args,
    train_dataset=train_dataset,
    dev_dataset=dev_dataset,
    bert_tokenizer=tokenizer,
)
train_data_loader, valid_data_loader = create_data_loader(**loader_kwargs)

# Number of batches per epoch for training and validation.
print(len(train_data_loader), len(valid_data_loader))

300 38


In [15]:
def jaccard_score(pred, label):
    """Return the Jaccard similarity of two collections.

    The score is ``len(intersection) / len(union)`` of the two inputs after
    converting each to a set, so duplicates are ignored.

    Args:
        pred: Iterable of hashable predicted items.
        label: Iterable of hashable gold items.

    Returns:
        A float in ``[0, 1]``. When both collections are empty the prediction
        matches the label exactly, so ``1.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    pred_set, label_set = set(pred), set(label)
    union = pred_set | label_set
    if not union:
        # Nothing predicted and nothing expected: perfect agreement.
        return 1.0
    return len(pred_set & label_set) / len(union)

In [16]:
def train_epoch(model, data_loader, optimizer, args, scheduler):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Relies on notebook-level globals: ``loss_fn`` always, ``fgm`` when
    ``args.use_fgm`` is truthy, and ``pgd`` / ``K`` when ``args.use_pgd`` is
    truthy.

    Args:
        model: Module called as ``model(token_ids, attention_mask, token_type_ids)``.
        data_loader: Iterable yielding ``(token_id, at_mask, label_id,
            token_type_ids)`` tensor tuples; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        args: Config object; ``device``, ``use_fgm``, ``use_pgd`` and ``ema``
            are read here.
        scheduler: LR scheduler stepped once per batch.

    Returns:
        Sum of the per-batch (non-adversarial) losses divided by
        ``len(data_loader)``.
    """
    # Training mode
    model = model.train()
    device = args.device
    loss_total = 0
    for token_id, at_mask, label_id, token_type_ids in tqdm(data_loader):
        # Move the batch to the target device once; each forward pass reuses it.
        inputs = (token_id.to(device), at_mask.to(device), token_type_ids.to(device))
        targets = label_id.to(device)

        clean_loss = loss_fn(model(*inputs), targets)
        loss_total += clean_loss.item()
        clean_loss.backward()

        # ------------------------- adversarial training -------------------------
        if args.use_fgm:
            # One extra forward/backward between fgm.attack() and fgm.restore();
            # its gradients are added to the ones already accumulated.
            fgm.attack()
            adv_loss = loss_fn(model(*inputs), targets).mean()
            adv_loss.backward()
            fgm.restore()
        if args.use_pgd:
            pgd.backup_grad()
            last_step = K - 1
            for step in range(K):
                pgd.attack(is_first_attack=(step == 0))
                if step == last_step:
                    # Final step: bring back the gradients saved by backup_grad().
                    pgd.restore_grad()
                else:
                    model.zero_grad()
                adv_loss = loss_fn(model(*inputs), targets).mean()
                adv_loss.backward()
            pgd.restore()
        # -------------------------------------------------------------------------

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        optimizer.zero_grad()
        scheduler.step()

        # args.ema is either False or an object exposing update().
        if args.ema != False:
            args.ema.update()

    return loss_total / len(data_loader)


def return_entity(label):
    """Return the entity spans of a BIO tag sequence as ``'start-end'`` strings.

    Args:
        label: Tag sequence passed to ``get_entity_bio`` with ``id2label=None``.

    Returns:
        One ``'<start_idx>-<end_idx>'`` string per span reported by
        ``get_entity_bio``; the entity type is discarded.
    """
    return [
        '-'.join((str(span_start), str(span_end)))
        for _span_type, span_start, span_end in get_entity_bio(label, id2label=None)
    ]


def eval_epoch(model, data_loader, args):
    """Evaluate ``model`` on ``data_loader``.

    Relies on the notebook-level globals ``loss_fn`` and ``jaccard_score``.
    When ``args.ema`` is not ``False``, ``args.ema.apply_shadow()`` is called
    first and is NOT undone here; the caller is responsible for calling
    ``args.ema.restore()`` afterwards.

    Args:
        model: Module called as ``model(token_ids, attention_mask, token_type_ids)``.
        data_loader: Iterable yielding ``(token_id, at_mask, label_id,
            token_type_ids)`` tensor tuples; must support ``len()``.
        args: Config object; ``device`` and ``ema`` are read here.

    Returns:
        ``(mean_loss, mean_jaccard)``: the summed batch loss divided by
        ``len(data_loader)``, and the mean of the per-batch Jaccard scores.
    """
    # Evaluation mode
    model = model.eval()
    if args.ema != False:
        args.ema.apply_shadow()

    device = args.device
    loss_total = 0
    batch_scores = []
    # No gradients are needed for evaluation; disabling autograd saves memory and time.
    with torch.no_grad():
        for token_id, at_mask, label_id, token_type_ids in tqdm(data_loader):
            logits = model(token_id.to(device), at_mask.to(device), token_type_ids.to(device))
            targets = label_id.to(device)
            loss_total += loss_fn(logits, targets).item()

            logits_np = logits.cpu().numpy()
            targets_np = targets.cpu().numpy()
            # Every index tuple with a positive value counts as a span. The
            # 4-way unpacking requires 4-D arrays -- presumably
            # (batch, entity type, start, end); confirm against GlobalPointer.
            pred = [(b, l, start, end) for b, l, start, end in zip(*np.where(logits_np > 0))]
            true = [(b, l, start, end) for b, l, start, end in zip(*np.where(targets_np > 0))]

            batch_scores.append(jaccard_score(pred=pred, label=true))

    return loss_total / len(data_loader), np.mean(batch_scores)

In [17]:
# Prefer the first GPU when CUDA is available; otherwise args.device keeps
# whatever value parse_args() supplied.
if torch.cuda.is_available():
    args.device = 'cuda:0'
    print('使用：', args.device,' ing........')


# One output head per entity category in Ent2id; 64 is the third positional argument.
model = GlobalPointer(args, len(Ent2id), 64).to(args.device)  # (args, ent_type_size, inner_dim) -- presumably; confirm against model.GlobalPointer


print('batch_size: ',args.batch_size, 'epochs: ',args.max_epochs)
# The scheduler is stepped once per batch in train_epoch, so the total step
# count is batches-per-epoch times epochs.
num_total_steps = len(train_data_loader) * args.max_epochs
from util import build_optimizer
optimizer, scheduler = build_optimizer(args, model, num_total_steps=num_total_steps)
# Read as a global by train_epoch and eval_epoch.
loss_fn = GlobalPointerCrossEntropy().to(args.device)



# NOTE(review): args.ema is a boolean flag on entry and is overwritten with the
# EMA object below, so re-running this cell in the same kernel skips this branch
# while train_epoch / eval_epoch still see the old EMA object -- confirm intended.
if args.ema==True:
    print('-'*10,'采用EMA机制训练','-'*10)
    from tricks import EMA
    args.ema = EMA(model, 0.999)
    args.ema.register()

if args.use_fgm==True:
    print('-' * 10, '采用FGM对抗训练', '-' * 10)
    from tricks import FGM
    # Initialise the FGM helper (read as a global by train_epoch)
    fgm = FGM(model)

if args.use_pgd==True:
    print('-' * 10, '采用PGD对抗训练', '-' * 10)
    from tricks import PGD
    # Initialise the PGD helper; K is the number of attack steps per batch in train_epoch
    pgd = PGD(model=model)
    K = 3

使用： cuda:0  ing........
batch_size:  16 epochs:  6
learning_rate:  5e-05
num_training_steps:  1800
warmup_steps:  180.0


Some weights of the model checkpoint at ../hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
import os

# Where the best checkpoint is written. Create the directory up front so
# torch.save cannot fail after an epoch of training.
best_model_path = './save_model/best_model.pth'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

best_jc_score = 0
for epoch in range(args.max_epochs):
    print('——'*10, f'Epoch {epoch + 1}/{args.max_epochs}', '——'*10)
    train_loss = train_epoch(model, train_data_loader, optimizer, args, scheduler)
    print(f'Train loss : {round(train_loss, 2)}\n')
    val_loss, jc_score = eval_epoch(model, valid_data_loader, args)

    # Report validation metrics for every epoch, not only for improving ones,
    # so regressions are visible in the log.
    print(f'val loss : {round(val_loss, 3)}')
    print(f"jc_score: {round(jc_score, 3)}")
    print('-'*20)

    if jc_score > best_jc_score:
        best_jc_score = jc_score
        # eval_epoch has called args.ema.apply_shadow() when EMA is enabled and
        # restore() only runs below, so the weights saved here are presumably
        # the EMA-averaged ones.
        torch.save(model.state_dict(), best_model_path)
        print('+'*6,'best save_model saved','+'*6)

    # Undo apply_shadow() from eval_epoch before the next training epoch.
    if args.ema != False:
        args.ema.restore()

———————————————————— Epoch 1/6 ————————————————————
Train loss : 2.04

val loss : 0.69
jc_score: 0.688
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 2/6 ————————————————————
Train loss : 0.56

val loss : 0.63
jc_score: 0.708
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 3/6 ————————————————————
Train loss : 0.37

val loss : 0.733
jc_score: 0.73
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 4/6 ————————————————————
Train loss : 0.24

val loss : 0.855
jc_score: 0.731
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 5/6 ————————————————————
Train loss : 0.17

val loss : 0.903
jc_score: 0.733
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 6/6 ————————————————————
Train loss : 0.11

val loss : 1.064
jc_score: 0.741
--------------------
++++++ best save_model saved ++++++


  self.ark_data[index]['label_ids'].to_dense()), torch.tensor(self.ark_data[index]['token_type_ids'],
100%|██████████| 300/300 [00:34<00:00,  8.67it/s]
100%|██████████| 38/38 [00:02<00:00, 14.04it/s]
100%|██████████| 300/300 [00:40<00:00,  7.46it/s]
100%|██████████| 38/38 [00:03<00:00, 12.28it/s]
100%|██████████| 300/300 [00:47<00:00,  6.29it/s]
100%|██████████| 38/38 [00:03<00:00, 11.54it/s]
100%|██████████| 300/300 [00:51<00:00,  5.78it/s]
100%|██████████| 38/38 [00:03<00:00, 10.54it/s]
100%|██████████| 300/300 [00:53<00:00,  5.64it/s]
100%|██████████| 38/38 [00:03<00:00, 10.71it/s]
100%|██████████| 300/300 [00:55<00:00,  5.38it/s]
100%|██████████| 38/38 [00:04<00:00,  9.24it/s]
