In [1]:
import random
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup,BertConfig
from nezha import NeZhaConfig, NeZhaModel, NeZhaForMaskedLM

In [2]:
from config import parse_args

args = parse_args()
def setup_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
setup_seed(args.seed)

In [3]:
data_path = '../dataset/train.csv'
df = pd.read_csv(data_path, delimiter="\t")
df['tag'] = df['tag'].apply(lambda x: eval(x))
df.info()

df.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6000 non-null   object
 1   tag     6000 non-null   object
dtypes: object(2)
memory usage: 93.9+ KB


Unnamed: 0,text,tag
0,会安博物馆等，漫步会安古镇各精致的工艺品店、品尝路边的小吃摊，体验当地的风土民情。,[会安古镇]
1,贝蒂斯vs西班牙人,"[贝蒂斯, 西班牙人]"
2,最终橘子熊在特种部队项目以7：2，跑跑卡丁车项目以7：1痛击曜越太阳神，,[橘子熊]
3,2008年11月22日，北京的气温陡降到零下4度，但雍和宫星光现场里“beijing,[北京]
4,光谱代理《大战略PERFECT3》繁体版,[光谱]


In [4]:
bio_list = []
for i in tqdm(range(len(df))):
    text = df['text'][i]
    tags = df['tag'][i]
    bios = ['O']*len(text)
    for t in tags:
        idx = text.find(t)
        bios[idx] = 'B-0'
        for j in range(idx+1, idx+len(t)):
            bios[j] = 'I-0'
    bio_list.append(bios)

100%|██████████| 6000/6000 [00:00<00:00, 120001.26it/s]


In [5]:
bio_list = [' '.join(i) for i in bio_list]
df['bio'] = bio_list

In [6]:

# extra_df = pd.read_csv('../阿里天池中文NLP预训练模型泛化能力评估数据集/TNEWS_train.csv')
# extra_df = pd.read_csv('../阿里天池地址解析数据集/my_train.csv')
# extra_df['tag'] = extra_df['tag'].apply(lambda x: eval(x))
# extra_df['text'] = extra_df['text'].apply(lambda x: x.rstrip().rstrip('\u200b').replace('\u200b',' '))
# extra_df.info()
# 
# extra_df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19322 entries, 0 to 19321
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    19322 non-null  object
 1   tag     19322 non-null  object
dtypes: object(2)
memory usage: 302.0+ KB


Unnamed: 0,text,tag
0,2018年去南美区看世界杯得花多少钱?,[南美区]
1,“产地办展”模式为“世界杯制造”送创新情报,[世界杯]
2,加快产城融合 以科技创新引领英超建设,[英超]
3,探秘、鸭绿江关东特大地震!,[鸭绿江]
4,花旗区:让文明新风吹进千家万户,[花旗]


In [7]:
# extra_bio_list = []
# for i in tqdm(range(len(extra_df))):
#     text = extra_df['text'][i]
#     tags = extra_df['tag'][i]
#     bios = ['O']*len(text)
#     for t in tags:
#         idx = text.find(t)
#         bios[idx] = 'B-0'
#         for j in range(idx+1, idx+len(t)):
#             bios[j] = 'I-0'
# 
#     assert len(list(text))==len(bios)
# 
#     extra_bio_list.append(bios)
# 

100%|██████████| 19322/19322 [00:00<00:00, 114329.81it/s]


In [8]:
# extra_bio_list = [' '.join(i) for i in extra_bio_list]
# extra_df['bio'] = extra_bio_list
# 
# 

In [9]:
# extra_df.head(5)
# df.head(5)




# pd.concat([df1,df2],ignore_index=True)

Unnamed: 0,text,tag,bio
0,会安博物馆等，漫步会安古镇各精致的工艺品店、品尝路边的小吃摊，体验当地的风土民情。,[会安古镇],O O O O O O O O O B-0 I-0 I-0 I-0 O O O O O O ...
1,贝蒂斯vs西班牙人,"[贝蒂斯, 西班牙人]",B-0 I-0 I-0 O O B-0 I-0 I-0 I-0
2,最终橘子熊在特种部队项目以7：2，跑跑卡丁车项目以7：1痛击曜越太阳神，,[橘子熊],O O B-0 I-0 I-0 O O O O O O O O O O O O O O O ...
3,2008年11月22日，北京的气温陡降到零下4度，但雍和宫星光现场里“beijing,[北京],O O O O O O O O O O O O B-0 I-0 O O O O O O O ...
4,光谱代理《大战略PERFECT3》繁体版,[光谱],B-0 I-0 O O O O O O O O O O O O O O O O O O


In [10]:
# x = extra_df.iloc[4997]
# x

In [11]:
# x['bio']

In [12]:
# len(list(x['text'])), len(x['bio'].split(' '))

In [13]:
# list(x['text'])

In [14]:


from sklearn.model_selection import train_test_split
train_data, valid_data = train_test_split(df, test_size = 0.2, random_state=args.seed)
train_data.index = list(range(len(train_data)))
# extra_df = extra_df[:5000]
# train_data = pd.concat([train_data, extra_df], ignore_index=True)

valid_data.index = list(range(len(valid_data)))
# print(len(train_data), len(valid_data))
print('训练集大小：',len(train_data))
print('验证集大小：',len(valid_data))

训练集大小： 9800
验证集大小： 1200


In [15]:

args.tag2idx = {'O':0, 'B-0':1, 'I-0':2}
args.idx2tag = {0: 'O', 1: 'B-0', 2:'I-0'}

In [16]:

print('pretrained save_model use'+args.bert_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_dir)

from data_helper import create_data_loader
train_data_loader = create_data_loader(train_data['text'], train_data['bio'], args, tokenizer)
valid_data_loader = create_data_loader(valid_data['text'], valid_data['bio'], args, tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


pretrained save_model use../pretrain_models/nezha-cn-base


In [17]:
print(len(train_data_loader), len(valid_data_loader))

613 75


In [18]:
def jaccard_score(pred, label):
    return len(set(pred) & set(label)) / len(set(pred) | set(label))

In [19]:
def train_epoch(model, data_loader, optimizer, args, scheduler):
    # 训练模式
    model = model.train()
    train_loss = 0
    for sample in tqdm(data_loader):
        input_ids = sample['input_ids'].to(args.device)
        attention_mask = sample['attention_mask'].to(args.device)
        label_ids = sample['label_ids'].to(args.device)
        out, loss = model(input_ids=input_ids,
                        label_ids=label_ids,
                        attention_mask=attention_mask)

        train_loss += loss.item()
        loss.backward()

        # -----------------------------------对抗攻击------------------------------------------------
        if args.use_fgm:
            # 对抗训练
            fgm.attack()  # 在embedding上添加对抗扰动
            # loss_adv = model(batch_input, batch_label)
            out, loss_adv = model(input_ids=input_ids,
                        label_ids=label_ids,
                        attention_mask=attention_mask)
            loss_adv = loss_adv.mean()
            loss_adv.backward()  # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
            fgm.restore()  # 恢复embedding参数

        if args.use_pgd:
            pgd.backup_grad()
            for t in range(K):
                pgd.attack(is_first_attack=(t == 0))
                if t != K - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()

                out, loss_adv = model(input_ids=input_ids,
                                      label_ids=label_ids,
                                      attention_mask=attention_mask)
                loss_adv = loss_adv.mean()
                loss_adv.backward()  # 反向传播，并在正常的grad基础上，累加对抗训练的梯度

            pgd.restore()


            # ----------------------------------------------------------------------------------------


        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        optimizer.zero_grad()
        scheduler.step()

        if args.ema != False:
            args.ema.update()



    return train_loss/len(data_loader)

from ark_nlp.factory.utils.conlleval import get_entity_bio
def return_entity(label):
    entity_labels = []
    for _type, _start_idx, _end_idx in get_entity_bio(label, id2label=None):
            entity_labels.append({
                'start_idx': _start_idx,
                'end_idx': _end_idx,
                'type': _type
            })
    entity_labels = [str(dic['start_idx'])+'-'+str(dic['end_idx']) for dic in entity_labels]
    return entity_labels


def eval_epoch(model, data_loader, args):
    # 验证模式
    model = model.eval()
    if args.ema!=False:
        args.ema.apply_shadow()
    val_loss = 0
    jc_score_list = []
    # 关闭自动求导，省内存加速，因为是不是训练模式了，没必要求导
    with torch.no_grad():
        for sample in tqdm(data_loader):
            input_ids = sample['input_ids'].to(args.device)
            attention_mask = sample['attention_mask'].to(args.device)
            label_ids = sample['label_ids'].to(args.device)
            out, loss = model(input_ids=input_ids,
                        label_ids=label_ids,
                        attention_mask=attention_mask)

            val_loss += loss.item()


            predict_ids = out
            # predict_ids
            #%%
            label_ids = sample['label_ids'].numpy().tolist()

            entity_all_label_ids = []
            entity_all_predict_ids = []
            for i in range(len(label_ids)):
                tmp_label, tmp_predict = [], []
                # 因为我crf有做mask所以这里的len(len(predict_tag[i]))是不带有pad的长度
                for j in range(0, len(predict_ids[i])):
                    tmp_label.append(args.idx2tag[label_ids[i][j]])
                    tmp_predict.append(args.idx2tag[predict_ids[i][j]])
                entity_all_label_ids.append(tmp_label)
                entity_all_predict_ids.append(tmp_predict)


            for label, pred in zip(entity_all_label_ids, entity_all_predict_ids):
                label_entity = return_entity(label)
                pred_entity = return_entity(pred)
                jc_score_list.append(jaccard_score(pred=pred_entity, label=label_entity))

    return val_loss/len(data_loader), np.mean(jc_score_list)

In [20]:
# torch.cuda.is_available()

In [21]:
from model import BERT_CRF

if torch.cuda.is_available():
    args.device = 'cuda:0'
    print('使用：', args.device,' ing........')

model = BERT_CRF(args=args).to(args.device)


print('batch_size: ',args.batch_size, 'epochs: ',args.max_epochs)
num_total_steps = len(train_data_loader) * args.max_epochs
from util import build_optimizer, build_optimizer_diff_lr
optimizer, scheduler = build_optimizer(args, model, num_total_steps=num_total_steps)
# optimizer, scheduler = build_optimizer_diff_lr(args, model, num_total_steps=num_total_steps)



if args.ema==True:
    print('-'*10,'采用EMA机制训练','-'*10)
    from tricks import EMA
    args.ema = EMA(model, 0.995)
    args.ema.register()

if args.use_fgm==True:
    print('-' * 10, '采用FGM对抗训练', '-' * 10)
    from tricks import FGM
    # 初始化
    fgm = FGM(model)

if args.use_pgd==True:
    print('-' * 10, '采用PGD对抗训练', '-' * 10)
    from tricks import PGD
    # 初始化
    pgd = PGD(model=model)
    K = 3

使用： cuda:0  ing........
batch_size:  16 epochs:  6
learning_rate:  5e-05
num_training_steps:  3678
warmup_steps:  220.67999999999998
---------- 采用EMA机制训练 ----------
---------- 采用FGM对抗训练 ----------


Some weights of the model checkpoint at ../pretrain_models/nezha-cn-base were not used when initializing NeZhaModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing NeZhaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaModel were not initialized from the model checkpoint at ../pretrain_models/nezha-cn-base and are newly initialized: 

In [22]:
best_jc_score = 0
for epoch in range(args.max_epochs):
    print('——'*10, f'Epoch {epoch + 1}/{args.max_epochs}', '——'*10)
    train_loss = train_epoch(model, train_data_loader, optimizer, args, scheduler)
    # #scheduler.step()
    # print('-'*20)
    print(f'Train loss : {round(train_loss, 2)}\n')
    val_loss, jc_score = eval_epoch(model, valid_data_loader, args)



    if jc_score>best_jc_score:
        best_jc_score = jc_score
        print(f'val loss : {round(val_loss, 3)}')
        print(f"jc_score: {round(jc_score, 3)}")
        print('-'*20)
        torch.save(model.state_dict(), './save_model/best_model.pth')
        print('+'*6,'best save_model saved','+'*6)

    if args.ema != False:
        args.ema.restore()

———————————————————— Epoch 1/6 ————————————————————
Train loss : 102.84

val loss : 86.103
jc_score: 0.685
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 2/6 ————————————————————
Train loss : 33.73

val loss : 83.367
jc_score: 0.718
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 3/6 ————————————————————
Train loss : 19.78

———————————————————— Epoch 4/6 ————————————————————
Train loss : 12.62

val loss : 101.338
jc_score: 0.726
--------------------
++++++ best save_model saved ++++++
———————————————————— Epoch 5/6 ————————————————————


100%|██████████| 613/613 [03:28<00:00,  2.93it/s]
100%|██████████| 75/75 [00:05<00:00, 14.91it/s]
100%|██████████| 613/613 [03:37<00:00,  2.82it/s]
100%|██████████| 75/75 [00:05<00:00, 13.79it/s]
100%|██████████| 613/613 [03:39<00:00,  2.79it/s]
100%|██████████| 75/75 [00:05<00:00, 12.84it/s]
100%|██████████| 613/613 [03:47<00:00,  2.69it/s]
100%|██████████| 75/75 [00:05<00:00, 13.29it/s]
 23%|██▎       | 143/613 [00:53<02:56,  2.67it/s]


KeyboardInterrupt: 