In [143]:
import sys
sys.path.append('../')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import warnings
warnings.filterwarnings('ignore')
from data.pipe import BartNERPipe
from model.bart import BartSeq2SeqModel
import fitlog
import random
import numpy as np

import torch
from fastNLP import Trainer
from model.metrics import Seq2SeqSpanMetric
from model.losses import Seq2SeqLoss
from torch import optim
from fastNLP import BucketSampler, GradientClipCallback, cache_results

from model.callbacks import WarmupCallback
from fastNLP.core.sampler import SortedSampler
from model.generater import SequenceGeneratorModel
from fastNLP.core.sampler import  ConstTokenNumSampler
from model.callbacks import FitlogCallback

# Put fitlog into debug mode, then tell it which directory to use for logs.
# NOTE(review): fitlog.debug() is understood to turn later fitlog calls into
# no-ops, which would make the log directory below unused — confirm against
# the fitlog documentation before relying on logged hyper-parameters.
fitlog.debug()
fitlog.set_log_dir('logs')

import argparse

# Command-line interface of the original training script; inside the notebook
# only the default of --dataset_name is ever used.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--dataset_name',
    type=str,
    default='en_ace04',
)

def set_seed(seed=1996):
    """Seed the Python, NumPy and PyTorch random generators and pin cuDNN settings.

    Args:
        seed: Integer seed handed to every generator (default 1996).

    Side effects:
        Prints the seed, writes ``PYTHONHASHSEED`` into ``os.environ``, and
        sets ``torch.backends.cudnn.benchmark = False`` and
        ``torch.backends.cudnn.deterministic = True``.
    """
    print("[SET SEED]: ", seed)

    # Each of these callables accepts the seed as its single argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only affects
    # processes started afterwards, not this interpreter — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Build the run configuration: start from the parser defaults (an empty argv is
# passed because the notebook has no command line), attach the remaining
# hyper-parameters to `args`, then apply the per-dataset overrides.
args = parser.parse_args([])
dataset_name = args.dataset_name
args.length_penalty = 1
args.save_model = 1

# target_type options --
#   word: generate the start of each word; bpe: generate all BPEs;
#   span: generate each segment as start/end;
#   span_bpe: each segment is all BPEs of the start followed by all BPEs of the end
args.target_type = 'word'
args.bart_name = '/disk1/wxl/Desktop/DeepKE/example/ner/huggingface/bart-large'
args.schedule = 'linear'
args.decoder_type = 'avg_feature'
args.n_epochs = 30
args.num_beams = 1
args.batch_size = 16
args.use_encoder_mlp = 1
args.lr = 1e-5
args.warmup_ratio = 0.01
eval_start_epoch = 1

# Default seed (same value as set_seed's own default). Only the en_ace04 branch
# overrides it; without this default every other dataset hit a NameError at
# set_seed(seed) below.
seed = 1996

# the following hyper-parameters are for target_type=word
# NOTE(review): a dataset name that matches no branch leaves max_len and
# max_len_a undefined.
if dataset_name == 'conll2003':  # three runs get 93.18/93.18/93.36 F1
    max_len, max_len_a = 10, 0.6
elif dataset_name == 'en-ontonotes':  # three runs get 90.46/90.4/90.52 F1
    max_len, max_len_a = 10, 0.8
elif dataset_name == 'CADEC':
    max_len, max_len_a = 10, 1.6
    args.num_beams = 4
    args.lr = 2e-5
    args.n_epochs = 30
    eval_start_epoch = 10
elif dataset_name == 'Share_2013':
    max_len, max_len_a = 10, 0.6
    args.use_encoder_mlp = 0
    args.num_beams = 4
    args.lr = 2e-5
    eval_start_epoch = 5
elif dataset_name == 'Share_2014':
    max_len, max_len_a = 10, 0.6
    args.num_beams = 4
    eval_start_epoch = 5
    args.n_epochs = 30
elif dataset_name == 'genia':  # three runs: 79.29/79.13/78.75
    max_len, max_len_a = 10, 0.5
    args.target_type = 'span'
    args.lr = 2e-5
    args.warmup_ratio = 0.01
elif dataset_name == 'en_ace04':  # four runs: 86.84/86.33/87/87.17
    max_len, max_len_a = 50, 1.1
    args.n_epochs = 55
    args.batch_size = 48
    args.lr = 4e-5
    seed = 4373
elif dataset_name == 'en_ace05':  # three runs: 85.39/84.54/84.75
    max_len, max_len_a = 50, 0.7
    args.lr = 3e-5
    args.batch_size = 12
    args.num_beams = 4
    args.warmup_ratio = 0.1

# Seed all random generators with the dataset-specific (or default) seed.
set_seed(seed)
# with open("/disk1/wxl/Desktop/DeepKE/example/baseline/BARTNER/loss_log/D.json","r") as f:
#     b=f.readlines()
# with open("/disk1/wxl/Desktop/DeepKE/example/baseline/BARTNER/loss_log/E.json","r") as f:
#     c=f.readlines()
# for x,y in zip(b,c):
#     assert x==y,print(x,y)
# exit()

# Copy the run settings from `args` into plain module-level names.
# `save_model` is taken off `args` first, so it is not part of what gets
# passed to fitlog.add_hyper below.
save_model = args.save_model
delattr(args, 'save_model')

lr, n_epochs = args.lr, args.n_epochs
batch_size, num_beams = args.batch_size, args.num_beams
length_penalty = args.length_penalty

# A decoder type given as the string "none" (any casing) is stored as None.
decoder_type = args.decoder_type
if isinstance(decoder_type, str) and decoder_type.lower() == 'none':
    decoder_type = None
    args.decoder_type = None

target_type, bart_name = args.target_type, args.bart_name
schedule, use_encoder_mlp = args.schedule, args.use_encoder_mlp

# Hand the remaining hyper-parameters to fitlog.
fitlog.add_hyper(args)

#######hyper
#######hyper

# `demo` selects a separate cache file (suffix "_demo") and is also forwarded
# to the data pipe by get_data().
demo = False

# Cache file for the processed dataset. bart_name is embedded verbatim; it is
# an absolute path in this notebook, so the resulting cache path contains that
# path's directory separators.
cache_fn = (
    f"caches/data_{bart_name}_{dataset_name}_{target_type}_demo.pt"
    if demo
    else f"caches/data_{bart_name}_{dataset_name}_{target_type}.pt"
)

@cache_results(cache_fn, _refresh=False)
def get_data():
    """Process the dataset selected by ``dataset_name`` with a ``BartNERPipe``.

    The input location depends on the dataset: conll2003 uses an explicit
    mapping of split name to file, en-ontonotes uses a fixed directory (and is
    the only case that does not forward ``demo``), and every other dataset
    uses ``./data/<dataset_name>``.

    Returns:
        A 4-tuple ``(pipe, data_bundle, pipe.tokenizer, pipe.mapping2id)``.
        The function is wrapped by ``cache_results`` with ``cache_fn``.
    """
    ner_pipe = BartNERPipe(tokenizer=bart_name, dataset_name=dataset_name, target_type=target_type)

    if dataset_name == 'en-ontonotes':
        # This branch calls process_from_file without the demo flag.
        bundle = ner_pipe.process_from_file('./data/en-ontonotes/english')
        return ner_pipe, bundle, ner_pipe.tokenizer, ner_pipe.mapping2id

    if dataset_name == 'conll2003':
        source = {'test': "./data/conll2003/test.txt",
                  'train': "./data/conll2003/train.txt",
                  'dev': "./data/conll2003/dev.txt"}
    else:
        source = f'./data/{dataset_name}'

    bundle = ner_pipe.process_from_file(source, demo=demo)
    return ner_pipe, bundle, ner_pipe.tokenizer, ner_pipe.mapping2id


pybuilddir.txt
pybuilddir.txt
/usr/share/zoneinfo/UTC
/usr/lib/ssl/certs/ca-certificates.crt
[SET SEED]:  4373


In [2]:
# Run (or read from cache) the data processing and unpack the four values
# that get_data() returns.
pipe, data_bundle, tokenizer, mapping2id = get_data()
# Keep the training split; the later cells iterate over it.
ds = data_bundle.get_dataset("train")
# Last expression of the cell: shows how many training instances there are.
len(ds)

Read cache from caches/data_/disk1/wxl/Desktop/DeepKE/example/ner/huggingface/bart-large_en_ace04_word.pt.


6195

In [3]:
from fastNLP import DataSet,DataSetIter
from fastNLP import SequentialSampler
# Alternative data sources tried earlier, kept disabled:
# ds_ = DataSet()
# ds_ =ds.load("/disk1/wxl/Desktop/DeepKE/example/baseline/BARTNER/caches/ds_final_11_18.pt")
# ds_ = data_bundle.get_dataset('test')
# Batch iterator over the training split `ds`. The batch size of 48 is the
# same value the en_ace04 configuration assigns to args.batch_size.
# NOTE(review): SequentialSampler presumably keeps the stored instance order
# (no shuffling) — confirm against the fastNLP documentation.
sampler = SequentialSampler()
batch = DataSetIter(batch_size=48, dataset=ds, sampler=sampler)
# model2.eval()

In [4]:
# Load a complete saved model object from a hard-coded absolute path and move
# it to the GPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
# NOTE(review): the checkpoint directory is named en_ace04_3257 while this
# notebook seeds with 4373 — presumably a different run; confirm that this is
# the intended checkpoint.
model2=torch.load("/disk1/wxl/Desktop/DeepKE/example/baseline/BARTNER/save_models/en_ace04_3257/best_SequenceGeneratorModel_f_2024-11-18-22-16-54-082796").to('cuda')
# Switch the model to training mode; as the cell's last expression this also
# displays the module structure.
model2.train()

SequenceGeneratorModel(
  (seq2seq_model): BartSeq2SeqModel(
    (encoder): FBartEncoder(
      (bart_encoder): BartEncoder(
        (embed_tokens): Embedding(50277, 1024)
        (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
        (layers): ModuleList(
          (0-11): 12 x EncoderLayer(
            (self_attn): Attention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise

In [5]:
# Replay every batch of the training iterator through the loaded model, print
# the loss of each batch and back-propagate it.
losser = Seq2SeqLoss(max_type_id=9)
device = 'cuda'
# Entries of the batch input dict that the model's forward call receives.
model_input_keys = ("src_tokens", "tgt_tokens", "src_seq_len", "tgt_seq_len", "first")

# Print tensors without truncation; loop-invariant, so configured once.
torch.set_printoptions(threshold=float('Inf'), linewidth=120)

for i in batch:
    # i[0] is the dict of input fields; move each needed tensor to the GPU once.
    inputs = {key: i[0][key].to(device) for key in model_input_keys}
    pred = model2(**inputs)["pred"]

    # Errors from the loss computation are deliberately not caught, so they
    # surface with their real traceback instead of being masked.
    x = losser.get_loss(inputs["tgt_tokens"], inputs["tgt_seq_len"], pred)
    print(x)
    # NOTE(review): nothing zeroes the gradients between iterations, so they
    # accumulate on the model's parameters across batches.
    x.backward()



tensor(0.2806, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2363, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.3205, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2595, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2931, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.1913, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.3844, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2923, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2923, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.4129, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.3870, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.3671, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.2767, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.4033, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor

In [129]:

import torch
import torch.nn.functional as F
# Scratch experiment: turn a random (3, 8) score matrix into padded per-row
# index sequences.
#   column 1     -> EOS indicator (threshold > 1)
#   columns 2..3 -> tag, stored as the arg-max position shifted by 2
#   columns 4..7 -> selected positions (score > 0.5), reported as column index
x = torch.rand(3, 8)
print(x)

eos = x[:, 1] > 1
tag = x[:, 2:4].argmax(dim=-1) + 2
mask = x[:, 4:] > 0.5
rows, cols = torch.where(mask)
print(rows, cols)

eos_element = torch.tensor([1])
word = []
max_len = 1
for row_idx, row_is_eos in enumerate(eos):
    if row_is_eos:
        word.append(eos_element)
        continue
    # Selected columns (shifted back to absolute indices) followed by the tag.
    candidate = torch.cat((cols[rows == row_idx] + 4, tag[row_idx:row_idx + 1]))
    # An entity type is present and the row is not EOS, but the entity content
    # is empty (only the tag was collected), so treat the row as EOS.
    word.append(eos_element if len(candidate) == 1 else candidate)
    max_len = max(max_len, len(candidate))

# Right-pad every sequence with -1 to the longest length and stack them.
word = torch.cat([
    F.pad(seq, (0, max_len - len(seq)), mode='constant', value=-1).unsqueeze(0)
    for seq in word
])
print(tag, word.squeeze(-1).shape, eos)
print(max_len)

tensor([[0.1247, 0.1124, 0.8228, 0.2431, 0.8098, 0.4280, 0.2890, 0.8662],
        [0.6910, 0.6694, 0.3225, 0.4668, 0.6682, 0.7839, 0.3350, 0.6885],
        [0.0898, 0.1108, 0.8681, 0.3213, 0.4025, 0.1291, 0.2293, 0.8221]])
tensor([0, 0, 1, 1, 1, 2]) tensor([0, 3, 0, 1, 3, 3])
tensor([2, 3, 2]) torch.Size([3, 4]) tensor([False, False, False])
4


In [142]:
# Scratch experiment: mark the tail of the first three rows with -1, then
# drop those markers row by row.
x = torch.rand(4, 5)
for row_idx, first_pad in enumerate((2, 3, 4)):
    x[row_idx, first_pad:] = -1

# One 1-D tensor per row containing only the entries that are not -1.
result = [row[row != -1] for row in x]
print(x, torch.cat(result[:2], dim=-1))
# squeeze(1) leaves the shape unchanged here because dimension 1 has size 5.
print(x.squeeze(1).shape)

tensor([[ 0.4849,  0.7694, -1.0000, -1.0000, -1.0000],
        [ 0.4539,  0.5514,  0.8879, -1.0000, -1.0000],
        [ 0.9258,  0.9665,  0.8730,  0.9829, -1.0000],
        [ 0.7957,  0.6666,  0.4052,  0.0504,  0.5711]]) tensor([0.4849, 0.7694, 0.4539, 0.5514, 0.8879])
torch.Size([4, 5])
