In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoModelForPreTraining
from transformers import AutoTokenizer,BertTokenizerFast,AutoModel,BertTokenizer
from torch import nn
from torch.optim import AdamW
import pandas as pd
import numpy as np
import torch
import gc
from itertools import repeat
from tqdm import tqdm
from sklearn.model_selection import KFold
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from model.modeling_nezha import NeZhaPreTrainedModel,NeZhaModel,NeZhaForTokenClassification
from model.configuration_nezha import NeZhaConfig
import  torch.nn.functional as F
import random
from transformers.models.bert.modeling_bert import (
    BertOutput,
    BertPooler,
    BertSelfOutput,
    BertIntermediate,
    BertOnlyMLMHead,
    BertOnlyNSPHead,
    BertPreTrainingHeads,
    BERT_START_DOCSTRING,
    BERT_INPUTS_DOCSTRING,
)

In [2]:
class NeZhaForSequenceClassification(NeZhaPreTrainedModel):
    def __init__(self, config,model_name,num_labels1,num_labels2):
        super().__init__(config)
        self.num_labels1 = num_labels1
        self.num_labels2=num_labels2
        self.bert = NeZhaModel.from_pretrained(model_name)
        self.embeddings=self.bert.embeddings
        self.attn1=Attn(config.hidden_size)
        self.attn2=Attn(config.hidden_size)

        self.dropout=nn.Dropout(0.1)
        self.classifier1 = nn.Linear(config.hidden_size, self.num_labels1)
        self.classifier2 = nn.Linear(config.hidden_size, self.num_labels2)
        self.classifier3=nn.Linear(config.hidden_size,30)
        self.predict=nn.Sigmoid()
#         self.init_weights()
#         if True:
#             for p in self.bert.parameters(): # 冻结所有bert层
#                 p.requires_grad = False

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
            self,
            input_ids=None,
        pair_ids=None,
    
         labels1=None,
        labels2=None,
        labels3=None
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """
        
        outputs = self.bert(
#             input_ids=input_ids,
             input_ids=pair_ids
        )        
        att1=self.attn1(outputs[0])
        att2=self.attn2(outputs[0])
        pooled_output1 = self.dropout(att1)
        pooled_output2 = self.dropout(att2)
        logits1 = self.classifier1(pooled_output1)
        logits2 = self.classifier2(pooled_output2)
 
        predict1=self.predict(logits1) # 任务一
        predict2=self.predict(logits2) # 任务二
        ### ------------- 句子编辑距离任务
        logits3=outputs[1]
#         self.bert(
#             input_ids=pair_ids
#         )[1]
        logits3= self.classifier3(logits3)
        ### --------------- 类别数任务
#         att4=self.attn4(outputs[0])
#         logits4=self.classifier4(att4)
#         att5=self.attn5(outputs[0])
#         logits5=self.classifier5(att5)
        
        outputs = (predict1,predict2) + outputs[2:]  # add hidden states and attention if they are here

        if labels1 is not None:
            loss_fct = nn.BCELoss()
            loss_fct2=nn.CrossEntropyLoss()
#                 
            loss1 = loss_fct(predict1.view(-1, self.num_labels1), labels1.view(-1, self.num_labels1))
            loss2 = loss_fct(predict2.view(-1, self.num_labels2), labels2.view(-1, self.num_labels2))
#             print(labels3)
            loss3=loss_fct2(logits3.view(-1,30),labels3.view(-1))
#             loss4=loss_fct2(logits4.view(-1,self.num_labels1),torch.sum(labels1,axis=1).long())
#             loss5=loss_fct2(logits5.view(-1,self.num_labels2),torch.sum(labels2,axis=1).long())
            loss=(loss1+loss2+loss3*0.5)
            outputs = (loss,loss1+loss2) + outputs

        return outputs  # (loss), predict1,predict2, (hidden_states), (attentions)

class SpatialDropout(nn.Module):
    """
    空间dropout，即在指定轴方向上进行dropout，常用于Embedding层和CNN层后
    如对于(batch, timesteps, embedding)的输入，若沿着axis=1则可对embedding的若干channel进行整体dropout
    若沿着axis=2则可对某些token进行整体dropout
    """
    def __init__(self, drop=0.5):
        super(SpatialDropout, self).__init__()
        self.drop = drop
        
    def forward(self, inputs, noise_shape=None):
        """
        @param: inputs, tensor
        @param: noise_shape, tuple, 应当与inputs的shape一致，其中值为1的即沿着drop的轴
        """
        outputs = inputs.clone()
        if noise_shape is None:
            noise_shape = (inputs.shape[0], *repeat(1, inputs.dim()-2), inputs.shape[-1])   # 默认沿着中间所有的shape
        
        self.noise_shape = noise_shape
        if not self.training or self.drop == 0:
            return inputs
        else:
            noises = self._make_noises(inputs)
            if self.drop == 1:
                noises.fill_(0.0)
            else:
                noises.bernoulli_(1 - self.drop).div_(1 - self.drop)
            noises = noises.expand_as(inputs)    
            outputs.mul_(noises)
            return outputs
            
    def _make_noises(self, inputs):
        return inputs.new().resize_(self.noise_shape)

class Attn(nn.Module):
    def __init__(self,hidden_size):
        super(Attn, self).__init__()
        self.attn = nn.Linear(hidden_size,1)
    def forward(self, x):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :param src_len:
            used for masking. NoneType or tensor in shape (B) indicating sequence length
        :return
            attention energies in shape (B,T)
        '''   
        att=self.attn(x)
        att=F.tanh(att)
        att=F.softmax(att,1)
        att_x=att*x
        return att_x.sum(1)

from sklearn.utils import shuffle as reset
def train_test_split(data_df, test_size=0.2, shuffle=True, random_state=None):
    if shuffle:
        data_df = reset(data_df, random_state=random_state)

    train = data_df[int(len(data_df)*test_size):].reset_index(drop = True)
    test  = data_df[:int(len(data_df)*test_size)].reset_index(drop = True)

    return train, test

from torch.nn.functional import cross_entropy,binary_cross_entropy


def eval(model, optimizer, validation_dataloader,output_model = './train_class/model.pth'):

    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            _,_,predict1,predict2 = model(*batch)
            predict1,predict2 = predict1.detach().cpu(),predict2.detach().cpu()
            label_ids1,label_ids2 = batch[2].cpu(),batch[3].cpu()
            
            tmp_eval_accuracy = binary_cross_entropy(predict1, label_ids1.float()).item()+binary_cross_entropy(predict2, label_ids2.float()).item()
            
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

    print("Validation mlogloss: {}".format(eval_accuracy / nb_eval_steps))
    global best_score
    if best_score > eval_accuracy / nb_eval_steps:
        best_score = eval_accuracy / nb_eval_steps
        save(model, optimizer,output_model)
        return 0
    return 1
def save(model, optimizer,output_model):
    # save
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)
    print('The best model has been saved')
def flat_accuracy(preds, labels):
#     print(preds,labels)
    return -np.mean(labels*np.log(preds+1.e-7)+(1-labels)*np.log(preds+1.e-7))*10

# 对抗训练
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}
    def attack(self, epsilon=1, emb_name='word_emb'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
    def restore(self, emb_name='word_emb'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name: 
                assert name in self.backup
                param.data = self.backup[name]
        self.backup.clear()
        
from collections import defaultdict
from torch.optim import Optimizer
import torch


class Lookahead(Optimizer):
    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0

    def update(self, group):
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        fast_state_dict = self.optimizer.state_dict()
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

from torch.optim.lr_scheduler import LambdaLR
class WarmupLinearSchedule(LambdaLR):
    """ Linear warmup and then linear decay.
        Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
        Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
        Linearly decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))

class AdamW(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        correct_bias=correct_bias)
        super(AdamW, self).__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p.data)
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_size = group['lr']
                if group['correct_bias']:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state['step']
                    bias_correction2 = 1.0 - beta2 ** state['step']
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)
                if group['weight_decay'] > 0.0:
                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)
        return loss
def build_optimizer(model, train_steps, learning_rate):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False, eps=1e-8)
    optimizer = Lookahead(optimizer, 5, 2)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * 0.1, t_total=train_steps)
    return optimizer, scheduler
def to_predict(model, dataloader,output_model, with_labels=False):
    
    # load model
    checkpoint = torch.load(output_model, map_location='cuda')
#     print(checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    print('-----Testing-----')

    pred_label =np.zeros((len(test),29))
    model.eval()
    for i, batch in enumerate(tqdm(dataloader)):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            predict1,predict2 = model(*batch)
            
            predict1 = predict1.detach().cpu().numpy()
#             print(predict1)
            predict2 = predict2.detach().cpu().numpy()
            predict=np.concatenate([predict1,predict2],axis=-1)
            pred_label[i*batch_size:(i+1)*batch_size]=predict
    return pred_label

# M as ascent steps, alpha as ascent step size
# X denotes input node features, y denotes labels
def flag(model, X, y, optimizer, criterion, M, alpha):
    """
    model：模型
    X：输入节点特征矩阵
    y：节点标签
    optimizer：优化器
    criterion：损失函数
    M：每个epoch的step数
    alpha：每个step的步长
    """
    model.train()
    optimizer.zero_grad()

    # pert初始化为和X形状相同、服从(-alpha, alpha)均匀分布的矩阵
    pert = torch.FloatTensor(*X.shape).uniform_(-alpha, alpha)
    # pert带有梯度
    pert.requires_grad_()
    # 为输入数据增加对抗性扰动pert
    out = model(X+pert)
    # 因为loss的梯度一直是累加的，所以每个step贡献1/M的grad值
    loss = criterion(out, y)/M

    # 每个epoch分为M个step，M个loss的grad进行累加，得到最终的loss
    for _ in range(M-1):
        loss.backward()
        # 根据pert的grad来更新pert，alpha可以看作是pert的学习率
        pert_data = pert.detach() + alpha*torch.sign(pert.grad.detach())
        pert.data = pert_data.data
        # pert梯度grad清零
        pert.grad[:] = 0
        # 重复对抗性扰动的训练过程
        out = model(X+pert)
        loss = criterion(out, y)/M

    # 通过M个step累加的grad，更新model的参数
    loss.backward()
    optimizer.step()

import contextlib    
@contextlib.contextmanager
def _disable_tracking_bn_stats(model):

    def switch_attr(m):
        if hasattr(m, 'track_running_stats'):
            m.track_running_stats ^= True
            
    model.apply(switch_attr)
    yield
    model.apply(switch_attr)


def _l2_normalize(d):
    d_reshaped = d.view(d.shape[0], -1, *(1 for _ in range(d.dim() - 2)))
    d /= torch.norm(d_reshaped, dim=1, keepdim=True) + 1e-8
    return d


class VATLoss(nn.Module):

    def __init__(self, xi=10.0, eps=1.0, ip=1):
        """VAT loss
        :param xi: hyperparameter of VAT (default: 10.0)
        :param eps: hyperparameter of VAT (default: 1.0)
        :param ip: iteration times of computing adv noise (default: 1)
        """
        super(VATLoss, self).__init__()
        self.xi = xi
        self.eps = eps
        self.ip = ip

    def forward(self, model, x):
        with torch.no_grad():
            pred1,pred2 = model(x)

        # prepare random unit tensor
        d = torch.rand(x.shape).sub(0.5).to(x.device)
        d = _l2_normalize(d)

        with _disable_tracking_bn_stats(model):
            # calc adversarial direction
            for _ in range(self.ip):
                d.requires_grad_()
                pred_hat1,pred_hat2 = model(x + self.xi * d)
                adv_distance1 = F.kl_div(pred_hat1, pred1, reduction='batchmean')
                adv_distance2 = F.kl_div(pred_hat2, pred2, reduction='batchmean')
                adv_distance=adv_distance1+adv_distance2
                adv_distance.backward()
                d = _l2_normalize(d.grad)
                model.zero_grad()
    
            # calc LDS
            r_adv = d * self.eps
            pred_hat1,pred_hat2 = model(x + r_adv)
            lds1 = F.kl_div(pred_hat1, pred1, reduction='batchmean')
            lds2 = F.kl_div(pred_hat12, pred2, reduction='batchmean')
            lds = lds1+lds2

        return lds

In [3]:
import torch
from torch.nn.modules.loss import _Loss
import torch.nn.functional as F
import torch.nn as nn
# from enum import IntEnum

def stable_kl(logit, target, epsilon=1e-6, reduce=True):
    logit = logit.view(-1, logit.size(-1)).float()
    target = target.view(-1, target.size(-1)).float()
    bs = logit.size(0)
    p = F.log_softmax(logit, 1).exp()
    y = F.log_softmax(target, 1).exp()
    rp = -(1.0/(p + epsilon) -1 + epsilon).detach().log()
    ry = -(1.0/(y + epsilon) -1 + epsilon).detach().log()
    if reduce:
        return (p* (rp- ry) * 2).sum() / bs
    else:
        return (p* (rp- ry) * 2).sum()


def generate_noise(embed, mask, epsilon=1e-5):
	#生成与embed 同尺寸方差为epsion的符合正态分布的noise
    noise = embed.data.new(embed.size()).normal_(0, 1) *  epsilon
    noise.detach()
    noise.requires_grad_()
    return noise

class SmartPerturbation():
    def __init__(self,
                 epsilon=1e-6,
                 multi_gpu_on=False,
                 step_size=0.1,
                 noise_var=1e-5,
                 norm_p='inf',
                 k=1, # 扰动次数
                 fp16=False,
                 encoder_type=1,#EncoderModelType.BERT, # 4
                 loss_map=nn.BCELoss(), # 这个是用来与taskid 对应的，不同的任务对应不同的loss，我们可以直接固定住
                 norm_level=0):
        super(SmartPerturbation, self).__init__()
        self.epsilon = epsilon 
        # eta 更新扰动后的x_i的学习率
        self.step_size = step_size
        self.multi_gpu_on = multi_gpu_on
        self.fp16 = fp16
        self.K = k
        # sigma 生成扰动噪音的方差
        self.noise_var = noise_var 
        self.norm_p = norm_p
        self.encoder_type = encoder_type 
        self.loss_map = loss_map 
        self.norm_level = norm_level > 0
#         assert len(loss_map) > 0


    def _norm_grad(self, grad, eff_grad=None, sentence_level=False):
    	# 计算梯度 以及 有效梯度的 方向
        if self.norm_p == 'l2':
            if sentence_level:
                direction = grad / (torch.norm(grad, dim=(-2, -1), keepdim=True) + self.epsilon)
            else:
                direction = grad / (torch.norm(grad, dim=-1, keepdim=True) + self.epsilon)
        elif self.norm_p == 'l1':
            direction = grad.sign()
        else:
            if sentence_level:
                direction = grad / (grad.abs().max((-2, -1), keepdim=True)[0] + self.epsilon)
            else:
                direction = grad / (grad.abs().max(-1, keepdim=True)[0] + self.epsilon)
                eff_direction = eff_grad / (grad.abs().max(-1, keepdim=True)[0] + self.epsilon)
        return direction, eff_direction

    def forward(self, model,
                logits1, # 因为我是双任务
                logits2,
                inputs, # 数据的输入
                task_id=0,
                task_type=1,#TaskType.Classification,
                pairwise=1):
        # adv training
        assert task_type in set([1,2,3,4]), 'Donot support {} yet'.format(task_type)
        

        # init delta
        # 输出 embded 
        embed = model.embeddings(inputs['input_ids']) #得到输入矩阵
        noise = generate_noise(embed, inputs['attention_mask'], epsilon=self.noise_var)
        vat_args = inputs
        vat_args.pop('input_ids',1)
        for step in range(0, self.K):
            vat_args.update({'inputs_embeds':embed + noise})
            
            # 使用加入噪音的embed 输出预测结果
            _,adv_logits1,adv_logits2 = model(**vat_args) # 双任务
            if task_type == 2: # 回归任务
            	# 回归问题使用 mse loss 评估与原始embedded输出的差异
                adv_loss = F.mse_loss(adv_logits, logits.detach(), reduction='sum')
            else:
                if task_type == 3:  # 排序任务
                    adv_logits = adv_logits.view(-1, pairwise)
                # 排序或者分类使用kl散度衡量两者之间的差异  （其他任务）
                adv_loss = stable_kl(adv_logits1, logits1.detach(), reduce=False)+stable_kl(adv_logits2, logits2.detach(), reduce=False)
            #  分布损失与 扰动之间的梯度
            delta_grad, = torch.autograd.grad(adv_loss, noise, only_inputs=True, retain_graph=False)
            # 梯度的范数
            norm = delta_grad.norm()
            if (torch.isnan(norm) or torch.isinf(norm)):
                return 0
            # 更新到主要训练过程中的梯度 为扰动与原始输出差异损失对扰动求出的梯度 乘以 扰动的学习率
            eff_delta_grad = delta_grad * self.step_size
            # 
            delta_grad = noise + delta_grad * self.step_size
            noise, eff_noise = self._norm_grad(delta_grad, eff_grad=eff_delta_grad, sentence_level=self.norm_level)
            noise = noise.detach()
            noise.requires_grad_()
        vat_args.update({'inputs_embeds':embed + noise})
        
#         vat_args.pop('input_ids',default=1)
#         adv_loss,_,_ = model(**vat_args)
        _,adv_logits1,adv_logits2 = model(**vat_args) # 双任务
        if task_type == 3: # 排序任务
            adv_logits = adv_logits.view(-1, pairwise)
#         adv_lc = self.loss_map
#         adv_loss = adv_lc(logits, adv_logits, ignore_index=-1) #其实这里就是算个Loss,我的模型会自动返回loss
        adv_loss=stable_kl(adv_logits1, logits1.detach(), reduce=False)+stable_kl(adv_logits2, logits2.detach(), reduce=False)
        return adv_loss, embed.detach().abs().mean(), eff_noise.detach().abs().mean()

In [4]:
import torch.utils.data as Data
def read_data(df,tokenizer,maxlen,with_labels):
    outputs = defaultdict(list)
    def get_label(x,num):
        label=[0]*num
       
        x=x.strip().split(' ')

        for l in x:              
            if l and l!='nan':
                label[int(l)]=1
        return label
    for index in tqdm(range(len(df))):
        sent=df.loc[index, 'sentence']
        pair_idx=random.randint(0,len(df)-1)
        pair_sent=str(df.loc[pair_idx,'sentence'])
        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        outputs['input_ids'].append(tokenizer.encode_plus(sent,
#                                       padding='max_length',  # Pad to max_length
                                      truncation=True,       # Truncate to max_length
                                      max_length=maxlen,return_token_type_ids=False,  
                                      return_tensors='pt')['input_ids'].squeeze(0)) # Return torch.Tensor objects
        
        outputs['pair_ids'].append(tokenizer.encode_plus(sent,pair_sent,
#                                       padding='max_length',  # Pad to max_length
                                      truncation=True,max_length=maxlen,  
                                      return_tensors='pt')['input_ids'].squeeze(0))
  
        if with_labels:  # True if the dataset has labels
            lab1=get_label(str(df.loc[index, 'label1']),17)
            lab2=get_label(str(df.loc[index, 'label2']),12)
            pair_lab1=get_label(str(df.loc[pair_idx, 'label1']),17)
            pair_lab2=get_label(str(df.loc[pair_idx, 'label2']),12)
            #计算编辑距离
            outputs['labels3'].append(torch.Tensor([sum([ lab1[i]!=pair_lab1[i] for i in range(len(lab1))])+ sum([ lab2[i]!=pair_lab2[i] for i in range(len(lab2))])]).long())
            outputs['labels1'].append(torch.Tensor(lab1))
            outputs['labels2'].append(torch.Tensor(lab2))
            
    return outputs
    
class CustomDataset(Data.Dataset):
    def __init__(self, data,with_labels=True):
        self.data = data  # pandas dataframe

        #Initialize the tokenizer
        self.tokenizer = tokenizer#AutoTokenizer.from_pretrained(model_name, use_fast=True)  
        self.with_labels = with_labels
        
    def __len__(self):
        return len(self.data['input_ids'])
    def __getitem__(self, index):
        
        outputs=self.data['input_ids'][index],self.data['pair_ids'][index]
        if self.with_labels:
            return self.data['input_ids'][index],self.data['pair_ids'][index],self.data['labels1'][index],self.data['labels2'][index],self.data['labels3'][index]
           
        return self.data['input_ids'][index],self.data['pair_ids'][index]
    


In [5]:
class CustomDataset(Data.Dataset):
    def __init__(self, data,tokenizer,maxlen,with_labels=True):
        self.data = data  # pandas dataframe

        #Initialize the tokenizer
        self.tokenizer = tokenizer#AutoTokenizer.from_pretrained(model_name, use_fast=True)  
        self.maxlen=maxlen
        self.with_labels = with_labels
    def get_label(self,x,num):
        label=[0]*num
       
        x=x.strip().split(' ')

        for l in x:              
            if l and l!='nan':
                label[int(l)]=1
        return label
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        outputs=[]
        sent=self.data.loc[index, 'sentence']
        pair_idx=random.randint(0,len(self.data)-1)
        pair_sent=str(self.data.loc[pair_idx,'sentence'])
        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        outputs.append(self.tokenizer.encode_plus(sent,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,       # Truncate to max_length
                                      max_length=100,return_token_type_ids=False,
                                                 return_tensors='pt')['input_ids'].squeeze(0)) # Return torch.Tensor objects
        
        outputs.append(self.tokenizer.encode_plus(sent,pair_sent,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,max_length=self.maxlen, 
                                     return_tensors='pt')['input_ids'].squeeze(0))  
        if self.with_labels:  # True if the dataset has labels
            lab1=self.get_label(str(self.data.loc[index, 'label1']),17)
            lab2=self.get_label(str(self.data.loc[index, 'label2']),12)
            pair_lab1=self.get_label(str(self.data.loc[pair_idx, 'label1']),17)
            pair_lab2=self.get_label(str(self.data.loc[pair_idx, 'label2']),12)
            #计算编辑距离          
            outputs.append(torch.Tensor(lab1))
            outputs.append(torch.Tensor(lab2))
            outputs.append(torch.Tensor([sum([ lab1[i]!=pair_lab1[i] for i in range(len(lab1))])+ sum([ lab2[i]!=pair_lab2[i] for i in range(len(lab2))])]).long())
        return outputs


In [6]:
train=pd.read_csv('../tcdata/train.csv',header=None)
# test=train.iloc[-2000:].copy().reset_index(drop=True)
# train=train.iloc[:-2000]
test=pd.read_csv('../tcdata/track1_round1_testB.csv',header=None)
# test=pd.read_csv('../tcdata/testA.csv',header=None)
model_path='../model_weight/nezha/'
output_model='../tmp/nezha.pth'
batch_size=32
# 合并训练集与测试集 制作特征
for i in range(1,3):
    train[i]=train[i].apply(lambda x:x.replace('|','').strip())
for i in range(1,2):
    test[i]=test[i].apply(lambda x:x.replace('|','').strip())
train.columns=['idx','sentence','label1','label2']
test.columns=['idx','sentence']
# test.columns=['idx','sentence','label1','label2']
tokenizer=BertTokenizerFast.from_pretrained(model_path)

config=NeZhaConfig.from_pretrained(model_path,num_labels=17,hidden_dropout_prob=0.1) # config.output_attentions=True

In [7]:
def train_model(train_set,val_set,test_oof):
    
        ###--------------------
    early_stop=0
    print("Reading training data...")
    train_loader = Data.DataLoader(train_set, batch_size=batch_size, num_workers=8, shuffle=True,pin_memory=True)
    print("Reading validation data...")
    val_loader = Data.DataLoader(val_set, batch_size=batch_size, num_workers=8, shuffle=True,pin_memory=True)
    # 准备模型
    model=NeZhaForSequenceClassification(config=config,model_name=model_path,num_labels1=17,num_labels2=12)
    ### 训练
    model.to(device)
    fgm = FGM(model)
    adv_teacher=SmartPerturbation()
# vat
#     vat_loss = VATLoss(xi=10.0, eps=1.0, ip=1)
    
    train_num = len(train_set)
    train_steps = int(train_num * epochs / batch_size) + 1

    optimizer, scheduler = build_optimizer(model, train_steps, learning_rate=6e-5)
    print('-----Training-----')
    for epoch in range(epochs):
        model.train()
        model.zero_grad()
        print('Epoch', epoch)
        for i, batch in enumerate(tqdm(train_loader)):
            batch = tuple(t.to(device) for t in batch)
            loss,r_loss,predict1,predict2 = model(*batch)    
            if i % 50 == 0:
                print(i, r_loss.item())
            optimizer.zero_grad()
            loss.backward()
            # 对抗训练
#             fgm.attack()
#             loss_adv,_, _,_  = model(*batch)
#             loss_adv.backward()
#             fgm.restore()
#----------------------------
#             smart
            # adv training
            
#             adv_inputs={'model':model,'logits1':predict1,'logits2':predict2,'inputs':inputs}
#             adv_loss, emb_val, eff_perturb = adv_teacher.forward(**adv_inputs)
#             loss = loss + 0.1* adv_loss
            optimizer.step()
            scheduler.step()
        if epoch>-1:
            early_stop+=eval(model, optimizer, val_loader, output_model=output_model)
        if early_stop==2:
            break

    test_oof.append(to_predict(model, test_loader,output_model, with_labels=False))
    torch.cuda.empty_cache()
    gc.collect()
    return test_oof   

In [8]:
n_fold=KFold(8,shuffle=True,random_state=1080)
test_oof=0
epochs = 12
test_oof=[]
ixdfold=0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_set = CustomDataset(test,tokenizer=tokenizer,maxlen=128,with_labels=False)
test_loader = Data.DataLoader(test_set, batch_size=batch_size, num_workers=8, shuffle=False,pin_memory=True)
for trn_idx,val_idx in n_fold.split(train):
    print('----kfold:%d --------'%ixdfold)
    ixdfold+=1
    train_df=train.iloc[trn_idx].reset_index(drop=True)
    val_df=train.iloc[val_idx].reset_index(drop=True)
    train_data=CustomDataset(train_df,tokenizer=tokenizer,maxlen=100,with_labels=True)
    val_data=CustomDataset(val_df,tokenizer=tokenizer,maxlen=100,with_labels=True)
    best_score = float('inf')
    train_model(train_data,val_data,test_oof)


----kfold:0 --------
Reading training data...
Reading validation data...


  0%|          | 0/547 [00:00<?, ?it/s]

-----Training-----
Epoch 0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370117127/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


0 1.4648200273513794


  9%|▉         | 50/547 [00:14<02:20,  3.55it/s]

50 0.3786814212799072


 18%|█▊        | 100/547 [00:31<04:01,  1.85it/s]

100 0.32473745942115784


 27%|██▋       | 150/547 [01:12<07:41,  1.16s/it]

150 0.3613997995853424


 37%|███▋      | 200/547 [02:01<05:41,  1.02it/s]

200 0.3402408957481384


 46%|████▌     | 250/547 [02:48<04:13,  1.17it/s]

250 0.2397443950176239


 55%|█████▍    | 300/547 [03:37<03:53,  1.06it/s]

300 0.17236624658107758


 64%|██████▍   | 350/547 [04:24<04:12,  1.28s/it]

350 0.2792940139770508


 73%|███████▎  | 400/547 [05:15<02:26,  1.00it/s]

400 0.21835115551948547


 82%|████████▏ | 450/547 [06:02<01:04,  1.49it/s]

450 0.2310740202665329


 91%|█████████▏| 500/547 [06:48<00:31,  1.50it/s]

500 0.20069976150989532


100%|██████████| 547/547 [07:30<00:00,  1.21it/s]


Validation mlogloss: 0.27589923572502556


  0%|          | 0/547 [00:00<?, ?it/s]

The best model has been saved
Epoch 1


  0%|          | 1/547 [00:00<07:53,  1.15it/s]

0 0.2549590468406677


  9%|▉         | 48/547 [00:37<06:32,  1.27it/s]


KeyboardInterrupt: 

In [None]:
#  原始，分开，FGM，word_emb，多任务,xlnet,多任务改回归 ，更多任务,多任务权重微调,nezha_new
# epoch0=0.1363  -> 0.1165->0.087-> 0.0855    ->0.095 ->0.070  ->0.135 ->0.183- >0.066 ->0.073
# epoch1=0.160 -> 0.0893          -> 0.072    ->0.0678-> 0.0473 ->0.0877->0.133     ->0.0476
# epoch2=0.110 -> 0.059                       ->0.0596                               ->0.038
#                                               -> 0.0523
#                                             ->0.041

In [10]:
sub=pd.DataFrame()
test=pd.read_csv('../tcdata/testA.csv',header=None)
sub['report_ID']=test[0]
sub['Prediction']=[ '|'+' '.join(['%.12f'%j for j in i]) for i in test_oof ]
sub.to_csv('../result.csv',index=False,header=False)

KeyError: 0