In [1]:
# 首先表示出一个文档，然后来衡量他们的相似度
from transformers import BertTokenizer, BertConfig, BertModel
from DataIter import DataIter
import torch
import numpy as np

class Config:
    def __init__(self):
        self.bert_path = "/home/featurize/Ernie"
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.max_length = 100
        self.input_size = self.tokenizer.vocab_size
        self.emb_size = 300
        self.hidden_size = 256
        self.dropout = 0.2
        self.num_classes = 2
        self.device = torch.device("cuda") \
            if torch.cuda.is_available() else torch.device("cpu")
        self.batch_size = 32
        self.num_epochs = 10
        self.learning_rate = 2e-5
        self.weight_decay = 2e-3
        self.adam_epsilon = 1e-8
        self.warmup_steps = 0
        self.filter_sizes = [2,3,4,5]
        self.num_filters = 128
        self.num_layers = 2
        self.max_question_len = 27
        self.max_answer_len = 65

In [None]:
# 设置种子

torch.manual_seed(2020)
np.random.seed(2020)
torch.manual_seed(2020)
torch.cuda.manual_seed_all(2020)
torch.backends.cudnn.deterministic = True  # cudnn 使用确定性算法，保证每次结果一样

import torch.nn as nn

In [None]:
# 孪生网络，没啥用
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.FC_mult = nn.Sequential(
            nn.Linear(self.bert_config.hidden_size, config.hidden_size),
            nn.BatchNorm1d(config.hidden_size, 1),
            nn.ReLU()
        )
        self.FC_minus = nn.Sequential(
            nn.Linear(self.bert_config.hidden_size, config.hidden_size),
            nn.BatchNorm1d(config.hidden_size, 1),
            nn.ReLU()
        )
        self.final = nn.Linear(config.hidden_size * 2, 1)

        
    def forward(self, text1, text2, mask1, mask2):
        text1 = self.bert(text1, attention_mask=mask1)[0]
        text2 = self.bert(text2, attention_mask=mask2)[0]
        u = text1[:,-1,:].clone()
        v = text2[:,-1,:].clone()
        minus = u-v
        mult = u*v
        interation_feature = torch.cat([self.FC_mult(mult),self.FC_minus(minus)], 1)
        out = self.final(interation_feature)
        return out



In [3]:
# Bert-TextCNN
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        # 然后是提取特征
        self.convs = nn.ModuleList(
            [nn.Conv1d(self.bert_config.hidden_size, config.num_filters,x) for x in config.filter_sizes]
        )
        self.dropout = nn.Dropout(config.dropout)
        # 最后都要经过池化层，使得输出为
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)
        self.relu = nn.ReLU()
        
    def pool(self, out, conv):
        out = self.relu(conv(out))
        max_pool = nn.MaxPool1d(out.shape[-1])
        out = max_pool(out)
        out = out.squeeze(2)
        return out
        
    def forward(self,input_ids, attention_masks, token_type_ids):
        embedded = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)[0]
        # embedding = [batch_size, seq_len, emb_dim]
        embedded = embedded.permute(0,2,1)
        # embedded = [batch_size, seq_len, emb_dim]
        output = [self.pool(embedded, conv) for conv in self.convs]
        # output = num_filter_sizes * [batch_size, num_filters]
        out = torch.cat(output, dim=1)
        # out = [batch_size, num_filter_sizes * num_filters]
        out = self.dropout(out)
        out = self.fc(out)
        return out



In [3]:
# Bert
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.fc = nn.Linear(self.bert_config.hidden_size * 2, 2)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self,input_ids, attention_masks, token_type_ids):
#         _, pooled = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)
        last_hidden_state,pooler_output,hidden_states=self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids, output_hidden_states=True)
        output = torch.cat((pooler_output,last_hidden_state[:, 0, :]),1)
#         out = self.dropout(pooled)
        out = self.fc(output)
        return out



In [3]:
class Model(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.cls_token_head = nn.Sequential(
            nn.Dropout(config.dropout),
            nn.Linear(self.bert_config.hidden_size * 4, self.bert_config.hidden_size),
            nn.ReLU(inplace=True),
        )
        self.qa_sep_token_head = nn.Sequential(
            nn.Dropout(config.dropout),
            nn.Linear(self.bert_config.hidden_size * 4, self.bert_config.hidden_size),
            nn.ReLU(inplace=True),
        )
        self.linear = nn.Sequential(
            nn.Dropout(config.dropout),
            nn.Linear(self.bert_config.hidden_size * 2, 2),
        )
        
    def forward(self, input_ids, attention_masks, token_type_ids):
        question_answer_seps = (torch.sum((token_type_ids == 0) * attention_masks, -1) - 1)
        _, _, hidden_states = self.bert(input_ids,
                                        attention_mask=attention_masks,
                                        token_type_ids=token_type_ids,
                                        output_hidden_states=True)
        hidden_states_cls_embeddings = [x[:, 0] for x in hidden_states[-4:]]
        x = torch.cat(hidden_states_cls_embeddings, dim=-1)
        x_cls = self.cls_token_head(x)
        
        # Gather [SEP] hidden states
        tmp = torch.arange(0, len(input_ids), dtype=torch.long)
        hidden_states_qa_sep_embeddings = [x[tmp, question_answer_seps] for x in hidden_states[-4:]]
        x = torch.cat(hidden_states_qa_sep_embeddings, dim=-1)
        
        x_qa_sep = self.qa_sep_token_head(x)
        x = torch.cat([x_cls, x_qa_sep], -1)
        x = self.linear(x)
        return x

In [3]:
# SA-Bert
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.output_weights = nn.Parameter(self.truncated_normal_(torch.randn(self.bert_config.hidden_size, config.num_classes).requires_grad_()))
        self.output_bias = nn.Parameter(torch.zeros(config.num_classes).requires_grad_())
        self.dropout = nn.Dropout(config.dropout)
        self.target_loss_weight = nn.Parameter(torch.tensor([1., 1.]).requires_grad_())
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='none')
        
        
    # 截断正态分布
    def truncated_normal_(self, tensor,mean=0,std=0.02):
        with torch.no_grad():
            size = tensor.shape
            tmp = tensor.new_empty(size+(4,)).normal_()
            valid = (tmp < 2) & (tmp > -2)
            ind = valid.max(-1, keepdim=True)[1]
            tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
            tensor.data.mul_(std).add_(mean)
            return tensor
        
    def forward(self, input_ids, attention_masks, token_type_ids, labels):
        
        flagx = (labels>0).float()
        flagy = (labels==0).float()
        all_target_loss = flagx * self.target_loss_weight[1] + flagy * self.target_loss_weight[0]
        # pooled = [batch_size, hidden_size]
        _, pooled = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)
        pooled = self.dropout(pooled)
        # out = [batch_size, num_classes]
        out = torch.matmul(pooled, self.output_weights)
        out = out + self.output_bias
        out = torch.sigmoid(out)
        # out = [batch_size]
        logits = out.squeeze(-1)
        loss = self.bce_loss(logits, labels)
#         losses = all_target_loss * loss
        mean_loss = torch.mean(loss)
        return mean_loss, logits
    
    def predict(self, input_ids, attention_masks, token_type_ids, labels):
        # pooled = [batch_size, hidden_size]
        _, pooled = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)
        pooled = self.dropout(pooled)
        # out = [batch_size, num_classes]
        out = torch.matmul(pooled, self.output_weights)
        out = out + self.output_bias
        out = torch.sigmoid(out)
        logits = out.squeeze(-1)
        prob = (logits > 0.5).float()
        return prob
        

In [3]:
# BertRCNN
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.rnn = nn.LSTM(self.bert_config.hidden_size, config.hidden_size, batch_first=True, bidirectional=True, num_layers=config.num_layers)
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.hidden_size*2, config.num_classes)
        self.w = nn.Parameter(torch.randn(2*config.hidden_size + self.bert_config.hidden_size, 2 * config.hidden_size))
        
    def forward(self,input_ids, attention_masks, token_type_ids):
        embedded = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)[0]
        # embedded = [batch_size, seq_len, 768]
        out,_ = self.rnn(embedded)
        # 将输出和嵌入层连接起来
        # out = [batch_size, seq_len, 128*2]
        out = torch.cat((out, embedded), dim=2)
        # out = [batch_size, seq_len, hidden_size * 2 + emb_dim
        out = torch.tanh(torch.matmul(out, self.w))
        # out = [batch_size, seq_len, hidden_size * 2
        out = out.permute(0,2,1)
        # out = [batch_size, hidden_size * 2, seq_len]
        out = nn.functional.max_pool1d(out, out.shape[-1]).squeeze(2)
        out = self.fc(out)
        return out


In [3]:
# BertRNN_Att
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.rnn = nn.LSTM(self.bert_config.hidden_size, 256, bidirectional=True, num_layers = config.num_layers
                           ,batch_first=True)
        self.dropout = nn.Dropout(config.dropout)
        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        self.w = nn.Parameter(torch.randn(2 * 256))
        self.fc1 = nn.Linear(256*2, 512)
        self.fc2 = nn.Linear(512, config.num_classes)
        
    def forward(self,input_ids, attention_masks, token_type_ids):
        embedded = self.bert(input_ids, attention_mask = attention_masks, token_type_ids = token_type_ids)[0]
        out,_ = self.rnn(embedded)
        # out = [batch_size, seq_len, hidden_size * num_directions]
        M = self.tanh1(out)
        score = torch.matmul(out, self.w)
        att = torch.softmax(score, dim=1).unsqueeze(-1)
        # att = [batch_size, seq_len, 1]
        out = out * att
        # out = [batch_size, seq_len, hidden_size * 2]
        out = torch.sum(out, 1)
        # out = [batch_size, hidden_size * 2]
        out = torch.relu(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out



In [None]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_use_layer = 4
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.dropout = nn.Dropout(config.dropout)
        self.dense1 = nn.Linear(self.bert_config.hidden_size*self.n_use_layer, self.bert_config.hidden_size*self.n_use_layer)
        self.dense2 = nn.Linear(self.bert_config.hidden_size*self.n_use_layer, self.bert_config.hidden_size*self.n_use_layer)    
        self.classifier = nn.Linear(self.bert_config.hidden_size*self.n_use_layer, config.num_classes)

    def forward(self, input_ids, attention_masks, token_type_ids):

        outputs = self.bert(input_ids,
                            attention_mask = attention_masks,
                            token_type_ids = token_type_ids,
                            output_hidden_states=True)
        pooled_output = torch.cat([outputs[2][-1*i][:,0] for i in range(1, self.n_use_layer+1)], dim=1)
        pooled_output = self.dense1(pooled_output)
        pooled_output = self.dense2(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [None]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config = self.bert_config)
        self.dropout = nn.Dropout(0.2)
        self.high_dropout = nn.Dropout(0.5)

        n_weights = self.bert_config.num_hidden_layers + 1
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)
        self.classifier = nn.Linear(self.bert_config.hidden_size, config.num_classes)


    def forward(self,input_ids, attention_masks, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_masks,
            token_type_ids=token_type_ids,
            output_hidden_states=True
        )
        hidden_layers = outputs[2]
        cls_outputs = torch.stack(
            [self.dropout(layer[:, 0, :]) for layer in hidden_layers], dim=2
        )
        cls_output = (torch.softmax(self.layer_weights, dim=0) * cls_outputs).sum(-1)
        # multisample dropout (wut): https://arxiv.org/abs/1905.09788
        logits = torch.mean(
            torch.stack(
                [self.classifier(self.high_dropout(cls_output)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )
        return logits


In [None]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,accuracy_score, recall_score
import numpy as np
import os
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup, AdamW

def predict(config, model, test_iter):
    all_pred = []
    for i, batch in enumerate(test_iter):
        print("\r 正在预测输出%d/%d"%(i, len(test_iter)), end="")
        model.eval()
        with torch.no_grad():
            input_ids, attention_masks, token_type_ids = batch
            out = model(input_ids, attention_masks, token_type_ids)
            y_pred = torch.softmax(out, dim=-1).detach().cpu().numpy()

            all_pred.append(y_pred)
    print("\r预测完成.\n")
    return np.concatenate(all_pred)

def evaluate(config, model, data_iter):
    all_pred = np.array([])
    all_true = np.array([])
    total_loss = 0.
    criterion = nn.CrossEntropyLoss()
    for i, batch in enumerate(data_iter):
        model.eval()
        with torch.no_grad():
            input_ids, attention_masks, token_type_ids, label = batch
            out = model(input_ids, attention_masks, token_type_ids)
            loss = criterion(out, label)
            total_loss += loss.item()
            y_pred = torch.argmax(out, dim=-1).float().detach().cpu().numpy()
            y_true = label.detach().cpu().numpy()
            all_pred = np.append(all_pred, y_pred, axis=0)
            all_true = np.append(all_true, y_true, axis=0)
    score = f1_score(all_true, all_pred)
    accuracy = accuracy_score(all_true, all_pred)
    recall = recall_score(all_true, all_pred)
    return total_loss / len(data_iter), score, accuracy, recall

def train(config, model, train_iter, test_iter, data_iter):

    save_path = "/home/featurize/output/Ernie/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    gkf = StratifiedKFold(n_splits=5, shuffle=True).split(data_iter.train_df.q1, data_iter.train_df.label)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        train_inputs = [train_iter[i][train_idx] for i in range(len(train_iter))]
        valid_inputs = [train_iter[i][valid_idx] for i in range(len(train_iter))]
        train, dev = data_iter.build_iterator(train_inputs, valid_inputs)
        model = Model(config)
        model.to(config.device)
        fgm = FGM(model)
        best_score = 0.
        best_loss = float('inf')
        stop_steps = 0
        early_stop = 2000
        flag = False

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = len(train) * config.num_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level="O0")
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)
        criterion = nn.CrossEntropyLoss()
        print("*************************************Fold:[{}/{}]**********************************".format(fold+1, 5))
        for epoch in range(config.num_epochs):
            epoch_loss = 0.
            print("EPOCH:[{}/{}]".format(epoch+1, config.num_epochs))
            for i, batch in enumerate(tqdm(train)):
                model.train()

                input_ids, attention_masks, token_type_ids, label = batch
                out = model(input_ids, attention_masks, token_type_ids)
                loss = criterion(out, label)
                # loss.backward()
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                fgm.attack() # 在embedding上添加对抗扰动
                out_adv = model(input_ids, attention_masks, token_type_ids)
                loss_adv = criterion(out_adv, label)
                with amp.scale_loss(loss_adv, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # loss_adv.backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
                fgm.restore() # 恢复embedding参数
                epoch_loss += loss.item()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
                optimizer.step()
                model.zero_grad()
                
                msg = "ITER:{}, TRAIN_LOSS:{:.3f}, TRAIN_ACC:{:.2%}, TRAIN_RECALL:{:.2%}, TRAIN_F1:{:.2%}\n DEV_LOSS:{:.3f},\
                        DEV_ACC:{:.2%}, DEV_RECALL:{:.2%}, DEV_F1:{:.2%},NO_IMPROVEMENT:{}"
                if stop_steps > early_stop:
                    print("超过{}步没有提升，停止迭代".format(stop_steps))
                    flag = True
                    break
                if i % 100 == 0:
                    y_pred = torch.argmax(out, dim=-1).float().detach().cpu().numpy()
                    y_true = label.detach().cpu().numpy()
                    score = f1_score(y_true, y_pred)
                    accuracy = accuracy_score(y_true, y_pred)
                    recall = recall_score(y_true, y_pred)
                    dev_loss, dev_score, dev_acc, dev_recall = evaluate(config, model, dev)
                    print(msg.format(i, loss.item(), accuracy, recall, score, dev_loss, dev_acc, dev_recall, dev_score, stop_steps))
                        
                    if dev_score > best_score:
                        best_score = dev_score
                        stop_steps = 0
                        torch.save({'state_dict': model.state_dict()}, '/home/featurize/output/Ernie/checkpoint_cnn{}.pth.tar'.format(fold))
                stop_steps += 1
            if flag:
                break
        print("*********************************************************************************")
        data_iter._gc()

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,accuracy_score, recall_score
import numpy as np
import os
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup, AdamW

def predict(config, model, test_iter):
    all_pred = []
    for i, batch in enumerate(test_iter):
        print("\r 正在预测输出%d/%d"%(i, len(test_iter)), end="")
        model.eval()
        with torch.no_grad():
            input_ids, attention_masks, token_type_ids = batch
            out = model(input_ids, attention_masks, token_type_ids)
            y_pred = torch.softmax(out, dim=-1).detach().cpu().numpy()

            all_pred.append(y_pred)
    print("\r预测完成.\n")
    return np.concatenate(all_pred)

def evaluate(config, model, data_iter):
    all_pred = np.array([])
    all_true = np.array([])
    total_loss = 0.
    criterion = nn.CrossEntropyLoss()
    for i, batch in enumerate(data_iter):
        model.eval()
        with torch.no_grad():
            input_ids, attention_masks, token_type_ids, label = batch
            out = model(input_ids, attention_masks, token_type_ids)
            loss = criterion(out, label)
            total_loss += loss.item()
            y_pred = torch.argmax(out, dim=-1).float().detach().cpu().numpy()
            y_true = label.detach().cpu().numpy()
            all_pred = np.append(all_pred, y_pred, axis=0)
            all_true = np.append(all_true, y_true, axis=0)
    score = f1_score(all_true, all_pred)
    accuracy = accuracy_score(all_true, all_pred)
    recall = recall_score(all_true, all_pred)
    return total_loss / len(data_iter), score, accuracy, recall

def train(config, model, train_iter, test_iter, data_iter):

    save_path = "/home/featurize/output/Ernie/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    gkf = StratifiedKFold(n_splits=5, shuffle=True).split(data_iter.train_df.q1, data_iter.train_df.label)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        train_inputs = [train_iter[i][train_idx] for i in range(len(train_iter))]
        valid_inputs = [train_iter[i][valid_idx] for i in range(len(train_iter))]
        train, dev = data_iter.build_iterator(train_inputs, valid_inputs)
        model = Model(config)
        model.to(config.device)
        fgm = FGM(model)
        best_score = 0.
        best_loss = float('inf')
        stop_steps = 0
        early_stop = 2000
        flag = False

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = len(train) * config.num_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)
        criterion = nn.CrossEntropyLoss()
        print("*************************************Fold:[{}/{}]**********************************".format(fold+1, 5))
        for epoch in range(config.num_epochs):
            epoch_loss = 0.
            print("EPOCH:[{}/{}]".format(epoch+1, config.num_epochs))
            for i, batch in enumerate(tqdm(train)):
                model.train()

                input_ids, attention_masks, token_type_ids, label = batch
                out = model(input_ids, attention_masks, token_type_ids)
                loss = criterion(out, label)
                loss.backward()
                fgm.attack() # 在embedding上添加对抗扰动
                out_adv = model(input_ids, attention_masks, token_type_ids)
                loss_adv = criterion(out_adv, label)
                loss_adv.backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
                fgm.restore() # 恢复embedding参数
                epoch_loss += loss.item()
                optimizer.step()
                model.zero_grad()
                
                msg = "ITER:{}, TRAIN_LOSS:{:.3f}, TRAIN_ACC:{:.2%}, TRAIN_RECALL:{:.2%}, TRAIN_F1:{:.2%}\n DEV_LOSS:{:.3f},\
                        DEV_ACC:{:.2%}, DEV_RECALL:{:.2%}, DEV_F1:{:.2%},NO_IMPROVEMENT:{}"
                if stop_steps > early_stop:
                    print("超过{}步没有提升，停止迭代".format(stop_steps))
                    flag = True
                    break
                if i % 100 == 0:
                    y_pred = torch.argmax(out, dim=-1).float().detach().cpu().numpy()
                    y_true = label.detach().cpu().numpy()
                    score = f1_score(y_true, y_pred)
                    accuracy = accuracy_score(y_true, y_pred)
                    recall = recall_score(y_true, y_pred)
                    dev_loss, dev_score, dev_acc, dev_recall = evaluate(config, model, dev)
                    print(msg.format(i, loss.item(), accuracy, recall, score, dev_loss, dev_acc, dev_recall, dev_score, stop_steps))
                        
                    if dev_score > best_score:
                        best_score = dev_score
                        stop_steps = 0
                        torch.save({'state_dict': model.state_dict()}, '/home/featurize/output/Ernie/checkpoint_cnn{}.pth.tar'.format(fold))
                stop_steps += 1
            if flag:
                break
        print("*********************************************************************************")
        data_iter._gc()

In [7]:
config = Config()
model = Model(config)
model.to(config.device)
data_iter = DataIter(config)

In [8]:
test_iter = data_iter.build_test(test = True, is_match=True)
train_iter = data_iter.build_examples(data_iter.train_df, is_match=True)

53757it [00:53, 1013.35it/s]
21585it [00:21, 1018.31it/s]


In [None]:
train(config, model, train_iter, test_iter, data_iter)

  0%|          | 0/540 [00:00<?, ?it/s]

*************************************Fold:[1/5]**********************************
EPOCH:[1/10]
ITER:0, TRAIN_LOSS:0.815, TRAIN_ACC:40.62%, TRAIN_RECALL:87.50%, TRAIN_F1:42.42%
 DEV_LOSS:0.719,                        DEV_ACC:43.96%, DEV_RECALL:49.40%, DEV_F1:30.74%,NO_IMPROVEMENT:0


 19%|█▊        | 100/540 [01:17<04:36,  1.59it/s]

ITER:100, TRAIN_LOSS:0.248, TRAIN_ACC:90.62%, TRAIN_RECALL:57.14%, TRAIN_F1:72.73%
 DEV_LOSS:0.324,                        DEV_ACC:86.52%, DEV_RECALL:59.98%, DEV_F1:69.14%,NO_IMPROVEMENT:100


 37%|███▋      | 200/540 [02:36<03:35,  1.58it/s]

ITER:200, TRAIN_LOSS:0.330, TRAIN_ACC:87.50%, TRAIN_RECALL:60.00%, TRAIN_F1:60.00%
 DEV_LOSS:0.281,                        DEV_ACC:88.07%, DEV_RECALL:75.99%, DEV_F1:76.23%,NO_IMPROVEMENT:100


 56%|█████▌    | 300/540 [03:55<02:32,  1.57it/s]

ITER:300, TRAIN_LOSS:0.323, TRAIN_ACC:84.38%, TRAIN_RECALL:87.50%, TRAIN_F1:73.68%
 DEV_LOSS:0.261,                        DEV_ACC:88.79%, DEV_RECALL:78.29%, DEV_F1:77.86%,NO_IMPROVEMENT:100


 74%|███████▍  | 400/540 [05:14<01:28,  1.57it/s]

ITER:400, TRAIN_LOSS:0.387, TRAIN_ACC:90.62%, TRAIN_RECALL:77.78%, TRAIN_F1:82.35%
 DEV_LOSS:0.259,                        DEV_ACC:88.65%, DEV_RECALL:82.24%, DEV_F1:78.49%,NO_IMPROVEMENT:100


 93%|█████████▎| 500/540 [06:33<00:25,  1.58it/s]

ITER:500, TRAIN_LOSS:0.232, TRAIN_ACC:90.62%, TRAIN_RECALL:75.00%, TRAIN_F1:80.00%
 DEV_LOSS:0.245,                        DEV_ACC:89.32%, DEV_RECALL:80.77%, DEV_F1:79.21%,NO_IMPROVEMENT:100


100%|██████████| 540/540 [07:13<00:00,  1.25it/s]
  0%|          | 0/540 [00:00<?, ?it/s]

EPOCH:[2/10]
ITER:0, TRAIN_LOSS:0.117, TRAIN_ACC:100.00%, TRAIN_RECALL:100.00%, TRAIN_F1:100.00%
 DEV_LOSS:0.242,                        DEV_ACC:89.62%, DEV_RECALL:78.75%, DEV_F1:79.26%,NO_IMPROVEMENT:40


 19%|█▊        | 100/540 [01:18<04:39,  1.57it/s]

ITER:100, TRAIN_LOSS:0.126, TRAIN_ACC:96.88%, TRAIN_RECALL:100.00%, TRAIN_F1:96.30%
 DEV_LOSS:0.253,                        DEV_ACC:89.74%, DEV_RECALL:83.81%, DEV_F1:80.44%,NO_IMPROVEMENT:100


 37%|███▋      | 200/540 [02:37<03:36,  1.57it/s]

ITER:200, TRAIN_LOSS:0.270, TRAIN_ACC:90.62%, TRAIN_RECALL:66.67%, TRAIN_F1:57.14%
 DEV_LOSS:0.247,                        DEV_ACC:89.79%, DEV_RECALL:84.36%, DEV_F1:80.62%,NO_IMPROVEMENT:100


 56%|█████▌    | 301/540 [04:09<17:59,  4.52s/it]

ITER:300, TRAIN_LOSS:0.211, TRAIN_ACC:90.62%, TRAIN_RECALL:75.00%, TRAIN_F1:66.67%
 DEV_LOSS:0.243,                        DEV_ACC:89.62%, DEV_RECALL:83.81%, DEV_F1:80.26%,NO_IMPROVEMENT:100


 74%|███████▍  | 401/540 [05:23<10:23,  4.49s/it]

ITER:400, TRAIN_LOSS:0.258, TRAIN_ACC:93.75%, TRAIN_RECALL:60.00%, TRAIN_F1:75.00%
 DEV_LOSS:0.240,                        DEV_ACC:90.00%, DEV_RECALL:81.97%, DEV_F1:80.49%,NO_IMPROVEMENT:200


 93%|█████████▎| 500/540 [06:23<00:24,  1.65it/s]

ITER:500, TRAIN_LOSS:0.169, TRAIN_ACC:90.62%, TRAIN_RECALL:90.91%, TRAIN_F1:86.96%
 DEV_LOSS:0.235,                        DEV_ACC:90.16%, DEV_RECALL:83.35%, DEV_F1:81.00%,NO_IMPROVEMENT:300


100%|██████████| 540/540 [07:04<00:00,  1.27it/s]
  0%|          | 0/540 [00:00<?, ?it/s]

EPOCH:[3/10]


  0%|          | 1/540 [00:13<2:02:01, 13.58s/it]

ITER:0, TRAIN_LOSS:0.091, TRAIN_ACC:96.88%, TRAIN_RECALL:100.00%, TRAIN_F1:80.00%
 DEV_LOSS:0.231,                        DEV_ACC:89.93%, DEV_RECALL:81.42%, DEV_F1:80.27%,NO_IMPROVEMENT:40


 19%|█▊        | 100/540 [01:13<04:27,  1.64it/s]

ITER:100, TRAIN_LOSS:0.086, TRAIN_ACC:100.00%, TRAIN_RECALL:100.00%, TRAIN_F1:100.00%
 DEV_LOSS:0.257,                        DEV_ACC:90.11%, DEV_RECALL:84.64%, DEV_F1:81.16%,NO_IMPROVEMENT:140


 37%|███▋      | 200/540 [02:32<03:35,  1.57it/s]

ITER:200, TRAIN_LOSS:0.109, TRAIN_ACC:93.75%, TRAIN_RECALL:81.82%, TRAIN_F1:90.00%
 DEV_LOSS:0.263,                        DEV_ACC:90.25%, DEV_RECALL:83.99%, DEV_F1:81.26%,NO_IMPROVEMENT:100


 56%|█████▌    | 301/540 [04:04<17:58,  4.51s/it]

ITER:300, TRAIN_LOSS:0.103, TRAIN_ACC:93.75%, TRAIN_RECALL:83.33%, TRAIN_F1:83.33%
 DEV_LOSS:0.258,                        DEV_ACC:89.93%, DEV_RECALL:79.76%, DEV_F1:79.94%,NO_IMPROVEMENT:100


 71%|███████   | 384/540 [04:54<01:34,  1.64it/s]

In [22]:
predictions = []
for i in range(5):
    checkpoint = torch.load('/home/featurize/output/Ernie/checkpoint_cnn{}.pth.tar'.format(i))
    model.load_state_dict(checkpoint['state_dict'])
    all_pred = predict(config, model, test_iter)
    predictions.append(all_pred)

预测完成.输出1679/1680

预测完成.输出1679/1680

预测完成.输出1679/1680

预测完成.输出1679/1680

预测完成.输出1679/1680



In [23]:
pred = predictions[0]
for i in range(1,5):
    pred += predictions[i]
pred = (pred/5)
pred = np.argmax(pred, axis=-1)

In [25]:
import pandas as pd

submit = pd.read_csv("sample_submission.tsv", header=None, sep="\t")
submit.columns = ["cid", "rid", "label"]
submit["label"] = pred
submit.to_csv("submit_cleaned_new.tsv", header = None, sep="\t", index=None)