In [4]:
from transformers import BertTokenizer, BertConfig, BertModel
from DataIter import DataIter
import torch
import numpy as np
import torch.nn as nn

# This model is trained on the `content` text field.
class Config:
    """Hyper-parameters and shared resources for the BERT-TextCNN classifier.

    Creating an instance loads the tokenizer from ``bert_path`` and selects
    the CUDA device when one is available, otherwise the CPU.
    """

    def __init__(self):
        # Pretrained checkpoint location and the tokenizer loaded from it.
        self.bert_path = "Ernie"
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)

        # Device used for the model (GPU if present, CPU otherwise).
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

        # Input and classification-head settings.
        self.max_length = 100  # presumably the maximum token length per example -- confirm in DataIter
        self.num_classes = 20
        self.dropout = 0.2
        self.filter_sizes = [2, 3, 4, 5]  # kernel widths of the Conv1d layers
        self.num_filters = 128  # output channels of each Conv1d layer

        # Optimisation settings.
        self.batch_size = 32
        self.num_epochs = 10
        self.learning_rate = 5e-5
        self.weight_decay = 0.02
        self.adam_epsilon = 1e-8
        self.warmup_rate = 0.1  # fraction of the total training steps used for warmup
        
# Seed every random number generator once so repeated runs are reproducible.
SEED = 2020

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms, and keep its auto-tuner off because
# benchmark mode may select different kernels from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [5]:
# Bert-TextCNN
class Model(nn.Module):
    """BERT encoder followed by a TextCNN classification head.

    The first output of the BERT model is fed to several 1-D convolutions
    (one per entry of ``config.filter_sizes``). Each convolution output is
    passed through ReLU and max-pooled over the whole sequence; the pooled
    features are concatenated, passed through dropout and projected to
    ``config.num_classes`` logits.
    """

    def __init__(self, config):
        """Build the encoder and the convolutional head from ``config``.

        Reads ``bert_path``, ``num_filters``, ``filter_sizes``, ``dropout``
        and ``num_classes`` from ``config``.
        """
        super().__init__()
        self.bert_config = BertConfig.from_pretrained(config.bert_path)
        self.bert = BertModel.from_pretrained(config.bert_path, config=self.bert_config)
        # Feature extraction: one Conv1d per kernel width, with the BERT
        # hidden size as the number of input channels.
        self.convs = nn.ModuleList(
            [nn.Conv1d(self.bert_config.hidden_size, config.num_filters, kernel_size)
             for kernel_size in config.filter_sizes]
        )
        self.dropout = nn.Dropout(config.dropout)
        # After pooling, every convolution contributes num_filters features,
        # so the classifier input is num_filters * len(filter_sizes) wide.
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)
        self.relu = nn.ReLU()

    def pool(self, out, conv):
        """Apply ``conv`` + ReLU to ``out`` and max-pool over the last axis.

        Args:
            out: tensor of shape [batch_size, channels, length].
            conv: the convolution module to apply.

        Returns:
            Tensor of shape [batch_size, conv_out_channels].
        """
        out = self.relu(conv(out))
        # Pool over the full remaining length with the functional API, which
        # avoids constructing a new MaxPool1d module on every call.
        out = nn.functional.max_pool1d(out, kernel_size=out.shape[-1])
        return out.squeeze(2)

    def forward(self, input_ids, attention_masks):
        """Return classification logits of shape [batch_size, num_classes]."""
        embedded = self.bert(input_ids, attention_mask=attention_masks)[0]
        # embedded = [batch_size, seq_len, emb_dim]
        embedded = embedded.permute(0, 2, 1)
        # embedded = [batch_size, emb_dim, seq_len] (channels first for Conv1d)
        output = [self.pool(embedded, conv) for conv in self.convs]
        # output = num_filter_sizes * [batch_size, num_filters]
        out = torch.cat(output, dim=1)
        # out = [batch_size, num_filter_sizes * num_filters]
        out = self.dropout(out)
        out = self.fc(out)
        return out


In [6]:
# Build the configuration and a model placed on the configured device.
config = Config()
model = Model(config).to(config.device)

# Prepare the data: the test set is built first, then the training examples.
data_iter = DataIter(config)
test_iter = data_iter.build_test()
train_iter = data_iter.build_examples(data_iter.train_df)

11712it [00:14, 803.16it/s]
49306it [01:02, 791.30it/s]


In [None]:
class FGM():
    """Fast Gradient Method: adversarial perturbation of embedding weights.

    ``attack`` shifts every trainable parameter whose name contains
    ``emb_name`` by ``epsilon * grad / ||grad||`` after saving a copy of its
    values; ``restore`` puts the saved values back.
    """

    def __init__(self, model):
        self.model = model
        # Maps parameter name -> copy of its values taken before the attack.
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        """Perturb the embedding parameters along their current gradient.

        Args:
            epsilon: scale of the perturbation.
            emb_name: substring identifying the embedding parameters; change
                it to match the embedding parameter name of your model.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            # Always back up, so restore() finds every matching parameter.
            self.backup[name] = param.data.clone()
            if param.grad is None:
                # No backward pass has populated a gradient: nothing to add.
                continue
            norm = torch.norm(param.grad)
            # Skip zero norms (division by zero) and NaN/inf norms, which
            # would otherwise write non-finite values into the weights.
            if norm != 0 and torch.isfinite(norm):
                r_at = epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        """Put back the values saved by the preceding ``attack`` call.

        Args:
            emb_name: the same substring that was passed to ``attack``.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,accuracy_score, recall_score
import numpy as np
import os
from tqdm.notebook import tqdm
from transformers import get_linear_schedule_with_warmup, AdamW

def predict(config, model, test_iter):
    """Return softmax class probabilities for every batch of ``test_iter``.

    Args:
        config: unused; kept so the signature matches the other helpers.
        model: callable as ``model(input_ids, attention_masks)`` returning logits.
        test_iter: sized iterable of ``(input_ids, attention_masks)`` batches.

    Returns:
        A numpy array with the per-batch probabilities concatenated along
        the first axis.

    Raises:
        ValueError: from ``np.concatenate`` when ``test_iter`` yields no batch.
    """
    # Evaluation mode and no-grad are loop invariant, so set them up once.
    model.eval()
    all_pred = []
    total = len(test_iter)
    with torch.no_grad():
        for i, batch in enumerate(test_iter):
            print("\r 正在预测输出%d/%d"%(i, total), end="")
            input_ids, attention_masks = batch
            out = model(input_ids, attention_masks)
            y_pred = torch.softmax(out, dim=-1).detach().cpu().numpy()
            all_pred.append(y_pred)
    print("\r预测完成.\n")
    return np.concatenate(all_pred)

def evaluate(config, model, data_iter):
    """Compute the mean cross-entropy loss and the accuracy over ``data_iter``.

    Args:
        config: unused; kept so the signature matches the other helpers.
        model: callable as ``model(input_ids, attention_masks)`` returning logits.
        data_iter: iterable of ``(input_ids, attention_masks, label)`` batches.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
        per-batch losses and ``accuracy`` is computed over all examples.

    Raises:
        ValueError: if ``data_iter`` yields no batch.
    """
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.
    # Collect per-batch arrays and concatenate once at the end; appending to
    # a growing numpy array would copy all previous results on every batch.
    batch_preds = []
    batch_labels = []
    model.eval()
    with torch.no_grad():
        for batch in data_iter:
            input_ids, attention_masks, label = batch
            out = model(input_ids, attention_masks)
            total_loss += criterion(out, label).item()
            batch_preds.append(torch.argmax(out, dim=-1).detach().cpu().numpy())
            batch_labels.append(label.detach().cpu().numpy())
    if not batch_preds:
        raise ValueError("evaluate() received an empty data iterator")
    all_pred = np.concatenate(batch_preds)
    all_true = np.concatenate(batch_labels)
    accuracy = accuracy_score(all_true, all_pred)
    return total_loss / len(batch_preds), accuracy

def train(config, Model, train_iter, test_iter, data_iter):
    """Train one model per stratified fold and checkpoint the best of each.

    For every fold a fresh ``Model(config)`` is trained with AdamW and a
    linear warmup schedule. Every ``eval_every`` iterations of an epoch
    (including iteration 0) the dev split is evaluated; whenever the dev loss
    improves, the weights are saved to ``output/checkpoint<fold>.pth.tar``.
    Training of a fold stops early once more than ``early_stop`` iterations
    have passed since the last improvement.

    Args:
        config: object providing ``device``, ``num_epochs``, ``learning_rate``,
            ``weight_decay``, ``adam_epsilon`` and ``warmup_rate``.
        Model: model class (or factory) called as ``Model(config)``.
        train_iter: indexable collection of input arrays; each one is sliced
            with the fold indices.
        test_iter: unused; kept for interface compatibility.
        data_iter: object providing ``train_df`` (with ``content`` and
            ``label`` columns), ``build_iterator`` and ``_gc``.
    """
    save_path = "output"
    n_splits = 5
    eval_every = 500   # iterations between dev evaluations within an epoch
    early_stop = 3000  # iterations without improvement tolerated before stopping
    no_decay = ['bias', 'LayerNorm.weight']
    msg = "ITER:{}, TRAIN_LOSS:{:.3f}, TRAIN_ACC:{:.2%},DEV_LOSS:{:.3f},DEV_ACC:{:.2%},NO_IMPROVEMENT:{}"

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_path, exist_ok=True)

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True).split(
        data_iter.train_df.content, data_iter.train_df.label)
    for fold, (train_idx, valid_idx) in enumerate(folds):
        train_inputs = [train_iter[i][train_idx] for i in range(len(train_iter))]
        valid_inputs = [train_iter[i][valid_idx] for i in range(len(train_iter))]
        # Named *_batches so the local does not shadow this function's name.
        train_batches, dev_batches = data_iter.build_iterator(train_inputs, valid_inputs)
        model = Model(config)
        model.to(config.device)
        # fgm = FGM(model)
        # Checkpoints live under save_path instead of a second hard-coded folder.
        checkpoint_path = os.path.join(save_path, 'checkpoint{}.pth.tar'.format(fold))
        best_loss = float('inf')
        stop_steps = 0
        stopped_early = False

        # Biases and LayerNorm weights are excluded from weight decay.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        t_total = len(train_batches) * config.num_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
        # The scheduler expects an integer number of warmup steps.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(t_total * config.warmup_rate),
            num_training_steps=t_total)
        criterion = nn.CrossEntropyLoss()
        tqdm.write("*************************************Fold:[{}/{}]**********************************".format(fold+1, n_splits))
        for epoch in range(config.num_epochs):
            tqdm.write("EPOCH:[{}/{}]".format(epoch+1, config.num_epochs))
            for i, batch in enumerate(tqdm(train_batches)):
                model.train()

                input_ids, attention_masks, label = batch
                out = model(input_ids, attention_masks)
                loss = criterion(out, label)
                loss.backward()
                # Optional FGM adversarial training (disabled):
                # fgm.attack() # add an adversarial perturbation to the embeddings
                # out_adv = model(input_ids, attention_masks)
                # loss_adv = criterion(out_adv, label)
                # loss_adv.backward() # accumulate the adversarial gradient on top of the normal one
                # fgm.restore() # restore the embedding parameters
                optimizer.step()
                scheduler.step()
                model.zero_grad()

                if stop_steps > early_stop:
                    print("more than {} steps not improved yet, early stopping".format(stop_steps))
                    stopped_early = True
                    break
                if i % eval_every == 0:
                    # Training accuracy is measured on the current batch only.
                    y_pred = torch.argmax(out, dim=-1).float().detach().cpu().numpy()
                    y_true = label.detach().cpu().numpy()
                    accuracy = accuracy_score(y_true, y_pred)
                    dev_loss, dev_acc = evaluate(config, model, dev_batches)
                    tqdm.write(msg.format(i, loss.item(), accuracy, dev_loss, dev_acc, stop_steps))

                    if dev_loss < best_loss:
                        best_loss = dev_loss
                        stop_steps = 0
                        torch.save({'state_dict': model.state_dict()}, checkpoint_path)
                stop_steps += 1
            if stopped_early:
                break
        print("*********************************************************************************")
        data_iter._gc()

In [None]:
# Run the 5-fold training; the best weights of each fold are saved as
# output/checkpoint<fold>.pth.tar.
train(config, Model, train_iter, test_iter, data_iter)



*************************************Fold:[1/5]**********************************
EPOCH:[1/10]


HBox(children=(FloatProgress(value=0.0, max=1233.0), HTML(value='')))

ITER:0, TRAIN_LOSS:3.316, TRAIN_ACC:6.25%,DEV_LOSS:3.124,DEV_ACC:10.73%,NO_IMPROVEMENT:0
ITER:500, TRAIN_LOSS:0.848, TRAIN_ACC:75.00%,DEV_LOSS:1.185,DEV_ACC:66.34%,NO_IMPROVEMENT:500
ITER:1000, TRAIN_LOSS:0.772, TRAIN_ACC:81.25%,DEV_LOSS:0.778,DEV_ACC:79.59%,NO_IMPROVEMENT:500

EPOCH:[2/10]


HBox(children=(FloatProgress(value=0.0, max=1233.0), HTML(value='')))

ITER:0, TRAIN_LOSS:1.024, TRAIN_ACC:65.62%,DEV_LOSS:0.706,DEV_ACC:80.99%,NO_IMPROVEMENT:233
ITER:500, TRAIN_LOSS:0.827, TRAIN_ACC:78.12%,DEV_LOSS:0.620,DEV_ACC:82.84%,NO_IMPROVEMENT:500


In [6]:
# Ensemble: average the class probabilities predicted by every fold checkpoint.
n_folds = 5
predictions = []
for i in range(n_folds):
    # map_location allows checkpoints saved on a GPU to be loaded on the
    # device selected in the configuration (e.g. a CPU-only machine).
    checkpoint = torch.load('output/checkpoint{}.pth.tar'.format(i), map_location=config.device)
    model.load_state_dict(checkpoint['state_dict'])
    all_pred = predict(config, model, test_iter)
    predictions.append(all_pred)

# Out-of-place mean over the folds, so the per-fold arrays in `predictions`
# stay untouched, followed by the most probable class per example.
pred = np.mean(predictions, axis=0)
pred = np.argmax(pred, axis=-1)

预测完成.

预测完成.

预测完成.

预测完成.

预测完成.



In [8]:
import pandas as pd

# Map the predicted class indices to label names and write the submission file.
submit = pd.read_csv("submit_content.csv")
# NOTE(review): the order of this list presumably matches the integer label
# encoding used when the training data was built -- confirm against DataIter.
labels = ['文化休闲', '医疗卫生', '经济管理', '教育科技', '城乡建设', '工业', '民政社区', '交通运输',
       '生态环境', '政法监察', '农业畜牧业', '文秘行政', '劳动人事', '资源能源', '信息产业', '旅游服务',
       '商业贸易', '气象水文测绘地震地理', '财税金融', '外交外事']
result = [labels[i] for i in pred]
submit["label"] = result
# index=False is the documented way to omit the row index from the CSV.
submit.to_csv("submit_content_2.csv", index=False)