In [1]:
# Dataset source (Toutiao text-classification corpus, zipped):
# https://github.com/skdjfla/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip

In [2]:
# List the local dataset directory to check which corpora are present.
!ls dataset

afqmc_public	  c3_public.zip        msra
afqmc_public.zip  cmrc2018_public      toutiao_cat_data.txt
c3_public	  cmrc2018_public.zip  toutiao_cat_data.txt.zip


In [1]:
# pandas 数据集读取，dataframe形式的
import pandas as pd
# 文件读取
import codecs

# 读取文本

# Parse the corpus in a single pass. Every record is a '_!_'-delimited line:
#   * field 1 holds a numeric category code; subtracting 100 yields the label.
#   * the text is the last field, unless the line ends with '_!_' (empty last
#     field), in which case the second-to-last field is used instead.
#     NOTE(review): presumably the last field is a keyword list and the one
#     before it is the title - confirm against the dataset description.
news_label = []
news_text = []

# The file is opened once, inside a context manager so the handle is closed,
# and with an explicit encoding so the result does not depend on the locale
# (assumes the corpus is UTF-8 - TODO confirm).
with open('./dataset/toutiao_cat_data.txt', encoding='utf-8') as corpus:
    for raw_line in corpus:
        record = raw_line.strip()
        if not record:
            # Skip blank lines (e.g. a trailing newline) instead of raising
            # IndexError while reading the category field.
            continue
        fields = record.split('_!_')
        news_label.append(int(fields[1]) - 100)
        news_text.append(fields[-1] if fields[-1] else fields[-2])

In [4]:
# Preview the first 30 parsed text entries.
news_text[:30]

['保利集团,马未都,中国科学技术馆,博物馆,新中国',
 '发酵床的垫料种类有哪些？哪种更好？',
 '上联：黄山黄河黄皮肤黄土高原。怎么对下联？',
 '林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣？',
 '黄杨木是什么树？',
 '上联：草根登上星光道，怎么对下联？',
 '什么是超写实绘画？',
 '松涛听雨莺婉转，下联？',
 '上联：老子骑牛读书，下联怎么对？',
 '上联：山水醉人何须酒。如何对下联？',
 '林风眠,黄海归来步步云,秋山图,计白当黑,山水画,江山万里图,张大千,巫峡清秋图,活眼,山雨欲来图',
 '牡丹,收藏价值',
 '有哪些让人感动的语句呢？',
 '上联，绿竹引清风，如何对下联？',
 '叶浅予,田世光,李苦禅,花鸟画,中央美术学院',
 '夕阳无语燕归愁，如何接下句？',
 '上联：山水醉人何须酒。如何对下联？',
 '上联：上班为下班，如何对下联？',
 '下联:夕陽西下已黄昏。上联是什麽？',
 '荷花,西湖,金粟词话,采莲女,念奴娇·赤壁怀古,林逋,荷叶',
 '佟丽娅,网络谣言,快乐大本营,李浩菲,谢娜,观众们',
 '汪涵,火星情报局,杨迪,主办方,谢娜,刘维',
 '飞纱,新娘,脱口秀,中国网,婚礼',
 '陆贞传奇,大红大紫,楚乔传,微博热搜,赵丽颖,花千骨,迪丽热巴,Angelababy',
 '戴上眼镜,刘德华,张翰,远大前程,杜志国,刘亦菲',
 '电影院,前任3,刘若英,张一白,田壮壮',
 '金刚狼3,休·杰克曼,神奇女侠,绯红女巫,超人,金刚狼',
 '张绍刚,新组合,腾讯视频,无限歌谣季,毛不易,父子',
 '中岛美嘉,滨崎步,张靓颖,演唱会,林子祥',
 '成龙改口决定不裸捐了，20亿财产给儿子一半，你怎么看？']

In [5]:
# Preview the first 30 labels (category code minus 100).
news_label[:30]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [4]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [6]:
# Split a small prefix of the corpus into training and validation parts.
# `stratify` samples by label so both parts share the same label distribution;
# `random_state` pins the shuffle so re-running the cell yields the same split.
SUBSET_SIZE = 500   # number of leading records used for this quick experiment
SPLIT_SEED = 42     # fixed seed for a reproducible split

subset_text = news_text[:SUBSET_SIZE]
subset_label = news_label[:SUBSET_SIZE]

x_train, x_test, train_label, test_label = train_test_split(
    subset_text,
    subset_label,
    test_size=0.2,
    stratify=subset_label,
    random_state=SPLIT_SEED,
)

In [7]:
# input_ids：字的编码
# token_type_ids：标识是第一个句子还是第二个句子
# attention_mask：标识是不是填充

In [8]:
# pip install transformers
# transformers bert相关的模型使用和加载
from transformers import BertTokenizer
# Tokenizer (with its vocabulary) for the pretrained Chinese BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Both splits are encoded with the same settings: truncation and padding
# enabled, with a maximum length of 64.
encode_options = {'truncation': True, 'padding': True, 'max_length': 64}
train_encoding = tokenizer(x_train, **encode_options)
test_encoding = tokenizer(x_test, **encode_options)

In [9]:
# Encode a sentence pair to show the three model inputs
# (input_ids, token_type_ids, attention_mask).
# truncation=True is needed for max_length to be honoured; without a
# truncation strategy the tokenizer warns that max_length is ignored.
tokenizer('我们是好学生', '好学生是我们', padding=True, truncation=True, max_length=120)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


{'input_ids': [101, 2769, 812, 3221, 1962, 2110, 4495, 102, 1962, 2110, 4495, 3221, 2769, 812, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping from field name to a per-sample indexable
    collection; `labels` is an indexable collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, plus a 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Wrap the encoded splits and their labels so a DataLoader can batch them.
train_dataset = NewsDataset(train_encoding, train_label)
test_dataset = NewsDataset(test_encoding, test_label)

In [11]:
# Inspect one training sample: a dict of tensors including its 'labels' entry.
train_dataset[1]

{'input_ids': tensor([ 101,  833, 7987,  753, 2099,  868,  671, 2190, 5468, 8024, 3297, 1962,
         2199,  753, 2099, 1146, 1166, 3123, 1762,  677,  678, 5468, 4638, 1928,
          671, 2099, 8024, 1963,  862,  868, 8043,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(1)}

In [12]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose argmax equals the label.

    `preds` is indexed as (sample, class) scores; `labels` is flattened
    before comparison, so any shape with one entry per sample works.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

In [13]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# Pretrained Chinese BERT with a sequence-classification head for 17 labels,
# moved to the GPU when one is available.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=17)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Turn per-sample access into mini-batches. Only the training data is
# shuffled; evaluation order does not need to be random.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Optimizer.
# NOTE(review): newer transformers releases may deprecate this AdamW in favor
# of torch.optim.AdamW - confirm against the installed version.
optim = AdamW(model.parameters(), lr=2e-5)

# The schedule must span the whole run. The training loops in this notebook
# run 4 epochs; sizing the schedule for a single epoch would decay the
# learning rate to zero after the first epoch.
NUM_EPOCHS = 4
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
def train():
    """Run one pass over `train_loader`, updating `model` after every batch.

    Reads the module-level `model`, `train_loader`, `optim`, `scheduler`,
    `device` and `epoch` (the latter only for log messages). Prints the
    current batch loss every 100 batches and the mean batch loss at the end.
    """
    model.train()
    n_batches = len(train_loader)
    loss_sum = 0
    for step, batch in enumerate(train_loader, start=1):
        # Forward pass on a fresh set of gradients.
        optim.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)
        batch_loss = model(ids, attention_mask=mask, labels=targets)[0]
        loss_sum += batch_loss.item()

        # Backward pass, with the global gradient norm clipped to 1.0.
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter and learning-rate updates.
        optim.step()
        scheduler.step()

        if step % 100 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, step, batch_loss.item(), step/n_batches*100))

    print("Epoch: %d, Average training loss: %.4f"%(epoch, loss_sum/len(train_loader)))
    
def validation():
    """Evaluate `model` on `test_dataloader` and print accuracy and mean loss.

    Reads the module-level `model`, `test_dataloader` and `device`.
    Accuracy is computed as correct predictions over all evaluated samples,
    so a smaller final batch is not over-weighted (averaging per-batch
    accuracies would give every batch the same weight regardless of size).
    The reported loss is the mean of the per-batch losses.
    """
    model.eval()
    total_correct = 0
    total_examples = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # Plain forward pass; no gradients are needed for evaluation.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Count hits instead of averaging per-batch accuracies.
        predicted = np.argmax(logits, axis=1).flatten()
        total_correct += int(np.sum(predicted == label_ids.flatten()))
        total_examples += label_ids.size

    avg_val_accuracy = total_correct / total_examples
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

# Train for 4 epochs, validating after each one. `epoch` is deliberately a
# module-level name: train() reads it when printing progress.
for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 1.1153, 4.00%
epoth: 0, iter_num: 200, loss: 0.6116, 8.00%
epoth: 0, iter_num: 300, loss: 1.2130, 12.00%
epoth: 0, iter_num: 400, loss: 0.2496, 16.00%
epoth: 0, iter_num: 500, loss: 0.4285, 20.00%
epoth: 0, iter_num: 600, loss: 0.3757, 24.00%
epoth: 0, iter_num: 700, loss: 0.6119, 28.00%
epoth: 0, iter_num: 800, loss: 0.3269, 32.00%
epoth: 0, iter_num: 900, loss: 0.5932, 36.00%
epoth: 0, iter_num: 1000, loss: 0.2557, 40.00%
epoth: 0, iter_num: 1100, loss: 0.3224, 44.00%
epoth: 0, iter_num: 1200, loss: 0.2232, 48.00%
epoth: 0, iter_num: 1300, loss: 0.7005, 52.00%
epoth: 0, iter_num: 1400, loss: 0.4618, 56.00%
epoth: 0, iter_num: 1500, loss: 0.3498, 60.00%
epoth: 0, iter_num: 1600, loss: 1.2221, 64.00%
epoth: 0, iter_num: 1700, loss: 0.3283, 68.00%
epoth: 0, iter_num: 1800, loss: 0.1679, 72.00%
epoth: 0, iter_num: 1900, loss: 0.3219, 76.00%
epoth: 0, iter_num: 2000, loss: 0.6863, 80.00%
epoth: 0, iter_num: 2100, loss: 0

KeyboardInterrupt: 

In [11]:
class FGM():
    """Adversarial-training helper that perturbs embedding parameters along their gradient.

    Intended use per batch: run the normal backward pass, call `attack()`,
    run a second forward/backward pass on the perturbed model, call
    `restore()`, then step the optimizer.
    """

    def __init__(self, model):
        self.model = model
        # Parameter name -> copy of its values taken just before perturbation.
        self.backup = {}

    def attack(self, epsilon=1., emb_name='emb'):
        """Add `epsilon * grad / ||grad||_2` to the selected parameters in place.

        Args:
            epsilon: L2 norm of the perturbation applied to each parameter.
            emb_name: substring that selects parameters by name; set it to the
                name of the embedding in your model, e.g. 'emb' for
                `self.emb = nn.Embedding(5000, 100)`. The default 'emb' still
                matches every name containing 'embedding'.

        Parameters that are frozen, have no gradient yet, or whose gradient
        norm is zero or NaN are left untouched and are not backed up.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if param.grad is None:
                # No backward pass has populated this parameter's gradient.
                continue
            norm = torch.norm(param.grad)  # 2-norm by default
            if norm != 0 and not torch.isnan(norm):
                self.backup[name] = param.data.clone()
                r_at = epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='emb'):
        """Restore every parameter perturbed by `attack()` and clear the backup.

        `emb_name` is accepted for interface compatibility; restoration is
        driven by what was actually backed up, so a parameter skipped by
        `attack()` cannot trigger a failure here.
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

In [14]:


def train():
    """Run one training epoch with FGM adversarial training.

    For each batch: a clean forward/backward pass, then `fgm.attack()`
    perturbs the embedding parameters, a second forward/backward pass
    accumulates the adversarial gradient on top of the clean one, and
    `fgm.restore()` puts the embeddings back. The accumulated gradient is
    clipped just before the optimizer step, so the norm bound applies to the
    gradient that is actually used for the update.

    Reads the module-level `model`, `train_loader`, `optim`, `scheduler`,
    `device` and `epoch` (the latter only for log messages).
    """
    model.train()
    fgm = FGM(model)
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # Clean forward pass on a fresh set of gradients.
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Clean backward pass; its gradient also defines the attack direction.
        loss.backward()

        # Adversarial pass: perturb the embeddings, run forward/backward again
        # so the adversarial gradient is added to the clean gradient, then
        # restore the original embedding values.
        fgm.attack()
        adv_outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        adv_outputs[0].backward()
        fgm.restore()

        # Clip the accumulated (clean + adversarial) gradient.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter and learning-rate updates.
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 10==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    """Evaluate `model` on `test_dataloader` and print accuracy and mean loss.

    Reads the module-level `model`, `test_dataloader` and `device`.
    Accuracy is computed as correct predictions over all evaluated samples,
    so a smaller final batch is not over-weighted (averaging per-batch
    accuracies would give every batch the same weight regardless of size).
    The reported loss is the mean of the per-batch losses.
    """
    model.eval()
    total_correct = 0
    total_examples = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # Plain forward pass; no gradients are needed for evaluation.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Count hits instead of averaging per-batch accuracies.
        predicted = np.argmax(logits, axis=1).flatten()
        total_correct += int(np.sum(predicted == label_ids.flatten()))
        total_examples += label_ids.size

    avg_val_accuracy = total_correct / total_examples
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

# Train for 4 epochs, validating after each one. `epoch` is deliberately a
# module-level name: train() reads it when printing progress.
for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 10, loss: 2.8839, 0.40%
epoth: 0, iter_num: 20, loss: 2.7609, 0.80%
epoth: 0, iter_num: 30, loss: 2.5918, 1.20%
epoth: 0, iter_num: 40, loss: 2.6522, 1.60%
epoth: 0, iter_num: 50, loss: 2.7058, 2.00%
epoth: 0, iter_num: 60, loss: 2.6441, 2.40%
epoth: 0, iter_num: 70, loss: 2.6895, 2.80%
epoth: 0, iter_num: 80, loss: 2.6126, 3.20%
epoth: 0, iter_num: 90, loss: 2.7909, 3.60%
epoth: 0, iter_num: 100, loss: 2.6045, 4.00%
epoth: 0, iter_num: 110, loss: 2.6010, 4.40%
epoth: 0, iter_num: 120, loss: 2.6245, 4.80%
epoth: 0, iter_num: 130, loss: 2.5980, 5.20%
epoth: 0, iter_num: 140, loss: 2.6549, 5.60%
epoth: 0, iter_num: 150, loss: 2.6406, 6.00%
epoth: 0, iter_num: 160, loss: 2.5500, 6.40%
epoth: 0, iter_num: 170, loss: 2.6483, 6.80%
epoth: 0, iter_num: 180, loss: 2.5613, 7.20%
epoth: 0, iter_num: 190, loss: 2.6371, 7.60%
epoth: 0, iter_num: 200, loss: 2.5517, 8.00%
epoth: 0, iter_num: 210, loss: 2.4984, 8.40%
epoth: 0, iter_num: 220, l