In [None]:
import copy
import os
import shutil

import matplotlib
import numpy as np
import pandas as pd
import sklearn
import torch
from d2l import torch as d2l
from sklearn.metrics import accuracy_score, f1_score, classification_report,hamming_loss
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
# Load the raw CSV and shuffle its rows in one step.
# NOTE(review): the path is an empty placeholder and must be filled in before running.
data = sklearn.utils.shuffle(pd.read_csv(""))

In [None]:
from transformers import BertModel
from transformers import BertTokenizer
# Load the pretrained Chinese BERT tokenizer and encoder once at module level.
# `tokenizer` is later handed to the datasets and the decoder; `bert` is only
# used by the exploratory label-embedding cell further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
bert = BertModel.from_pretrained("bert-base-chinese")

In [None]:
class mydataset(Dataset):
    """Dataset of (text, label-id sequence) pairs plus a batch collate function.

    Args:
        content: indexable collection of raw text strings.
        label: indexable collection of label strings; multiple labels for one
            sample are separated by '/'.
        tokenizer: object exposing ``encode(text, add_special_tokens=...,
            truncation=..., padding=..., max_length=...)`` that returns a list
            of token ids (a BERT tokenizer at the call sites in this notebook).
        label_vocab: dict mapping label strings (and the special tokens
            "<PAD>" and "<EOS>") to integer ids.
    """

    def __init__(self, content, label,tokenizer, label_vocab):
        self.content = content
        self.label = label
        self.tokenizer = tokenizer
        self.label_vocab = label_vocab

    def __getitem__(self, index):
        """Return ``(text, [label ids])`` for the sample at ``index``."""
        con = self.content[index]
        lab = self.label[index]
        # Each '/'-separated label name becomes one id from the label vocabulary.
        lab_index = [self.label_vocab[name] for name in lab.split('/')]
        return con, lab_index

    def batch_data_process(self,batch_datas):
        """Collate a list of ``(text, label ids)`` samples into padded tensors.

        Text is encoded by the tokenizer (which adds its own special tokens)
        and padded to the longest text of the batch; labels use the
        dataset's own vocabulary, get "<EOS>" appended and are padded with
        "<PAD>". The batch maximum plays the role of a fixed ``num_steps``:
        shorter sequences are padded, longer ones truncated.

        Returns:
            ``(con_index, con_valid_len, lab_index, lab_valid_len)`` where the
            ``*_index`` tensors hold ids and the ``*_valid_len`` tensors count
            the non-padding positions of every row.
        """
        # Ask the tokenizer for its padding id instead of assuming it;
        # fall back to 0 (the value the original code hard-coded).
        con_pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if con_pad_id is None:
            con_pad_id = 0
        # Use the vocabulary held by this dataset, not a module-level global.
        lab_pad_id = self.label_vocab["<PAD>"]
        lab_eos_id = self.label_vocab["<EOS>"]

        # +2 leaves room for the two special tokens the tokenizer adds.
        max_length = max(len(con) for con, _ in batch_datas) + 2
        # Never request more positions than the tokenizer says the model supports;
        # truncation=True then shortens over-long texts instead of crashing later.
        limit = getattr(self.tokenizer, "model_max_length", None)
        if isinstance(limit, int) and limit > 0:
            max_length = min(max_length, limit)

        con_index, lab_index = [], []
        for con, lab in batch_datas:
            con_index.append(self.tokenizer.encode(
                con, add_special_tokens=True, truncation=True,
                padding='max_length', max_length=max_length))
            lab_index.append(lab)
        max_lab_len = max(len(lab) for lab in lab_index)

        # Every row becomes max_lab_len + 1 long: labels, <EOS>, then <PAD> filler.
        lab_index = [lab + [lab_eos_id] + [lab_pad_id] * (max_lab_len - len(lab))
                     for lab in lab_index]
        con_index = torch.tensor(con_index)
        lab_index = torch.tensor(lab_index)
        con_valid_len = (con_index != con_pad_id).type(torch.int32).sum(1)
        lab_valid_len = (lab_index != lab_pad_id).type(torch.int32).sum(1)
        return con_index, con_valid_len, lab_index, lab_valid_len

    def __len__(self):
        return len(self.content)

In [None]:
# Pull the text and label columns out of the shuffled frame as plain lists.
a = list(data["content"])
b = list(data["label"])

# Label vocabulary: 14 quality labels (ids 0-13) followed by the three special
# tokens; the id of each entry is simply its position in the list.
label_vocab = {
    name: idx
    for idx, name in enumerate([
        "内容准确", "内容不准确", "内容完整", "内容不完整",
        "难度适中", "难度不合理", "逻辑合理", "逻辑不合理",
        "更新及时", "更新不及时", "内容有用", "内容没用",
        "内容新颖", "内容普通", "<BOS>", "<PAD>", "<EOS>",
    ])
}

# Small un-shuffled loader over the whole frame, collated by the dataset itself.
dataset = mydataset(a, b, tokenizer, label_vocab)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=dataset.batch_data_process,
)

In [None]:
# Exploratory cell: turn a small batch of label-id sequences into BERT sentence
# embeddings, one label at a time, printing intermediate shapes along the way.
# `labels` is ordered so that a label's position equals its id.
labels = [ "内容准确","内容不准确","内容完整","内容不完整", "难度适中","难度不合理", "逻辑合理","逻辑不合理",
    "更新及时","更新不及时","内容有用", "内容没用", "内容新颖","内容普通", "<BOS>", "<PAD>","<EOS>"]
# Hand-written sample batch of 4 sequences with 3 label ids each.
label = torch.tensor([[ 3,  7, 16],
        [10, 16, 15],
        [10, 16, 15],
        [ 1, 16, 15]])
# Dummy first entry so torch.cat has something to append to; sliced off at the end.
batch_labels_embedding = torch.zeros(1,label.size(1),768)
for oneBatch in label:
    # Same dummy-row trick per sequence; the row is dropped via o[1:, :] below.
    o = torch.zeros(1,768)
    for words in oneBatch:
        i = int(words.item()) 
        print(i)
        l = labels[i]      
        print(l)
        # Encode the label text and take BERT's pooled output as its embedding.
        # No torch.no_grad() here, so autograd history is kept for every call.
        token = tokenizer.encode(l, return_tensors="pt",padding=True)  
        label_embedding = bert(token).pooler_output           
        print("labels_embedding",label_embedding.size())
        o = torch.cat((o,label_embedding),0)
        print("o ",o.size())
    # Drop the dummy row and add a leading batch axis.
    labels_embedding = o[1:,:].unsqueeze(dim=0)    
    print("labels_embedding",labels_embedding.size())
    batch_labels_embedding = torch.cat((batch_labels_embedding,labels_embedding),0)
    print(batch_labels_embedding.size())
# Drop the dummy batch entry; one embedding per label id in `label` remains.
batch_labels_embedding_final = batch_labels_embedding[1:,:,:]
print(batch_labels_embedding_final.size())

In [None]:
# Scratch check: slicing off the first entry along dim 0 keeps the other axes intact.
labelll = torch.tensor([
    [[3, 0], [7, 0], [1, 6]],
    [[1, 0], [1, 6], [1, 5]],
])
print(labelll.shape)
labellll = labelll[1:]
print(labellll, labellll.shape)

In [None]:
class myBert(nn.Module):
    """Thin ``nn.Module`` wrapper around the pretrained "bert-base-chinese" model."""

    def __init__(self):
        super(myBert, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-chinese")

    def forward(self,batch_x):
        """Run the wrapped BERT model on ``batch_x`` and return its output unchanged."""
        return self.bert(batch_x)
class Seq2SeqEncoder(nn.Module):
    """Encoder that feeds BERT token representations through an LSTM.

    Args:
        mybert: module called as ``mybert(token_ids)`` whose result, indexed
            with ``[0]``, is the per-token representation
            (batch, steps, embed_size).
        num_hiddens: hidden size of the LSTM.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers.
        embed_size: width of the BERT token representations. When omitted it
            is read from ``mybert.config.hidden_size`` or
            ``mybert.bert.config.hidden_size`` instead of relying on a
            module-level ``embed_size`` global as the original code did.

    Raises:
        ValueError: if ``embed_size`` is omitted and cannot be inferred.
    """

    def __init__(self, mybert, num_hiddens, num_layers, dropout=0,
                 embed_size=None, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.bert = mybert
        if embed_size is None:
            embed_size = self._infer_embed_size(mybert)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    @staticmethod
    def _infer_embed_size(mybert):
        """Look up the hidden size on the module's (or its wrapped model's) config."""
        for candidate in (mybert, getattr(mybert, "bert", None)):
            config = getattr(candidate, "config", None)
            hidden_size = getattr(config, "hidden_size", None)
            if hidden_size is not None:
                return hidden_size
        raise ValueError(
            "embed_size was not given and could not be inferred from mybert's config")

    def forward(self, input, *args):
        """Encode token ids; extra positional arguments are accepted but unused.

        Returns:
            ``(output, state)`` exactly as produced by ``nn.LSTM``.
        """
        embedd_input = self.bert(input)[0]
        # nn.LSTM is built without batch_first, so move the step axis to the front.
        embedd_input = embedd_input.permute(1, 0, 2)
        output, state = self.rnn(embedd_input)
        return output, state
class AttentionDecoder(d2l.Decoder):
    """Base interface for attention-based decoders.

    Subclasses must override ``attention_weights``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def attention_weights(self):
        """Attention weights recorded by the decoder; abstract here."""
        raise NotImplementedError
class Seq2SeqAttentionDecoder(AttentionDecoder):
    """LSTM decoder with additive attention whose inputs are BERT label embeddings.

    Args:
        tokenizer: tokenizer used to encode label names for BERT.
        mybert: module called as ``mybert(token_ids)``; element ``[1]`` of its
            result is used as the embedding of a label name.
        vocab_size: number of output classes of the final linear layer.
        embed_size: width of one label embedding; must match the size of
            ``mybert(...)[1]`` because it is concatenated with the attention
            context as LSTM input.
        num_hiddens: LSTM / attention hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout used by the attention module and between LSTM layers.
    """

    # Label names ordered so that a name's position equals its id.
    LABELS = ("内容准确", "内容不准确", "内容完整", "内容不完整",
              "难度适中", "难度不合理", "逻辑合理", "逻辑不合理",
              "更新及时", "更新不及时", "内容有用", "内容没用",
              "内容新颖", "内容普通", "<BOS>", "<PAD>", "<EOS>")

    def __init__(self,tokenizer, mybert, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention = d2l.AdditiveAttention(
            num_hiddens, num_hiddens, num_hiddens, dropout)
        self.tokenizer = tokenizer
        self.bert = mybert
        self.rnn = nn.LSTM(
            embed_size + num_hiddens, num_hiddens, num_layers,
            dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)
        # Defined up front so `attention_weights` is readable before the first forward.
        self._attention_weights = []

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        """Build the decoder state from the encoder's ``(outputs, hidden_state)``."""
        outputs, hidden_state = enc_outputs
        # Encoder outputs are step-major; attention expects batch-major keys/values.
        return (outputs.permute(1, 0, 2), hidden_state, enc_valid_lens)

    def label_represtation_embedding(self,X):
        """Embed a (batch, steps) tensor of label ids with BERT.

        Each label id is mapped to its name, encoded with the tokenizer and
        passed through ``self.bert``; element ``[1]`` of the result is the
        label's embedding. Tensors are created on ``X``'s device rather than
        unconditionally on CUDA, so the decoder also runs on CPU.

        Returns:
            Tensor of shape (batch, steps, embedding width).
        """
        device = X.device
        # Each distinct label is embedded once per call and reused for repeats,
        # instead of one BERT forward pass per token.
        embedding_by_id = {}
        batch_rows = []
        for one_sequence in X:
            step_embeddings = []
            for word in one_sequence:
                label_id = int(word.item())
                if label_id not in embedding_by_id:
                    token = self.tokenizer.encode(
                        self.LABELS[label_id], return_tensors="pt", padding=True).to(device)
                    # [1] has a leading batch axis of size 1; drop it.
                    embedding_by_id[label_id] = self.bert(token)[1].squeeze(0)
                step_embeddings.append(embedding_by_id[label_id])
            batch_rows.append(torch.stack(step_embeddings))
        return torch.stack(batch_rows)

    def forward(self, X, state):
        """Decode label ids ``X`` (batch, steps) one step at a time.

        Returns:
            ``(outputs, [enc_outputs, hidden_state, enc_valid_lens])`` where
            ``outputs`` holds the batch-major scores of the final linear layer.
        """
        enc_outputs, hidden_state, enc_valid_lens = state

        # Step-major so the loop below iterates over decoding steps.
        X = self.label_represtation_embedding(X).permute(1, 0, 2)
        outputs, self._attention_weights = [], []
        for x in X:
            # nn.LSTM state is an (h, c) tuple: the attention query must be the top
            # layer of h. Indexing the tuple with [-1] would select the whole cell
            # state instead. A bare tensor state (GRU-style) is still supported.
            layer_hidden = hidden_state[0] if isinstance(hidden_state, tuple) else hidden_state
            query = torch.unsqueeze(layer_hidden[-1], dim=1)
            context = self.attention(
                query, enc_outputs, enc_outputs, enc_valid_lens)
            # Concatenate attention context and label embedding on the feature axis.
            x = torch.cat((context, torch.unsqueeze(x, dim=1)), dim=-1)
            out, hidden_state = self.rnn(x.permute(1, 0, 2), hidden_state)
            outputs.append(out)
            self._attention_weights.append(self.attention.attention_weights)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.permute(1, 0, 2), [enc_outputs, hidden_state,
                                          enc_valid_lens]

    @property
    def attention_weights(self):
        """Attention weights collected during the most recent ``forward`` call."""
        return self._attention_weights
class EncoderDecoder(nn.Module):
    """Chains an encoder and a decoder into a single module."""

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder, self.decoder = encoder, decoder

    def forward(self ,enc_X, dec_X, *args):
        """Encode ``enc_X``, initialise the decoder state, then decode ``dec_X``.

        Extra positional arguments are forwarded to both the encoder and
        ``decoder.init_state``.
        """
        initial_state = self.decoder.init_state(self.encoder(enc_X, *args), *args)
        return self.decoder(dec_X, initial_state)

In [None]:
def sequence_mask(X, valid_len, value=0):
    """Overwrite, in place, every position of ``X`` beyond its row's valid length.

    Args:
        X: tensor whose first two axes are (batch, steps).
        valid_len: per-row count of leading positions to keep.
        value: fill value for the masked positions.

    Returns:
        The same tensor object ``X`` after modification.
    """
    positions = torch.arange(X.shape[1], dtype=torch.float32, device=X.device)
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[torch.logical_not(keep)] = value
    return X
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes the contribution of padded positions."""

    def forward(self, pred, label, valid_len):
        """Return one loss value per sequence.

        Args:
            pred: scores laid out as (batch, steps, classes).
            label: target class ids, (batch, steps).
            valid_len: number of non-padding steps in every row of ``label``.
        """
        # Per-token losses are required so the mask can be applied afterwards.
        self.reduction = 'none'
        # CrossEntropyLoss wants the class axis second, hence the permute.
        per_token_loss = super().forward(pred.permute(0, 2, 1), label)
        keep = sequence_mask(torch.ones_like(label), valid_len)
        # The mean runs over all steps, padded ones included (they contribute zero).
        return (per_token_loss * keep).mean(dim=1)

In [None]:
def train_and_valid(net, train_data_iter,valid_data_iter, lr, num_epochs, tgt_vocab, device):
    """Train ``net`` with teacher forcing, report losses per epoch, save the weights.

    Args:
        net: encoder-decoder called as ``net(X, dec_input, X_valid_len)`` and
            returning ``(Y_hat, state)``.
        train_data_iter: iterable of ``(X, X_valid_len, Y, Y_valid_len)`` batches.
        valid_data_iter: iterable with the same batch layout, used for validation.
        lr: learning rate for Adam.
        num_epochs: number of passes over ``train_data_iter``.
        tgt_vocab: label vocabulary; must contain ``'<BOS>'``.
        device: device the model and the batches are moved to.

    Side effects:
        Prints average train/validation loss after every epoch and writes the
        final ``state_dict`` to "./data/net_parameter.pt".
    """
    # Modules that belong to a Hugging Face `transformers` model carry pretrained
    # weights; re-initialising their Linear layers would throw those weights away.
    pretrained_ids = set()
    for module in net.modules():
        if type(module).__module__.split(".")[0] == "transformers":
            pretrained_ids.update(id(child) for child in module.modules())

    def xavier_init_weights(m):
        if id(m) in pretrained_ids:
            return
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        # The recurrent layers in this notebook are LSTMs; GRU is kept for parity.
        if type(m) in (nn.GRU, nn.LSTM):
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    def teacher_forcing_input(Y):
        # Decoder input is the target shifted right by one step with <BOS> in front,
        # so step t never sees the label it is supposed to predict.
        bos = torch.tensor([tgt_vocab['<BOS>']] * Y.shape[0],
                           device=device).reshape(-1, 1)
        return torch.cat([bos, Y[:, :-1]], 1)

    net.apply(xavier_init_weights)
    net.to(device)

    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    for epoch in range(num_epochs):
        # Accumulators restart every epoch so each report covers that epoch only.
        train_loss = 0
        valid_loss = 0

        net.train()
        for batch in train_data_iter:
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            Y_hat, _ = net(X, teacher_forcing_input(Y), X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len).sum()
            optimizer.zero_grad()
            l.backward()
            nn.utils.clip_grad_norm_(net.parameters(),1)
            optimizer.step()
            train_loss += l.item()

        net.eval()
        with torch.no_grad():
            for batch in valid_data_iter:
                X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
                Y_hat, _ = net(X, teacher_forcing_input(Y), X_valid_len)
                valid_loss += loss(Y_hat, Y, Y_valid_len).sum().item()

        # Average per batch, each over its own iterator; max(..., 1) guards empty ones.
        train_loss = train_loss / max(len(train_data_iter), 1)
        valid_loss = valid_loss / max(len(valid_data_iter), 1)
        print('Epoch: {} \t Avgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'
                      .format(epoch, train_loss, valid_loss))

    # Persist the weights reached after the final epoch (not a best-of checkpoint).
    save_path = "./data/net_parameter.pt"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(net.state_dict(), save_path)

In [None]:
# Label vocabulary: ids 0-13 are the real labels, 14-16 the special tokens.
label_vocab = {
    "内容准确":0,"内容不准确":1,"内容完整":2,"内容不完整":3, "难度适中":4,"难度不合理":5, "逻辑合理":6,"逻辑不合理":7,
    "更新及时":8,"更新不及时":9,"内容有用":10, "内容没用":11, "内容新颖":12,"内容普通":13, "<BOS>":14, "<PAD>":15,"<EOS>":16
}
# TODO: the three assignments below have no right-hand side and are a syntax
# error as written; fill in the hyperparameters before running this cell.
# NOTE(review): embed_size presumably has to equal the BERT output width, since
# both the encoder LSTM and the decoder consume BERT outputs; 768 is the width
# hard-coded in the decoder's label embedding — confirm.
embed_size, num_hiddens, num_layers, dropout = 
batch_size, num_steps = 
lr, num_epochs =
device = d2l.try_gpu()
# A single myBert instance is shared by the encoder and the decoder.
mybert = myBert()
encoder = Seq2SeqEncoder(mybert,num_hiddens, num_layers,
                        dropout)
decoder = Seq2SeqAttentionDecoder(tokenizer,mybert,len(label_vocab), embed_size, num_hiddens, num_layers,
                        dropout)
net = EncoderDecoder(encoder, decoder)

In [None]:
# Wrap the train/validation splits in datasets that share tokenizer and vocabulary.
train_dataset = mydataset(train_content, train_label, tokenizer, label_vocab)
valid_dataset = mydataset(valid_content, valid_label, tokenizer, label_vocab)

# Both loaders reshuffle on every pass and collate with their dataset's own method.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_dataset.batch_data_process,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=valid_dataset.batch_data_process,
)

# Placeholders; nothing in this cell reads them.
checkpoint_path = ''
best_model = ''

train_and_valid(
    net, train_dataloader, valid_dataloader, lr, num_epochs, label_vocab, device
)

In [None]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab,
                    device, save_attention_weights=True, num_steps=None,
                    max_length=55):
    """Greedily decode the label sequence for one source sentence.

    Args:
        net: model exposing ``encoder`` and ``decoder`` (with ``init_state``
            and ``attention_weights``).
        src_sentence: raw input text.
        src_vocab: tokenizer used to encode ``src_sentence``. If the object has
            no ``encode`` method, the module-level ``tokenizer`` is used, which
            is what the original implementation always did.
        tgt_vocab: label vocabulary containing ``'<BOS>'`` and ``'<EOS>'``.
        device: device on which the input tensors are created.
        save_attention_weights: collect the decoder's attention weights per step.
        num_steps: maximum number of decoding steps; defaults to
            ``len(tgt_vocab)`` so decoding always terminates even if
            ``'<EOS>'`` is never predicted.
        max_length: padded/truncated length of the encoded source.

    Returns:
        ``(output_seq, attention_weight_seq)``: predicted label ids (without
        ``'<EOS>'``) and the attention weights collected per step.
    """
    net.eval()
    src_tokenizer = src_vocab if hasattr(src_vocab, "encode") else tokenizer
    pad_id = getattr(src_tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0
    if num_steps is None:
        num_steps = len(tgt_vocab)

    src_tokens = src_tokenizer.encode(
        src_sentence, add_special_tokens=True, truncation=True,
        padding='max_length', max_length=max_length)
    # Valid length = number of non-padding token ids.
    len_src_tokens = sum(1 for token_id in src_tokens if token_id != pad_id)
    enc_valid_len = torch.tensor([len_src_tokens], device=device)
    # Add the batch axis.
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)

    output_seq, attention_weight_seq = [], []
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        enc_outputs = net.encoder(enc_X, enc_valid_len)
        dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
        dec_X = torch.unsqueeze(torch.tensor(
            [tgt_vocab['<BOS>']], dtype=torch.long, device=device), dim=0)
        for _ in range(num_steps):
            Y, dec_state = net.decoder(dec_X, dec_state)
            # The most likely label becomes the next decoder input.
            dec_X = Y.argmax(dim=2)
            pred = dec_X.squeeze(dim=0).type(torch.int32).item()
            if save_attention_weights:
                attention_weight_seq.append(net.decoder.attention_weights)
            if pred == tgt_vocab['<EOS>']:
                break
            output_seq.append(pred)
    return output_seq, attention_weight_seq

In [None]:
# Restore saved weights into an independent copy so `net` itself stays untouched.
# The original called `model.load_state_dict`, but no `model` is defined in this
# notebook; the freshly created copy is the evident target.
# NOTE(review): the checkpoint path is an empty placeholder, and the prediction
# cell below still evaluates `net`, not `model_copy` — confirm which is intended.
model_copy = copy.deepcopy(net)
# map_location lets a checkpoint saved on GPU load on whatever `device` is in use.
model_copy.load_state_dict(torch.load('', map_location=device))

In [None]:
# Real labels occupy ids 0-13 in label_vocab; 14-16 are special tokens.
NUM_LABELS = 14
# Rows from this offset onward form the evaluation set.
# NOTE(review): presumably matches the train/validation split used earlier — confirm.
EVAL_START = 1750

# Predictions and references must cover the same rows; the original predicted on
# every row but compared against rows [1750:] only, so the metric calls would
# fail on mismatched lengths.
content_predict = list(data["content"])[EVAL_START:]
label_predict = list(data["label"])[EVAL_START:]

label_pre = []
for c in content_predict:
    labels, attention_weight_seq = predict_seq2seq(
        net, c, tokenizer, label_vocab, device)
    label_pre.append(labels)

# Multi-hot encode the predictions. Plain Python ints replace `np.int`, which no
# longer exists in current NumPy releases.
label_pre_pad = []
for L in label_pre:
    l = [0] * NUM_LABELS
    for j in L:
        # The decoder may emit <BOS>/<PAD> ids, which have no multi-hot slot.
        if 0 <= j < NUM_LABELS:
            l[j] = 1
    label_pre_pad.append(l)

# Multi-hot encode the reference labels of the same rows.
label_data = label_predict
label_data_pad = []
for L in label_data:
    lab_split = L.split('/')
    lab_index = [label_vocab[i] for i in lab_split]
    l = [0] * NUM_LABELS
    for j in lab_index:
        l[j] = 1
    label_data_pad.append(l)

In [None]:
# Score the multi-hot predictions against the multi-hot references.
accuracy = accuracy_score(y_true=label_data_pad, y_pred=label_pre_pad)
f1_score_micro = f1_score(
    y_true=label_data_pad, y_pred=label_pre_pad, average='micro'
)
f1_score_macro = f1_score(
    y_true=label_data_pad, y_pred=label_pre_pad, average='macro'
)
ham_loss = hamming_loss(y_true=label_data_pad, y_pred=label_pre_pad)

# Summary numbers first, then the per-label breakdown.
print(f"Accuracy Score = {accuracy}")
print(f"Hamming losss = {ham_loss}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")
print(classification_report(y_true=label_data_pad, y_pred=label_pre_pad))