In [None]:
import torch
from transformers import BertTokenizer, BertModel,BertConfig
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable
import tqdm
import os
import time
import re
import pandas as pd
import string
import time
import collections
from collections import Counter
from nltk.corpus import stopwords
from itertools import chain
from sklearn import metrics
from bert_SourceCode.optimization import BertAdam
import time 
# Seed every random number generator used by the notebook (Python's `random`,
# NumPy and PyTorch) with the same value so that repeated runs draw the same
# random numbers.
SEED = 1234

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [None]:
# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint so that their vocabularies agree.
PRETRAINED_BERT = 'chinese-bert-wwm-ext'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_BERT)
bert = BertModel.from_pretrained(PRETRAINED_BERT)



In [None]:
# Each split is read as a header-less, tab-separated file and its two columns
# are then named `review` and `label`.
COLUMN_NAMES = ['review', 'label']

traindata = pd.read_csv("./dataset/train.txt", sep='\t', header=None)
validdata = pd.read_csv("./dataset/dev.txt", sep='\t', header=None)
testdata = pd.read_csv("./dataset/test.txt", sep='\t', header=None)

for _split in (traindata, validdata, testdata):
    _split.columns = COLUMN_NAMES

In [None]:
# Right-pad a sequence of token ids to a fixed length.
def pad(sentlist,maxlen,PAD=0):
    """Return a new list holding ``sentlist`` right-padded with ``PAD``.

    Args:
        sentlist: sequence of token ids.
        maxlen: minimum length of the returned list.
        PAD: filler value appended until the list has ``maxlen`` items.

    Returns:
        A new list of length ``max(len(sentlist), maxlen)``. Sequences that
        are already ``maxlen`` long or longer are returned as an unchanged
        copy; nothing is truncated here.
    """
    # Copy first: padding the argument in place would silently modify the
    # caller's data (e.g. the lists stored in a DataFrame column).
    padded_list = list(sentlist)
    # A non-positive repeat count produces an empty list, so long inputs
    # simply get nothing appended.
    padded_list.extend([PAD] * (maxlen - len(padded_list)))
    return padded_list

# Build the attention mask that hides padding positions.
def mask(sentlist):
    """Return one float per id: 1.0 where the id is greater than 0, else 0.0."""
    return [1.0 if token_id > 0 else 0.0 for token_id in sentlist]

# Pre-processing: add the [CLS]/[SEP] markers, convert tokens to ids, bring
# every sequence to the same length and build the attention mask.
def preprocess_data(dataframe,MAX=20):
    """Turn raw reviews into padded token ids plus attention masks.

    Args:
        dataframe: frame with a string column ``review`` and a column
            ``label``; it is copied, not modified.
        MAX: number of leading characters kept from every review. Id
            sequences are padded up to ``MAX + 2`` entries (room for the two
            marker tokens).

    Returns:
        A frame with the columns ``inputs_id``, ``label`` and
        ``attention_mask``.
    """
    data = dataframe.copy()
    data['review'] = data['review'].apply(lambda text: text[:MAX])

    # Wrap each (truncated) review in the BERT sentence markers, then
    # tokenize and map the tokens to vocabulary ids with the global tokenizer.
    marked = ['[CLS] ' + text + ' [SEP]' for text in data['review'].values]
    token_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
                 for sentence in marked]

    data['inputs_id'] = pd.Series(token_ids, index=data.index).apply(pad, maxlen=MAX + 2)
    data['attention_mask'] = data['inputs_id'].apply(mask)
    return data[['inputs_id', 'label', 'attention_mask']]

# Convert the pre-processed columns to tensors and wrap them in a DataLoader.
def load_data(data,batch_size,mode="train"):
    """Build a DataLoader yielding ``(inputs, masks, labels)`` batches.

    Args:
        data: frame with the columns ``inputs_id``, ``attention_mask`` and
            ``label``.
        batch_size: number of samples per batch.
        mode: ``"train"`` shuffles the samples; any other value keeps the
            original order.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``TensorDataset`` whose
        tensors are int64 ids, float masks and int64 labels.
    """
    def column_tensor(name, dtype):
        return torch.tensor(data[name].tolist(), dtype=dtype)

    dataset = torch.utils.data.TensorDataset(
        column_tensor('inputs_id', torch.int64),
        column_tensor('attention_mask', torch.float),
        column_tensor('label', torch.int64),
    )
    # Only the training split is shuffled.
    return torch.utils.data.DataLoader(dataset, shuffle=(mode == "train"), batch_size=batch_size)

In [None]:
# Build one batch iterator per split; only the training iterator shuffles.
BATCH_SIZE = 64

train_iter = load_data(preprocess_data(traindata), batch_size=BATCH_SIZE, mode="train")
valid_iter = load_data(preprocess_data(validdata), mode='eva', batch_size=BATCH_SIZE)
test_iter = load_data(preprocess_data(testdata), mode='eva', batch_size=BATCH_SIZE)

In [None]:
class BERTFCSentiment(nn.Module):
    """BERT encoder followed by attention pooling and a two-layer classifier.

    The token representations in the encoder's first output are scored by a
    small additive attention (``tanh(h W) v``), averaged with the resulting
    weights and fed to a fully connected decoder.

    Args:
        bert: encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments, its result is indexed with
            ``[0]``, and it must expose ``config.to_dict()['hidden_size']``
            (presumably a ``transformers.BertModel``).
        output_dim: number of target classes.
    """

    def __init__(self,bert,output_dim):

        super(BERTFCSentiment,self).__init__()

        self.bert = bert
        # NOTE: this is the config object of the `bert` instance passed in, so
        # the two flags below are set on that shared object, not on a copy.
        self.bertconfig=bert.config
        self.bertconfig.output_attentions=True
        self.bertconfig.output_hidden_states=True

        embedding_dim = bert.config.to_dict()['hidden_size']
        # Fine-tune every encoder weight together with the new layers.
        for param in self.bert.parameters():
            param.requires_grad = True

        self.decoder=nn.Sequential(nn.Linear(embedding_dim,embedding_dim//2),
            nn.Dropout(p=0.5),nn.ReLU(),nn.Linear(embedding_dim//2,output_dim))

        # Attention parameters: projection matrix W and scoring vector v.
        self.weight_W = nn.Parameter(torch.rand(embedding_dim,embedding_dim))
        self.weight_proj = nn.Parameter(torch.rand(embedding_dim, 1))

    def forward(self, batch):
        """Classify one batch.

        Args:
            batch: 3-tuple ``(input_ids, input_mask, labels)``; the labels are
                ignored. ``input_ids`` and ``input_mask`` are expected to be
                ``[batch size, sent len]`` with 0 in the mask at padding.

        Returns:
            ``(output, att_score)`` where ``output`` is
            ``[batch size, out dim]`` and ``att_score`` is
            ``[batch size, sent len, 1]`` and sums to 1 over the sentence.
        """
        input_ids,input_mask,_=batch
        bert_out=self.bert(input_ids=input_ids,attention_mask=input_mask)
        # bert_out[0]: per-token states; bert_out[1] would be the pooled
        # vector, which could be passed to the decoder instead of the
        # attention-pooled states.
        last_hidden_state = bert_out[0]

        # ---- additive self-attention over the token states ----
        u = torch.tanh(torch.matmul(last_hidden_state, self.weight_W))
        # u = [batch size, sent len, hidden size]
        att = torch.matmul(u, self.weight_proj)
        # att = [batch size, sent len, 1]

        # Padding positions must not receive attention: push their scores to
        # the most negative finite value so the softmax assigns them weight 0.
        # (A finite value instead of -inf keeps a fully masked row NaN-free.)
        pad_positions = (input_mask == 0).unsqueeze(-1)
        att = att.masked_fill(pad_positions, torch.finfo(att.dtype).min)

        att_score = F.softmax(att, dim=1)
        # att_score = [batch size, sent len, 1], sums to 1 over sent len

        # Attention-weighted sum of the token states.
        scored_x = torch.sum(last_hidden_state * att_score, dim=1)
        # scored_x = [batch size, hidden size]
        output=self.decoder(scored_x)
        # output = [batch size, out dim]

        return output,att_score

In [None]:
class BERTLSTMSentimentNet(nn.Module):
    """BERT encoder -> LSTM -> attention pooling -> two-layer classifier.

    Args:
        bert: encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments, its result is indexed with
            ``[0]``, and it must expose ``config.to_dict()['hidden_size']``
            (presumably a ``transformers.BertModel``).
        output_dim: number of target classes.
        num_hiddens: hidden size of the LSTM (per direction).
        bidirectional: whether the LSTM runs in both directions.
        **kwargs: forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, bert,output_dim,num_hiddens,bidirectional, **kwargs):
        super(BERTLSTMSentimentNet, self).__init__(**kwargs)

        self.bert = bert
        # NOTE: this is the config object of the `bert` instance passed in, so
        # the two flags below are set on that shared object, not on a copy.
        self.bertconfig=bert.config
        self.bertconfig.output_attentions=True
        self.bertconfig.output_hidden_states=True
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        # Fine-tune every encoder weight together with the new layers.
        for param in self.bert.parameters():
            param.requires_grad = True

        self.num_hiddens = num_hiddens
        self.bidirectional = bidirectional

        self.encoder = nn.LSTM(input_size=self.embedding_dim,
                               hidden_size=self.num_hiddens,
                               bidirectional=self.bidirectional,
                               dropout=0)

        # Width of the LSTM output for one time step. Every layer after the
        # LSTM is sized from it, so the uni-directional configuration works
        # too (the decoder used to assume two directions unconditionally).
        num_directions = 2 if self.bidirectional else 1
        feature_dim = self.num_hiddens * num_directions

        # Attention parameters: projection matrix W and scoring vector v.
        self.weight_W = nn.Parameter(torch.rand(feature_dim, feature_dim))
        self.weight_proj = nn.Parameter(torch.rand(feature_dim, 1))

        self.decoder=nn.Sequential(nn.Linear(feature_dim,self.num_hiddens),
            nn.Dropout(p=0.5),nn.ReLU(),nn.Linear(self.num_hiddens,output_dim))

    def forward(self, batch):
        """Classify one batch.

        Args:
            batch: 3-tuple ``(input_ids, input_mask, labels)``; the labels are
                ignored. ``input_ids`` and ``input_mask`` are expected to be
                ``[batch, seq_len]`` with 0 in the mask at padding.

        Returns:
            ``(outputs, att_score)`` where ``outputs`` is ``[batch, labels]``
            and ``att_score`` is ``[batch, seq_len, 1]`` and sums to 1 over
            the sequence.
        """
        input_ids,input_mask,_=batch
        bert_out=self.bert(input_ids=input_ids,attention_mask=input_mask)
        last_hidden_state = bert_out[0]
        # last_hidden_state = [batch, seq_len, embedding_dim]

        # The LSTM was built without batch_first, so it wants time-major input.
        states, hidden = self.encoder(last_hidden_state.permute([1, 0, 2]))
        # states = [seq_len, batch, num_directions * num_hiddens]
        x=states.permute([1,0,2])
        # x = [batch, seq_len, num_directions * num_hiddens]

        # ---- additive attention over the LSTM states ----
        u = torch.tanh(torch.matmul(x, self.weight_W))
        # u = [batch, seq_len, num_directions * num_hiddens]
        att = torch.matmul(u, self.weight_proj)
        # att = [batch, seq_len, 1]

        # Padding positions must not receive attention: push their scores to
        # the most negative finite value so the softmax assigns them weight 0.
        # (A finite value instead of -inf keeps a fully masked row NaN-free.)
        pad_positions = (input_mask == 0).unsqueeze(-1)
        att = att.masked_fill(pad_positions, torch.finfo(att.dtype).min)

        att_score = F.softmax(att, dim=1)
        # att_score = [batch, seq_len, 1], sums to 1 over seq_len

        # Attention-weighted sum of the LSTM states.
        scored_x = torch.sum(x * att_score, dim=1)
        # scored_x = [batch, num_directions * num_hiddens]
        outputs=self.decoder(scored_x)
        # outputs = [batch, labels]

        return outputs,att_score

In [None]:
class BERTATSentiment(nn.Module):
    """BERT encoder with attention pooling over the states of *all* layers.

    The per-layer hidden states returned by the encoder are concatenated
    along the sequence axis, scored by an additive attention
    (``tanh(h W) v``), averaged with the resulting weights and classified by
    a two-layer decoder.

    Args:
        bert: encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments, must return the per-layer
            hidden states as a sequence at index ``[2]``, and must expose
            ``config.to_dict()['hidden_size']`` (presumably a
            ``transformers.BertModel``).
        output_dim: number of target classes.
    """

    def __init__(self,bert,output_dim):

        super(BERTATSentiment,self).__init__()

        self.bert = bert
        # NOTE: this is the config object of the `bert` instance passed in, so
        # the flag below is set on that shared object, not on a copy. forward()
        # reads the per-layer states from index 2 of the encoder output.
        self.bertconfig=bert.config
        self.bertconfig.output_hidden_states=True

        embedding_dim = bert.config.to_dict()['hidden_size']
        # Fine-tune every encoder weight together with the new layers.
        for param in self.bert.parameters():
            param.requires_grad = True

        self.decoder=nn.Sequential(nn.Linear(embedding_dim,embedding_dim//2),
            nn.Dropout(p=0.5),nn.ReLU(),nn.Linear(embedding_dim//2,output_dim))

        # Attention parameters: projection matrix W and scoring vector v.
        self.weight_W = nn.Parameter(torch.rand(embedding_dim,embedding_dim))
        self.weight_proj = nn.Parameter(torch.rand(embedding_dim, 1))

    def forward(self, batch):
        """Classify one batch.

        Args:
            batch: 3-tuple ``(input_ids, input_mask, labels)``; the labels are
                ignored. ``input_ids`` and ``input_mask`` are expected to be
                ``[batch size, sent len]`` with 0 in the mask at padding.

        Returns:
            ``(outputs, att_score)`` where ``outputs`` is
            ``[batch size, out dim]`` and ``att_score`` is
            ``[batch size, n_layers * sent len, 1]`` and sums to 1 over its
            second axis.
        """
        input_ids,input_mask,_=batch
        bert_out=self.bert(input_ids=input_ids,attention_mask=input_mask)
        all_hidden_states=bert_out[2]

        # Simple fusion: lay the layers out one after another on the sequence
        # axis, i.e. position l * sent_len + t holds token t of layer l.
        x = torch.cat(all_hidden_states,1)
        # x = [batch size, n_layers * sent len, hidden size]

        # ---- additive attention over all layer/token states ----
        u = torch.tanh(torch.matmul(x, self.weight_W))
        # u = [batch size, n_layers * sent len, hidden size]
        att = torch.matmul(u, self.weight_proj)
        # att = [batch size, n_layers * sent len, 1]

        # Padding positions must not receive attention. The mask is tiled once
        # per layer to line up with the concatenation above, then padded
        # scores are pushed to the most negative finite value so the softmax
        # gives them weight 0 (finite rather than -inf to stay NaN-free).
        layer_mask = input_mask.repeat(1, len(all_hidden_states))
        pad_positions = (layer_mask == 0).unsqueeze(-1)
        att = att.masked_fill(pad_positions, torch.finfo(att.dtype).min)

        att_score = F.softmax(att, dim=1)
        # att_score = [batch size, n_layers * sent len, 1], sums to 1 on axis 1

        # Attention-weighted sum of the states.
        scored_x = torch.sum(x * att_score, dim=1)
        # scored_x = [batch size, hidden size]
        outputs=self.decoder(scored_x)
        # outputs = [batch size, out dim]

        return outputs,att_score

In [None]:
# Parameter settings for the BERT + attention-pooling classifier (BERTFCSentiment).
N_EPOCHS = 2
OUTPUT_DIM = 7
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
lr = 5e-5
model = BERTFCSentiment(bert,OUTPUT_DIM)

In [None]:
#Parameter settings for the bertlstm model (BERTLSTMSentimentNet)
#N_EPOCHS = 50
#OUTPUT_DIM = 7
#NUM_HIDDENS=64
#bidirectional=True
#device = torch.device('cuda:0')
#lr = 5e-5
#model=BERTLSTMSentimentNet(bert,OUTPUT_DIM,NUM_HIDDENS,bidirectional)

In [None]:
#Parameter settings for the bertat model (BERTATSentiment)
#N_EPOCHS = 2
#OUTPUT_DIM = 7
#device = torch.device('cuda:0')
#lr = 5e-5
#model = BERTATSentiment(bert,OUTPUT_DIM)

In [None]:
#for name, param in model.named_parameters():                
#    if name.startswith('bert'):
#        print(name)

In [None]:
# Focal loss is used instead of plain cross-entropy to cope with class imbalance.

class FocalLoss(nn.Module):
    r"""
        This criterion is an implementation of Focal Loss, which is proposed in
        Focal Loss for Dense Object Detection.

            Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])

        The losses are averaged across observations for each minibatch.

        Args:
            class_num(int): number of classes; only used to build the default
                                   ``alpha`` (all ones).
            alpha(Tensor or sequence of float) : per-class weighting factors, one entry per
                                   class, indexed by the target class id. Shapes ``(C,)`` and
                                   ``(C, 1)`` are both accepted.
            gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
                                   putting more focus on hard, misclassified examples
            size_average(bool): By default, the losses are averaged over observations for each minibatch.
                                However, if the field size_average is set to False, the losses are
                                instead summed for each minibatch.


    """
    def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        if alpha is None:
            self.alpha = torch.ones(class_num, 1)
        elif isinstance(alpha, torch.Tensor):
            self.alpha = alpha
        else:
            # Also accept plain Python sequences of weights.
            self.alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        """Compute the focal loss of a batch.

        Args:
            inputs: raw class scores (logits) of shape ``(N, C)``.
            targets: integer class ids with ``N`` elements.

        Returns:
            A scalar tensor: the mean (or the sum when ``size_average`` is
            False) of the per-sample focal losses.
        """
        ids = targets.view(-1, 1)

        # Log-probability of the true class, shape (N, 1). log_softmax is used
        # instead of log(softmax(x)) because the latter yields -inf once the
        # probability underflows to 0.
        log_p = F.log_softmax(inputs, dim=1).gather(1, ids)
        probs = log_p.exp()

        # alpha is a plain attribute, so Module.to() does not move it; follow
        # the device of the logits instead.
        if self.alpha.device != inputs.device:
            self.alpha = self.alpha.to(inputs.device)
        # Per-sample class weight reshaped to (N, 1) so that it lines up with
        # `probs`. Leaving it as (N,) would broadcast the product below into
        # an (N, N) matrix and mix the weights of different samples.
        alpha = self.alpha.reshape(-1)[ids.view(-1)].view(-1, 1)

        batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p

        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss

In [None]:
# Split the parameters into two groups: those whose name contains one of the
# markers in `network_param` are trained without weight decay, all the others
# with a decay of 0.01.
param_optimizer = list(model.named_parameters())
network_param = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


def _is_decay_exempt(param_name):
    """Tell whether ``param_name`` contains any marker from ``network_param``."""
    for marker in network_param:
        if marker in param_name:
            return True
    return False


optimizer_grouped_parameters = [
    {'params': [tensor for name, tensor in param_optimizer if not _is_decay_exempt(name)],
     'weight_decay': 0.01},
    {'params': [tensor for name, tensor in param_optimizer if _is_decay_exempt(name)],
     'weight_decay': 0.0},
]

# One optimizer step is taken per training batch, for N_EPOCHS epochs.
# `warmup` is presumably the fraction of `t_total` used for learning-rate
# warm-up -- confirm against bert_SourceCode.optimization.BertAdam.
total_steps = len(train_iter) * N_EPOCHS
optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=total_steps)
#optimizer = optim.Adam(model.parameters(),lr=lr)

# Per-class weights handed to the focal loss, one entry per class id.
alpha = Variable(torch.tensor([0.13,0.2,0.12,0.13,0.14,0.14,0.14]))
criterion = FocalLoss(class_num=OUTPUT_DIM, alpha=alpha)

model = model.to(device)
criterion = criterion.to(device)

In [None]:
def category_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    ``preds`` holds one row of class scores per sample and ``y`` the true
    class ids; the result is a 0-dim float tensor.
    """
    # the predicted class is the index of the highest score in each row
    _, predicted = preds.max(1)
    hits = (predicted == y).float()  # float so that the division below is fractional
    return hits.sum() / hits.shape[0]

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Every batch is moved to the global ``device``, passed through ``model``
    and used for one optimizer step.

    Args:
        model: network called with the whole batch tuple; returns
            ``(predictions, attention)``.
        iterator: yields ``(input_ids, input_mask, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)`` over the epoch.
    """
    model.train()

    batch_losses = []
    batch_accuracies = []

    for raw_batch in iterator:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        _, _, labels = on_device

        optimizer.zero_grad()

        predictions, _ = model(on_device)
        loss = criterion(predictions, labels)
        acc = category_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accuracies.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled; every
    batch is moved to the global ``device`` first.

    Args:
        model: network called with the whole batch tuple; returns
            ``(predictions, attention)``.
        iterator: yields ``(input_ids, input_mask, labels)`` batches.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)``.
    """
    model.eval()

    batch_losses = []
    batch_accuracies = []

    with torch.no_grad():
        for raw_batch in iterator:
            on_device = tuple(tensor.to(device) for tensor in raw_batch)
            _, _, labels = on_device

            predictions, _ = model(on_device)

            batch_losses.append(criterion(predictions, labels).item())
            batch_accuracies.append(category_accuracy(predictions, labels).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [None]:
# Class names for the evaluation report, one per line of the file. The context
# manager makes sure the file handle is closed once the names are read.
with open( 'dataset/class.txt') as class_file:
    class_list = [x.strip() for x in class_file]

def test(model, iterator, criterion):
    """Evaluate ``model`` and build per-class reports.

    Same loop as a plain evaluation (eval mode, no gradients, batches moved
    to the global ``device``), but the true and predicted class ids of every
    sample are also collected.

    Args:
        model: network called with the whole batch tuple; returns
            ``(predictions, attention)``.
        iterator: yields ``(input_ids, input_mask, labels)`` batches.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy, report, confusion)`` where
        ``report`` is the text produced by
        ``sklearn.metrics.classification_report`` (named with the global
        ``class_list``) and ``confusion`` is the confusion matrix.
    """
    epoch_loss = 0
    epoch_acc = 0
    # Per-batch arrays are gathered in lists and joined once at the end;
    # calling np.append inside the loop would copy everything collected so
    # far on every batch.
    label_chunks = []
    predict_chunks = []

    model.eval()
    with torch.no_grad():

        for batch in iterator:
            batch = tuple(t.to(device) for t in batch)
            _, _, labels = batch

            predictions, _ = model(batch)

            loss = criterion(predictions, labels)

            acc = category_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            label_chunks.append(labels.data.cpu().numpy())
            # predicted class = index of the highest score in each row
            predict_chunks.append(torch.max(predictions, 1)[1].cpu().numpy())

    # np.concatenate rejects an empty list, so keep the empty-int-array result
    # for an iterator that yields no batches.
    labels_all = np.concatenate(label_chunks) if label_chunks else np.array([], dtype=int)
    predicts_all = np.concatenate(predict_chunks) if predict_chunks else np.array([], dtype=int)

    report = metrics.classification_report(labels_all, predicts_all, target_names=class_list, digits=4)
    confusion = metrics.confusion_matrix(labels_all, predicts_all)

    return epoch_loss / len(iterator), epoch_acc / len(iterator),report,confusion
    

In [None]:


# Train for N_EPOCHS epochs, evaluating on the validation split after each one
# and recording the per-epoch metrics for the plots.
best_valid_loss = float('inf')
trainacc,val_acc,trainloss,val_loss=[],[],[],[]

for epoch in range(N_EPOCHS):

    start = time.time()

    train_loss,train_acc= train(model, train_iter, optimizer, criterion)
    valid_loss,valid_acc = evaluate(model, valid_iter, criterion)

    end = time.time()

    # wall-clock seconds for one training pass plus one validation pass
    runtime = end - start
    # Saving the model is not essential here; this notebook is mainly for
    # experiments. The weights with the lowest validation loss are kept.
    # NOTE: the file name says "bertlstm" regardless of which model is trained.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bertlstm-model.pt')
    trainacc.append(train_acc)
    val_acc.append(valid_acc)
    trainloss.append(train_loss)
    val_loss.append(valid_loss)

    # These numbers come from valid_iter, so they are reported as validation
    # metrics (they used to be printed under a "test" label).
    print('epoch: %d, train loss: %.4f, train acc: %.4f, valid loss: %.4f, valid acc: %.4f, time: %.2f' %
      (epoch, train_loss, train_acc, valid_loss,valid_acc, runtime))

In [None]:
import matplotlib.pyplot as plt
# Plot training and validation accuracy per epoch. The second curve comes from
# the validation split, so it is labelled "Validation" rather than "Test".
plt.plot(trainacc)
plt.plot(val_acc)
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training and validation loss per epoch.
plt.plot(trainloss)
plt.plot(val_loss)
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Final evaluation on the held-out test split: overall metrics, the per-class
# report and the confusion matrix.
result = test(model, test_iter, criterion)
test_loss, test_acc, test_report, test_confusion = result

print("test_acc:  %.4f ,test loss: %.4f" % (test_acc, test_loss))
print(test_report)
print(test_confusion)