In [1]:
import numpy as np
import torch
import string
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import csv
import time

# File locations: the three SNLI splits, the pretrained GloVe vectors and
# the file the vocabulary is written to.
data_train_dir = "Data/snli_1.0_train.csv"
data_dev_dir = "Data/snli_1.0_dev.csv"
data_test_dir = "Data/snli_1.0_test.csv"
embedding_file_dir = "Data/glove.6B.300d.txt"
worddict_dir = "Data/worddict.txt"

In [2]:
def read_data(data_dir):
    """Read premises, hypotheses and labels from an SNLI-style CSV file.

    A row is kept only when its first column is one of "entailment",
    "neutral" or "contradiction"; this drops the header row and examples
    without a gold label. Blank or truncated rows are skipped as well.
    Columns 5 and 6 are used as premise and hypothesis: every character of
    ``string.punctuation`` is replaced by a space and the text is lowercased.

    Args:
        data_dir: Path of the CSV file to read.

    Returns:
        A dict with the keys "premise", "hypothesis" and "labels", each a
        list with one entry per kept row. Labels are returned as strings.
    """
    labels_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
    punct_table = str.maketrans({key: " " for key in string.punctuation})
    premise = []
    hypothesis = []
    labels = []
    # newline='' is what the csv module expects for files handed to a reader;
    # utf-8 matches the encoding used for the other text files in this notebook.
    with open(data_dir, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; such rows (and rows that
            # are too short to hold both sentences) cannot be indexed safely.
            if len(row) < 7 or row[0] not in labels_map:
                continue
            premise.append(row[5].translate(punct_table).lower())
            hypothesis.append(row[6].translate(punct_table).lower())
            labels.append(row[0])
    return {"premise": premise,
            "hypothesis": hypothesis,
            "labels": labels}

def build_worddict(data, save_path=None):
    """Build the word <-> id vocabulary from a dataset and save it to disk.

    Ids are assigned in order of first appearance: the four special tokens
    "_PAD_", "_OOV_", "_BOS_", "_EOS_" get ids 0-3, followed by the tokens
    of all premises and then of all hypotheses. Sentences are tokenised with
    ``strip().split(" ")``, so consecutive spaces inside a sentence produce
    an empty-string token that also receives an id.

    Args:
        data: Dict with the keys "premise" and "hypothesis", each a list of
            sentence strings.
        save_path: File to write the vocabulary to, one "word<TAB>id" line
            per entry. Defaults to the module-level ``worddict_dir``.

    Returns:
        A tuple ``(word_id, id_word)`` of dicts mapping word -> id and
        id -> word.
    """
    def _tokens():
        # Stream tokens instead of collecting the whole corpus in one list.
        yield from ("_PAD_", "_OOV_", "_BOS_", "_EOS_")
        for key in ("premise", "hypothesis"):
            for sentence in data[key]:
                yield from sentence.strip().split(" ")

    word_id = {}
    id_word = {}
    for word in _tokens():
        if word not in word_id:
            new_id = len(word_id)
            word_id[word] = new_id
            id_word[new_id] = word

    # Save the vocabulary.
    if save_path is None:
        save_path = worddict_dir
    with open(save_path, "w", encoding='utf-8') as f:
        for word, idx in word_id.items():
            f.write("%s\t%d\n" % (word, idx))
    return word_id, id_word

def sentence2idList(sentence,word_id):
    """Encode one sentence as a list of vocabulary ids.

    The sentence is stripped and split on single spaces. The result starts
    with the id of "_BOS_" and ends with the id of "_EOS_"; any token that is
    not a key of ``word_id`` is encoded with the id of "_OOV_".

    Args:
        sentence: The sentence string to encode.
        word_id: Dict mapping word -> id; must contain the special tokens.

    Returns:
        A list of integer ids.
    """
    encoded = [word_id["_BOS_"]]
    encoded += [word_id[token] if token in word_id else word_id["_OOV_"]
                for token in sentence.strip().split(" ")]
    encoded.append(word_id["_EOS_"])
    return encoded

def data2id(data,word_id):
    """Convert a text dataset into id sequences and integer labels.

    Args:
        data: Dict with the parallel lists "premise", "hypothesis" and
            "labels" (label strings).
        word_id: Dict mapping word -> id, passed on to ``sentence2idList``.

    Returns:
        A dict with the keys "premise_id", "hypothesis_id" (lists of id
        lists) and "labels_id" (list of ints: entailment=0, neutral=1,
        contradiction=2). Examples whose label is not one of these three
        are left out.
    """
    labels_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
    encoded = {"premise_id": [],
               "hypothesis_id": [],
               "labels_id": []}
    for row, label in enumerate(data["labels"]):
        if label in labels_map:  # examples without a usable label are ignored
            encoded["premise_id"].append(
                sentence2idList(data["premise"][row], word_id))
            encoded["hypothesis_id"].append(
                sentence2idList(data["hypothesis"][row], word_id))
            encoded["labels_id"].append(labels_map[label])
    return encoded

def build_embeddings(embedding_file,word_id):
    """Build the embedding matrix for a vocabulary from a text vector file.

    Each line of the file is expected to be a word followed by its
    whitespace-separated vector components. Only words present in
    ``word_id`` are kept. The row of every word is selected by the id stored
    in ``word_id`` (not by dict iteration order). Words without a pretrained
    vector get a row drawn from a standard normal distribution, except
    "_PAD_", whose row stays all zeros.

    Args:
        embedding_file: Path of the UTF-8 text file holding the vectors.
        word_id: Dict mapping word -> integer id.

    Returns:
        A tuple ``(embeddings_map, embedding_matrix)``. ``embeddings_map``
        maps each vocabulary word found in the file to its vector components
        as read from the file (a list of strings); ``embedding_matrix`` is a
        float NumPy array with ``max(id) + 1`` rows.

    Raises:
        ValueError: If no word of the vocabulary occurs in the file, so the
            embedding dimension cannot be determined.
    """
    # Read the file and keep the vectors of vocabulary words.
    embeddings_map = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) < 2:
                continue  # blank or malformed line
            if fields[0] in word_id:
                embeddings_map[fields[0]] = fields[1:]
    if not embeddings_map:
        raise ValueError(
            "no vocabulary word was found in %r" % (embedding_file,))

    # Take the dimension from any matched vector rather than assuming that a
    # specific word such as "a" is part of the vocabulary.
    embedding_dim = len(next(iter(embeddings_map.values())))
    words_num = max(word_id.values()) + 1
    embedding_matrix = np.zeros((words_num, embedding_dim))

    # Fill the matrix.
    missed_cnt = 0
    for word, idx in word_id.items():
        if word in embeddings_map:
            embedding_matrix[idx] = np.asarray(embeddings_map[word],
                                               dtype=np.float64)
        elif word != "_PAD_":
            missed_cnt += 1
            embedding_matrix[idx] = np.random.normal(size=embedding_dim)
    print("missed word count: %d"%(missed_cnt))
    return embeddings_map,embedding_matrix

In [3]:
# Read the raw training and dev splits as text.
data_train_str=read_data(data_train_dir)
data_dev_str=read_data(data_dev_dir)
# Build the vocabulary from the training split only.
word_id,id_word=build_worddict(data_train_str)
# NOTE(review): three training hypotheses are overwritten by hard-coded row
# index. This happens after the vocabulary was built, so a word that occurs
# only in these replacement strings is not in word_id. Why these rows need
# patching is not visible here -- confirm against the dataset.
data_train_str["hypothesis"][91381]="cannot see picture to discribe"
data_train_str["hypothesis"][91382]="there is no picture"
data_train_str["hypothesis"][91383]="there is a picture"

In [4]:
# Encode both splits as id sequences with the training vocabulary.
data_train_id=data2id(data_train_str,word_id)
data_dev_id=data2id(data_dev_str,word_id)
# Load the pretrained vectors for the vocabulary words.
embeddings_map,embedding_matrix=build_embeddings(embedding_file_dir,word_id)
# Prints the number of rows of the embedding matrix.
print("embedding_matrix size: %d"%len(embedding_matrix))

missed word count: 5994
embedding_matrix size: 33268


In [5]:
class SnliDataSet(Dataset):
    """Dataset of zero-padded premise/hypothesis id tensors with labels.

    All premises are stored in one ``(num_sequence, max_premise_len)`` long
    tensor and all hypotheses in one ``(num_sequence, max_hypothesis_len)``
    long tensor. Shorter sequences are right-padded with 0; sequences longer
    than the maximum length are truncated to it.
    """

    def __init__(self,data,max_premise_len=None,max_hypothesis_len=None):
        """
        Args:
            data: Dict with the keys "premise_id" and "hypothesis_id" (lists
                of id lists) and "labels_id" (list of ints).
            max_premise_len: Width of the premise tensor. When None, the
                length of the longest premise is used.
            max_hypothesis_len: Width of the hypothesis tensor. When None,
                the length of the longest hypothesis is used.
        """
        self.num_sequence=len(data["premise_id"])  # number of rows of the tensors
        self.premise_len=[len(seq) for seq in data["premise_id"]]
        self.max_premise_len=max_premise_len
        if self.max_premise_len is None:
            # default=0 keeps an empty dataset constructible.
            self.max_premise_len=max(self.premise_len, default=0)

        self.hypothesis_len=[len(seq) for seq in data["hypothesis_id"]]
        self.max_hypothesis_len=max_hypothesis_len
        if self.max_hypothesis_len is None:
            self.max_hypothesis_len=max(self.hypothesis_len, default=0)
        print(self.num_sequence, self.max_premise_len)
        print(self.num_sequence, self.max_hypothesis_len)
        # Convert to tensors and keep them in self.data.
        self.data= {
            "premise":torch.zeros((self.num_sequence,self.max_premise_len),dtype=torch.long),
            "hypothesis":torch.zeros((self.num_sequence,self.max_hypothesis_len),dtype=torch.long),
            "labels":torch.zeros(len(data["labels_id"]),dtype=torch.long)
        }

        for i in range(self.num_sequence):
            # Truncate the source first: assigning a sequence that is longer
            # than the row would otherwise fail with a shape mismatch.
            premise=data["premise_id"][i][:self.max_premise_len]
            self.data["premise"][i,:len(premise)]=torch.tensor(premise,dtype=torch.long)
            hypothesis=data["hypothesis_id"][i][:self.max_hypothesis_len]
            self.data["hypothesis"][i,:len(hypothesis)]=torch.tensor(hypothesis,dtype=torch.long)
            self.data["labels"][i]=data["labels_id"][i]

    def __len__(self):
        """Return the number of examples."""
        return self.num_sequence

    def __getitem__(self,index):
        """Return example ``index`` as a dict.

        The dict holds the padded id tensors ("premises", "hypothesis"), the
        sequence lengths capped at the tensor width ("premises_len",
        "hypothesis_len") and the label tensor ("labels").
        """
        return {"premises": self.data["premise"][index],
                "premises_len":min(self.premise_len[index], self.max_premise_len),
                "hypothesis":self.data["hypothesis"][index],
                "hypothesis_len":min(self.hypothesis_len[index], self.max_hypothesis_len),
                "labels":self.data["labels"][index]
               }

In [6]:
# Hyper-parameters of the model and of the training run.
batch_size = 128
patience = 3
hidden_size = 128
dropout = 0.5
num_classes = 3
lr = 0.0001
epochs = 5
max_grad_norm = 10.0

# Run on the first GPU when CUDA is available, otherwise on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")

# Wrap the encoded splits in datasets/loaders; only the training loader shuffles.
train_data = SnliDataSet(data_train_id, max_premise_len=None, max_hypothesis_len=None)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dev_data = SnliDataSet(data_dev_id, max_premise_len=None, max_hypothesis_len=None)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
# Pretrained embedding matrix as a float tensor on the selected device.
embeddings = torch.tensor(embedding_matrix, dtype=torch.float).to(device)

549367 83
549367 66
9842 60
9842 56


In [7]:
def sort_by_seq_lens(batch, sequences_lengths, descending=True):
    """Sort a batch along dimension 0 by the length of its sequences.

    Args:
        batch: Tensor whose first dimension indexes the sequences.
        sequences_lengths: 1D tensor with the length of each sequence.
        descending: Sort from longest to shortest when True (default).

    Returns:
        A tuple ``(sorted_batch, sorted_seq_lens, sorting_index,
        restoration_index)``. ``sorted_batch.index_select(0,
        restoration_index)`` restores the original order of ``batch``.
    """
    sorted_seq_lens, sorting_index = sequences_lengths.sort(0, descending=descending)
    sorted_batch = batch.index_select(0, sorting_index)
    # Build the index range directly with the dtype/device of the lengths;
    # wrapping torch.arange in new_tensor() copy-constructs from a tensor and
    # triggers a UserWarning.
    idx_range = torch.arange(len(sequences_lengths),
                             dtype=sequences_lengths.dtype,
                             device=sequences_lengths.device)
    # Sorting the permutation itself yields its inverse.
    _, reverse_mapping = sorting_index.sort(0, descending=False)
    restoration_index = idx_range.index_select(0, reverse_mapping)
    return sorted_batch, sorted_seq_lens, sorting_index, restoration_index


def get_mask(sequences_batch, sequences_lengths):
    """Compute the padding mask of a batch of id sequences.

    The mask is 1.0 wherever the id is non-zero and 0.0 wherever it is zero.
    ``sequences_lengths`` only determines the width: the mask covers the
    first ``max(sequences_lengths)`` positions of every sequence.

    The mask is created on the device of ``sequences_batch``. Allocating it
    on the CPU and indexing it with a boolean tensor that lives on another
    device fails for CUDA batches.

    Args:
        sequences_batch: 2D tensor of ids, one row per sequence.
        sequences_lengths: Tensor with the length of each sequence.

    Returns:
        A float tensor of size (batch, max sequence length).
    """
    max_length = int(torch.max(sequences_lengths))
    return (sequences_batch[:, :max_length] != 0).to(torch.float)


def masked_softmax(tensor, mask):
    """Softmax over the last dimension of ``tensor`` restricted by ``mask``.

    The scores are multiplied by the mask before the softmax; the result is
    multiplied by the mask again and renormalised so that the unmasked
    entries of each row sum to one (a 1e-13 term guards the division).

    Args:
        tensor: Scores; the softmax runs over the last dimension.
        mask: Mask with fewer or as many dimensions as ``tensor``; missing
            dimensions are inserted at position 1 before broadcasting.

    Returns:
        A tensor with the size of ``tensor``.
    """
    original_shape = tensor.size()
    last_dim = original_shape[-1]
    flat_scores = tensor.view(-1, last_dim)

    # Insert singleton dimensions until the mask can be expanded to the scores.
    while mask.dim() < tensor.dim():
        mask = mask.unsqueeze(1)
    flat_mask = mask.expand_as(tensor).contiguous().float().view(-1, last_dim)

    weights = nn.functional.softmax(flat_scores * flat_mask, dim=-1)
    weights = weights * flat_mask
    weights = weights / (weights.sum(dim=-1, keepdim=True) + 1e-13)
    return weights.view(*original_shape)


def weighted_sum(tensor, weights, mask):
    """Batched weighted sum of ``tensor`` with masked output rows.

    Computes ``weights.bmm(tensor)`` and multiplies the result by ``mask``,
    broadcast so that each row of the result is scaled by its mask entry.

    Args:
        tensor: Batch of vectors to combine.
        weights: Batch of weight matrices applied to ``tensor``.
        mask: Mask over the rows of the result.

    Returns:
        The masked product, with the size of ``weights.bmm(tensor)``.
    """
    combined = weights.bmm(tensor)
    while mask.dim() < combined.dim():
        mask = mask.unsqueeze(1)
    row_mask = mask.transpose(-1, -2).expand_as(combined).contiguous().float()
    return combined * row_mask


def replace_masked(tensor, mask, value):
    """Replace the masked-out positions of ``tensor`` by ``value``.

    Args:
        tensor: Tensor whose positions along dimension 1 are masked.
        mask: Mask that is 1 for positions to keep and 0 for positions to
            replace.
        value: Value written at the replaced positions.

    Returns:
        ``tensor * mask + value * (1 - mask)``, with the mask broadcast over
        the last dimension.
    """
    keep = mask.unsqueeze(1).transpose(1, 2)
    return tensor * keep + value * (1.0 - keep)

In [8]:
class RNNDropout(nn.Dropout):
    """Dropout that reuses one mask for every position of a sequence.

    A single dropout mask of size (first dimension, last dimension) is drawn
    per call and applied to the input with a singleton dimension inserted at
    position 1, so the same features are dropped at every position.
    """

    def forward(self, sequences_batch):
        """Apply the shared dropout mask to ``sequences_batch``.

        Args:
            sequences_batch: Input tensor; the mask is drawn for its first
                and last dimensions.

        Returns:
            The input multiplied by the dropout mask.
        """
        n_rows = sequences_batch.shape[0]
        n_features = sequences_batch.shape[-1]
        shared_mask = nn.functional.dropout(
            sequences_batch.data.new_ones(n_rows, n_features),
            self.p,
            self.training,
            inplace=False)
        return shared_mask.unsqueeze(1) * sequences_batch


class Seq2SeqEncoder(nn.Module):
    """RNN encoder for batches of padded, variable-length sequences.

    The wrapped RNN is built with ``batch_first=True``. Inputs are packed
    before being fed to the RNN so that padding is not processed, and the
    outputs are padded again afterwards.
    """

    def __init__(self,
                 rnn_type,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 bias=True,
                 dropout=0.0,
                 bidirectional=False):
        """
        Args:
            rnn_type: RNN class to instantiate; must inherit from
                ``torch.nn.RNNBase`` (e.g. ``nn.LSTM``).
            input_size: Size of the input features.
            hidden_size: Size of the hidden state of the RNN.
            num_layers: Number of stacked RNN layers. Defaults to 1.
            bias: Whether the RNN uses bias terms. Defaults to True.
            dropout: Dropout value passed to the RNN. Defaults to 0.0.
            bidirectional: Whether the RNN is bidirectional. Defaults to
                False.
        """
        assert issubclass(rnn_type, nn.RNNBase),\
              "rnn_type must be a class inheriting from torch.nn.RNNBase"

        super(Seq2SeqEncoder, self).__init__()
        self.rnn_type = rnn_type
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.dropout = dropout
        self.bidirectional = bidirectional
        self._encoder = rnn_type(input_size,
                                 hidden_size,
                                 num_layers=num_layers,
                                 bias=bias,
                                 batch_first=True,
                                 dropout=dropout,
                                 bidirectional=bidirectional
                                )

    def forward(self, sequences_batch, sequences_lengths):
        """Encode a batch of padded sequences.

        Args:
            sequences_batch: Tensor of size (batch, sequence, input_size).
            sequences_lengths: 1D tensor with the length of each sequence.

        Returns:
            The RNN outputs in the original batch order, of size
            (batch, longest sequence in the batch, output features), with
            zeros at the padded positions.
        """
        # pack_padded_sequence wants the lengths as a CPU int64 tensor even
        # when the batch itself lives on a GPU.
        lengths = torch.as_tensor(sequences_lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False lets PyTorch sort by length internally and
        # hand the outputs back in the original batch order.
        packed_batch = nn.utils.rnn.pack_padded_sequence(sequences_batch,
                                                         lengths,
                                                         batch_first=True,
                                                         enforce_sorted=False)

        outputs, _ = self._encoder(packed_batch, None)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs,
                                                      batch_first=True)
        return outputs


class SoftmaxAttention(nn.Module):
    """Soft alignment between premises and hypotheses.

    The attention weights are the masked softmax of the dot products between
    the encoded premises and hypotheses; each side is then re-expressed as a
    weighted sum of the other side.
    """

    def forward(self,
                premise_batch,
                premise_mask,
                hypothesis_batch,
                hypothesis_mask):
        """
        Args:
            premise_batch: Batch of encoded premises.
            premise_mask: Mask of the premises.
            hypothesis_batch: Batch of encoded hypotheses.
            hypothesis_mask: Mask of the hypotheses.

        Returns:
            A tuple ``(attended_premises, attended_hypotheses)``: the
            hypotheses summed with the premise-to-hypothesis weights, and
            the premises summed with the hypothesis-to-premise weights.
        """
        # Pairwise dot products, and the same scores seen from the
        # hypothesis side.
        scores = premise_batch.bmm(
            hypothesis_batch.transpose(2, 1).contiguous())
        scores_transposed = scores.transpose(1, 2).contiguous()

        # Attention weights in both directions.
        premise_to_hypothesis = masked_softmax(scores, hypothesis_mask)
        hypothesis_to_premise = masked_softmax(scores_transposed, premise_mask)

        # Each side becomes a weighted sum of the other one.
        attended_premises = weighted_sum(
            hypothesis_batch, premise_to_hypothesis, premise_mask)
        attended_hypotheses = weighted_sum(
            premise_batch, hypothesis_to_premise, hypothesis_mask)
        return attended_premises, attended_hypotheses

In [None]:
class ESIM(nn.Module):
    """
    ESIM model from "Enhanced LSTM for Natural Language Inference"
    (Chen et al.): embedding, bidirectional LSTM encoding, soft attention
    between premise and hypothesis, projection, bidirectional LSTM
    composition, average/max pooling and an MLP classifier.
    """

    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_size,
                 embeddings=None,
                 padding_idx=0,
                 dropout=0.5,
                 num_classes=3,
                 device="cpu"):
        """
        Args:
            vocab_size: Number of entries of the embedding table.
            embedding_dim: Dimension of the word embeddings.
            hidden_size: Size of all the hidden layers of the network.
            embeddings: Tensor of size (vocab_size, embedding_dim) holding
                pretrained word embeddings, or None to initialise the
                embeddings randomly. Defaults to None.
            padding_idx: Index of the padding token in the inputs.
                Defaults to 0.
            dropout: Dropout rate used between the layers; 0 disables the
                dropout on the RNN inputs. Defaults to 0.5.
            num_classes: Number of output classes. Defaults to 3.
            device: Device the model is executed on. Defaults to 'cpu'.
        """
        super(ESIM, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = dropout
        self.device = device

        self._word_embedding = nn.Embedding(vocab_size,
                                            embedding_dim,
                                            padding_idx=padding_idx,
                                            _weight=embeddings)

        if dropout:
            self._rnn_dropout = RNNDropout(p=dropout)

        self._encoding = Seq2SeqEncoder(nn.LSTM,
                                        embedding_dim,
                                        hidden_size,
                                        bidirectional=True)

        self._attention = SoftmaxAttention()

        # The enhanced representation concatenates four bidirectional
        # vectors, hence 4 * 2 * hidden_size input features.
        self._projection = nn.Sequential(
            nn.Linear(8 * hidden_size, hidden_size),
            nn.ReLU())

        self._composition = Seq2SeqEncoder(nn.LSTM,
                                           hidden_size,
                                           hidden_size,
                                           bidirectional=True)

        # Average and max pooling of both sentences: 2 * 4 * hidden_size.
        self._classification = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(8 * hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_size, num_classes))

        # Initialise the weights and biases of the Linear and LSTM layers.
        self.apply(_init_esim_weights)

    @staticmethod
    def _enhance(encoded, attended):
        """Concatenate a sequence, its aligned counterpart, their difference and their product."""
        return torch.cat([encoded,
                          attended,
                          encoded - attended,
                          encoded * attended],
                         dim=-1)

    @staticmethod
    def _pool(states, mask):
        """Return the masked average and the masked maximum of ``states`` over dimension 1."""
        average = torch.sum(states * mask.unsqueeze(1).transpose(2, 1), dim=1)\
            / torch.sum(mask, dim=1, keepdim=True)
        maximum, _ = replace_masked(states, mask, -1e7).max(dim=1)
        return average, maximum

    def forward(self,
                premises,
                premises_lengths,
                hypotheses,
                hypotheses_lengths):
        """
        Args:
            premises: Batch of variable length sequences of word indices
                representing premises, of size (batch, premises_length).
            premises_lengths: 1D tensor with the lengths of the premises.
            hypotheses: Batch of variable length sequences of word indices
                representing hypotheses, of size (batch, hypotheses_length).
            hypotheses_lengths: 1D tensor with the lengths of the
                hypotheses.

        Returns:
            logits: Tensor of size (batch, num_classes) with the logits of
                each output class.
            probabilities: Tensor of size (batch, num_classes) with the
                probabilities of each output class.
        """
        premises_mask = get_mask(premises, premises_lengths).to(self.device)
        hypotheses_mask = get_mask(hypotheses, hypotheses_lengths)\
            .to(self.device)

        # Embed, then encode both sentences with the shared encoder.
        embedded_premises = self._word_embedding(premises)
        embedded_hypotheses = self._word_embedding(hypotheses)
        if self.dropout:
            embedded_premises = self._rnn_dropout(embedded_premises)
            embedded_hypotheses = self._rnn_dropout(embedded_hypotheses)

        encoded_premises = self._encoding(embedded_premises,
                                          premises_lengths)
        encoded_hypotheses = self._encoding(embedded_hypotheses,
                                            hypotheses_lengths)

        # Align each sentence with the other one.
        attended_premises, attended_hypotheses = self._attention(
            encoded_premises, premises_mask,
            encoded_hypotheses, hypotheses_mask)

        # Enhance, project and compose.
        projected_premises = self._projection(
            self._enhance(encoded_premises, attended_premises))
        projected_hypotheses = self._projection(
            self._enhance(encoded_hypotheses, attended_hypotheses))
        if self.dropout:
            projected_premises = self._rnn_dropout(projected_premises)
            projected_hypotheses = self._rnn_dropout(projected_hypotheses)

        v_ai = self._composition(projected_premises, premises_lengths)
        v_bj = self._composition(projected_hypotheses, hypotheses_lengths)

        # Pool over the sequence dimension and classify.
        v_a_avg, v_a_max = self._pool(v_ai, premises_mask)
        v_b_avg, v_b_max = self._pool(v_bj, hypotheses_mask)
        v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1)

        logits = self._classification(v)
        probabilities = nn.functional.softmax(logits, dim=-1)
        return logits, probabilities


def _init_esim_weights(module):
    """
    Initialise the weights of the ESIM model.
    """
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight.data)
        nn.init.constant_(module.bias.data, 0.0)

    elif isinstance(module, nn.LSTM):
        nn.init.xavier_uniform_(module.weight_ih_l0.data)
        nn.init.orthogonal_(module.weight_hh_l0.data)
        nn.init.constant_(module.bias_ih_l0.data, 0.0)
        nn.init.constant_(module.bias_hh_l0.data, 0.0)
        hidden_size = module.bias_hh_l0.data.shape[0] // 4
        module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0

        if (module.bidirectional):
            nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data)
            nn.init.orthogonal_(module.weight_hh_l0_reverse.data)
            nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0)
            nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0)
            module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0

In [None]:
# Build the model on top of the pretrained embedding matrix and move it to
# the selected device.
model = ESIM(
    vocab_size=embeddings.shape[0],
    embedding_dim=embeddings.shape[1],
    hidden_size=hidden_size,
    embeddings=embeddings,
    dropout=dropout,
    num_classes=num_classes,
    device=device,
).to(device)

In [None]:
# Training setup: loss, optimizer and learning-rate schedule. The scheduler
# halves the learning rate as soon as the monitored score stops increasing.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=0.5,
    patience=0,
)


In [None]:
def getCorrectNum(probs, targets):
    """Count the predictions that match the targets.

    Args:
        probs: Tensor of size (batch, classes) with the class scores.
        targets: Tensor of size (batch,) with the expected class indices.

    Returns:
        The number of rows whose highest-scoring class equals the target,
        as a Python number.
    """
    predicted = probs.argmax(dim=1)
    return predicted.eq(targets).sum().item()

def train(model, data_loader, optimizer, criterion, max_gradient_norm):
    """Train the model for one epoch.

    Args:
        model: The model to train; its ``device`` attribute selects where
            the batches are moved to.
        data_loader: Iterable of batches, each a dict with the keys
            "premises", "premises_len", "hypothesis", "hypothesis_len" and
            "labels".
        optimizer: Optimizer updating the parameters of the model.
        criterion: Loss function applied to the logits and the labels.
        max_gradient_norm: Maximum norm used to clip the gradients.

    Returns:
        A tuple ``(epoch_time, epoch_loss, epoch_accuracy)``: elapsed
        seconds, mean loss per batch and the fraction of correctly
        classified examples of the dataset.
    """
    model.train()
    device = model.device

    epoch_started = time.time()
    loss_sum = 0
    n_correct = 0
    n_batches = 0

    for step, batch in enumerate(data_loader):
        batch_started = time.time()
        # Move the batch to the device of the model.
        premises = batch["premises"].to(device)
        premises_len = batch["premises_len"].to(device)
        hypothesis = batch["hypothesis"].to(device)
        hypothesis_len = batch["hypothesis_len"].to(device)
        labels = batch["labels"].to(device)

        # Reset the gradients, run the forward pass, back-propagate, clip
        # the gradients and update the weights.
        optimizer.zero_grad()
        logits, probs = model(premises, premises_len, hypothesis, hypothesis_len)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()

        loss_sum += loss.item()
        n_correct += getCorrectNum(probs, labels)
        n_batches += 1
        if step % 100 == 0:
            message = "Training  ------>   Batch count: {:d}/{:d},  batch time: {:.4f}s,  batch average loss: {:.4f}"
            print(message.format(n_batches,
                                 len(data_loader),
                                 time.time() - batch_started,
                                 loss_sum / (step + 1)))
        del premises, premises_len, hypothesis, hypothesis_len, labels

    elapsed = time.time() - epoch_started
    mean_loss = loss_sum / len(data_loader)
    accuracy = n_correct / len(data_loader.dataset)
    return elapsed, mean_loss, accuracy

def validate(model, data_loader, criterion):
    """Evaluate the model on a dataset without updating it.

    The model is switched to evaluation mode and the whole pass runs under
    ``torch.no_grad()``, so no autograd graph is built for the forward
    passes.

    Args:
        model: The model to evaluate; its ``device`` attribute selects where
            the batches are moved to.
        data_loader: Iterable of batches, each a dict with the keys
            "premises", "premises_len", "hypothesis", "hypothesis_len" and
            "labels".
        criterion: Loss function applied to the logits and the labels.

    Returns:
        A tuple ``(epoch_time, epoch_loss, epoch_accuracy)``: elapsed
        seconds, mean loss per batch and the fraction of correctly
        classified examples of the dataset.
    """
    model.eval()
    device=model.device

    time_epoch_start= time.time()
    running_loss=0
    correct_cnt=0
    batch_cnt=0

    # Gradients are never needed here; disabling them saves memory and time.
    with torch.no_grad():
        for index,batch in enumerate(data_loader):
            time_batch_start= time.time()
            # Move the batch to the device of the model.
            premises=batch["premises"].to(device)
            premises_len=batch["premises_len"].to(device)
            hypothesis=batch["hypothesis"].to(device)
            hypothesis_len=batch["hypothesis_len"].to(device)
            labels=batch["labels"].to(device)

            # Forward pass.
            logits,probs=model(premises,premises_len,hypothesis,hypothesis_len)

            # Loss and accuracy bookkeeping.
            loss = criterion(logits, labels)
            running_loss+=loss.item()
            correct_cnt+=getCorrectNum(probs,labels)
            batch_cnt+=1
            if index%100==0:
                # Labelled "Validating" so the log can be told apart from train().
                print("Validating ------>   Batch count: {:d}/{:d},  batch time: {:.4f}s,  batch average loss: {:.4f}"
                  .format(batch_cnt,len(data_loader),time.time()-time_batch_start, running_loss/(index+1)))
            del premises
            del premises_len
            del hypothesis
            del hypothesis_len
            del labels

    epoch_time = time.time() - time_epoch_start
    epoch_loss = running_loss / len(data_loader)
    epoch_accuracy = correct_cnt / len(data_loader.dataset)
    return epoch_time,epoch_loss,epoch_accuracy



In [None]:
#模型训练

In [None]:
# State tracked across the epochs.
best_score=0.0
train_losses=[]
valid_losses=[]
patience_cnt=0


for epoch in range(epochs):
    # Training.
    print("-"*50,"Training epoch %d"%(epoch),"-"*50)
    epoch_time,epoch_loss,epoch_accuracy =train(model,train_loader,optimizer,criterion,max_grad_norm)
    train_losses.append(epoch_loss)
    print("Training time: {:.4f}s, loss :{:.4f}, accuracy: {:.4f}%".format(epoch_time, epoch_loss, (epoch_accuracy*100)))

    # Validation.
    print("-"*50,"Validating epoch %d"%(epoch),"-"*50)
    epoch_time_dev, epoch_loss_dev, epoch_accuracy_dev = validate(model,dev_loader,criterion)
    valid_losses.append(epoch_loss_dev)
    print("Validating time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".format(epoch_time_dev, epoch_loss_dev, (epoch_accuracy_dev*100)))

    # Update the learning rate from the validation accuracy: the plateau
    # scheduler (mode="max") must watch the held-out score, the same one
    # early stopping uses, not the training accuracy.
    scheduler.step(epoch_accuracy_dev)

    # Early stopping on the validation accuracy.
    if epoch_accuracy_dev< best_score:
        patience_cnt+=1
    else:
        best_score=epoch_accuracy_dev
        patience_cnt=0
    if patience_cnt>=patience:
        print("-"*50,"Early stopping","-"*50)
        break

    # Save a checkpoint after every epoch.
    torch.save({"epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "train_losses": train_losses,
                "valid_losses": valid_losses},
               "model_train_dir_"+str(epoch)+".pth")