In [1]:
import sys
import jieba
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from torchtext import data
from time import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# hyper parameters

STR_MAXLEN = 30
BATCH_SIZE = 256
DEVICE = torch.device("cuda:0"if torch.cuda.is_available() else "cpu")
EMBED_DIM = 300
HIDDEN_DIM = 100
DEEP_LAYERS = [200]
LEARNING_RATE = 1e-3
EPOCHES = 80
DECAY_STEP = 2
DECAY_GAMMA = 0.99
CLASS_WEIGHT = [0.6116, 2.7397]
def print_flush(data, args=None):
    if args == None:
        print(data)
    else:
        print(data, args)
    sys.stdout.flush()
    
def pad_seq(seq, max_length):
    length = len(seq[0])
    pad_leng = 0 if length > max_length else (max_length-length)
    if pad_leng == 0:
        seq = seq[:, :max_length]
    else:
        seq = torch.cat([seq, torch.ones(len(seq), pad_leng).long().to(DEVICE)], dim=1)
    return seq

def wordlist_to_matrix(pretrain_path, wordlist, device, dim=300):
    word_vec = {}
    with open(pretrain_path, encoding='utf-8') as fr:
        for line in fr:
            line = line.split(' ')
            word = line[0]
            vec = line[1:]
            word_vec[word] = np.array(vec, dtype=float)
    word_vec_list = []
    oov = 0
    oov_words = []
    for idx, word in enumerate(wordlist):
        try:
            vector = np.array(word_vec[word], dtype=float).reshape(1,dim)
        except:
            oov += 1
            oov_words.append(word)
            # print(word)
            vector = np.random.rand(1, dim)
        word_vec_list.append(torch.from_numpy(vector))
    wordvec_matrix = torch.cat(word_vec_list)
    print("Load embedding finished.")
    print("Total words count: {}, oov count: {}.".format(wordvec_matrix.size()[0], oov))
    return wordvec_matrix if device == -1 else wordvec_matrix.to(device)

In [3]:
# prepare data
def tokenizer(text): # create a tokenizer function
    return [txt for txt in text]
class BatchWrapper:
    def __init__(self, dl, iter_columns):
        self.dl, self.iter_columns = dl, iter_columns  # we pass in the list of attributes for x &amp;amp;amp;amp;lt;g class="gr_ gr_3178 gr-alert gr_spell gr_inline_cards gr_disable_anim_appear ContextualSpelling ins-del" id="3178" data-gr-id="3178"&amp;amp;amp;amp;gt;and y&amp;amp;amp;amp;lt;/g&amp;amp;amp;amp;gt;

    def __iter__(self):
        for batch in self.dl:
            yield (getattr(batch, attr) for attr in self.iter_columns)

    def __len__(self):
        return len(self.dl)
    
print_flush('process raw data...')
TEXT = data.Field(sequential=True, use_vocab=True, eos_token='<EOS>', init_token='<BOS>',pad_token='<PAD>', 
                  batch_first=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("txt1", TEXT), ("txt2", TEXT),
                 ("label", LABEL)]

train = data.TabularDataset(path='../datasets/train.csv', format='csv', skip_header=True, fields=tv_datafields)
valid = data.TabularDataset(path='../datasets/valid.csv', format='csv', skip_header=True, fields=tv_datafields)

TEXT.build_vocab(train, valid, min_freq=3)
print_flush('Building vocabulary Finished.')
matrix = wordlist_to_matrix('../datasets/pretrain_embedding/pretrain_full_emb.txt', TEXT.vocab.itos, DEVICE)

process raw data...
Building vocabulary Finished.
Load embedding finished.
Total words count: 1517, oov count: 8.


In [4]:
train_iter = data.BucketIterator(dataset=train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.txt1) + len(x.txt2), shuffle=False, device=DEVICE, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=BATCH_SIZE, device=DEVICE, shuffle=False, repeat=False)

train_dl = BatchWrapper(train_iter, ["txt1", "txt2", "label"])
valid_dl = BatchWrapper(valid_iter, ["txt1", "txt2", "label"])

print_flush('prepare data done!')

prepare data done!


In [5]:
class ConvLayer(nn.Module):
    '''
    convolution layer for abcnn
    Attributes
    ----------
    inception : bool
        whether use inception module
    '''
    def __init__(self, isAbcnn2, sentence_length, filter_width, filter_height, filter_channel, inception):
        super(ConvLayer, self).__init__()
        if inception:
            self.model = InceptionModule(1 if isAbcnn2 else 2, sentence_length, filter_width, filter_height, filter_channel)
        else:
            self.model = convolution(1 if isAbcnn2 else 2, filter_width, filter_height, filter_channel, filter_width-1)

    def forward(self, x):
        '''
        1. convlayer
            size (batch_size, filter_channel, width, 1)
        2. transpose
            size (batch_size, 1, width, filter_channel)
        Parameters
        ----------
        x : 4-D torch Tensor
            size (batch_size, 1, width, height)
        
        Returns
        -------
        output : 4-D torch Tensor
            size (batch_size, 1, width, filter_channel)
        '''
        output = self.model(x)
        output = output.permute(0, 3, 2, 1)
        return output

def cosine_similarity(x1, x2):
    '''compute cosine similarity between x1 and x2
    Parameters
    ----------
    x1, x2 : 2-D torch Tensor
        size (batch_size, 1)
    Returns
    -------
    distance : 2-D torch Tensor
        similarity result of size (batch_size, 1)
    '''
    return F.cosine_similarity(x1, x2).unsqueeze(1)

def manhattan_distance(x1, x2):
    '''compute manhattan distance between x1 and x2 (not in paper)
    Parameters
    ----------
    x1, x2 : 2-D torch Tensor
        size (batch_size, 1)
    Returns
    -------
    distance : 2-D torch Tensor
        similarity result of size (batch_size, 1)
    '''
    return torch.div(torch.norm((x1 - x2), 1, 1, keepdim=True), x1.size()[1])

def convolution(in_channel, filter_width, filter_height, filter_channel, padding):
    '''convolution layer
    '''
    model = nn.Sequential(
        nn.Conv2d(in_channel, filter_channel, (filter_width, filter_height), stride=1, padding=(padding, 0)),
        nn.BatchNorm2d(filter_channel),
        nn.Tanh()
    )
    return model
    
def attention_matrix(x1, x2, eps=1e-6):
    '''compute attention matrix using match score
    
    1 / (1 + |x · y|)
    |·| is euclidean distance
    Parameters
    ----------
    x1, x2 : 4-D torch Tensor
        size (batch_size, 1, sentence_length, h)
    
    Returns
    -------
    output : 3-D torch Tensor
        match score result of size (batch_size, sentence_length(for x2), sentence_length(for x1))
    '''
    eps = torch.tensor(eps).to(DEVICE)
    one = torch.tensor(1.).to(DEVICE)
    euclidean = (torch.pow(x1 - x2.permute(0, 2, 1, 3), 2).sum(dim=3) + eps).sqrt()
    return (euclidean + one).reciprocal()

class ApLayer(nn.Module):
    '''column-wise averaging over all columns
    '''

    def __init__(self, pool_width, height):
        super(ApLayer, self).__init__()
        self.ap = nn.AvgPool2d((pool_width, 1), stride=1)
        self.height = height

    def forward(self, x):
        '''
        1. average pooling
            x size (batch_size, 1, 1, height)
        2. representation vector for the sentence
            output size (batch_size, height)
        Parameters
        ----------
        x : 4-D torch Tensor
            convolution output of size (batch_size, 1, sentence_length, height)
        
        Returns
        -------
        output : 2-D torch Tensor
            representation vector of size (batch_size, height)
        '''
        
        
        return self.ap(x).view([-1, self.height])

class WpLayer(nn.Module):
    '''column-wise averaging over windows of w consecutive columns
    Attributes
    ----------
    attention : bool
        compute layer with attention matrix
    '''
    def __init__(self, sentence_length, filter_width, attention):
        super(WpLayer, self).__init__()
        self.attention = attention
        if attention:
            self.sentence_length = sentence_length
            self.filter_width = filter_width
        else:
            self.wp = nn.AvgPool2d((filter_width, 1), stride=1)

    def forward(self, x, attention_matrix=None):
        '''
        if attention
            reweight the convolution output with attention matrix
        else
            average pooling
        Parameters
        ----------
        x : 4-D torch Tensor
            convolution output of size (batch_size, 1, sentence_length + filter_width - 1, height)
        attention_matrix: 2-D torch Tensor
            attention matrix between (convolution output x1 and convolution output x2) of size (batch_size, sentence_length + filter_width - 1)
        
        Returns
        -------
        output : 4-D torch Tensor
            size (batch_size, 1, sentence_length, height)
        '''
        if self.attention:
            pools = []
            attention_matrix = attention_matrix.unsqueeze(1).unsqueeze(3)
            for i in range(self.sentence_length):
                pools.append((x[:, :, i:i+self.filter_width, :] * attention_matrix[:, :, i:i+self.filter_width, :]).sum(dim=2, keepdim=True))

            return torch.cat(pools, dim=2)
        
        else:
            return self.wp(x)        
    
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 and classname.find('Layer') == -1:
        nn.init.xavier_uniform_(m.weight)
    elif classname.find('Linear') != -1:
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.1)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [6]:
class Abcnn1Portion(nn.Module):
    '''Part of Abcnn1
    '''

    def __init__(self, in_dim, out_dim):
        super(Abcnn1Portion, self).__init__()
        self.batchNorm = nn.BatchNorm2d(2)
        self.attention_feature_layer = nn.Linear(in_dim, out_dim)

    def forward(self, x1, x2):
        '''
        1. compute attention matrix
            attention_m : size (batch_size, sentence_length, sentence_length)
        2. generate attention feature map(weight matrix are parameters of the model to be learned)
            x_attention : size (batch_size, 1, sentence_length, height)
        3. stack the representation feature map and attention feature map
            x : size (batch_size, 2, sentence_length, height)
        4. batch norm(not in paper)
        Parameters
        ----------
        x1, x2 : 4-D torch Tensor
            size (batch_size, 1, sentence_length, height)
        Returns
        -------
        (x1, x2) : list of 4-D torch Tensor
            size (batch_size, 2, sentence_length, height)
        '''
        attention_m = attention_matrix(x1, x2)
#         print('attention matrix', attention_m)
#         print('size', attention_m.size())
        x1_attention = self.attention_feature_layer(attention_m.permute(0, 2, 1))
#         print('x1 attention', x1_attention)
#         print('size', x1_attention.size())

        x1_attention = x1_attention.unsqueeze(1)
        x1 = torch.cat([x1, x1_attention], 1)

        x2_attention = self.attention_feature_layer(attention_m)
        x2_attention = x2_attention.unsqueeze(1)
        x2 = torch.cat([x2, x2_attention], 1)

        x1 = self.batchNorm(x1)
        x2 = self.batchNorm(x2)
        
        return (x1, x2)

In [14]:
class Abcnn1(nn.Module):
    '''
    ABCNN1
    1. ABCNN1
    2. wide convolution
    3. W-ap
    Attributes
    ----------
    layer_size : int
        the number of (abcnn1)
    distance : function
        cosine similarity or manhattan
    abcnn : list of abcnn1
    conv : list of convolution layer
    wp : list of w-ap pooling layer
    ap : list of pooling layer
    fc : last linear layer(in paper use logistic regression)
    '''

    def __init__(self, vocab, emb_dim, sentence_length, filter_width, filter_channel=50, layer_size=2, match='cosine', inception=True,  pretrain_embed=torch.tensor([]), ):
        super(Abcnn1, self).__init__()
        self.layer_size = layer_size
        if match == 'cosine':
            self.distance = cosine_similarity
        else:
            self.distance = manhattan_distance
        self.word_embed = nn.Embedding(len(vocab), emb_dim, padding_idx=1)
        if len(pretrain_embed) != 0:
            self.word_embed.weight.data.copy_(pretrain_embed)
        self.lstm_embed = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.abcnn = nn.ModuleList()
        self.conv = nn.ModuleList()
        self.ap = nn.ModuleList([ApLayer(sentence_length, emb_dim)])
        self.wp = nn.ModuleList()
        self.fc = nn.Linear(layer_size+1, 2)
        torch.manual_seed(2018)
        for i in range(layer_size):
            self.abcnn.append(Abcnn1Portion(sentence_length, emb_dim if i == 0 else filter_channel))
            self.conv.append(ConvLayer(False, sentence_length, filter_width, emb_dim if i == 0 else filter_channel, filter_channel, inception))
            self.ap.append(ApLayer(sentence_length + filter_width - 1, filter_channel))
            self.wp.append(WpLayer(sentence_length, filter_width, False))

    def forward(self, x1, x2):
        '''
        1. stack sentence vector similarity
        2. for layer_size
            abcnn1
            convolution
            stack sentence vector similarity
            W-ap for next loop x1, x2
        
        3. concatenate similarity list
            size (batch_size, layer_size + 1)
        4. Linear layer
            size (batch_size, 1)
        Parameters
        ----------
        x1, x2 : 4-D torch Tensor
            size (batch_size, 1, width, emb_dim)
        Returns
        -------
        output : 2-D torch Tensor
            size (batch_size, 1)
        '''
        x1 = self.word_embed(x1)
        x2 = self.word_embed(x2)
        x1.unsqueeze_(1)
        x2.unsqueeze_(1)
        sim = []
        sim.append(self.distance(self.ap[0](x1), self.ap[0](x2)))

        for i in range(self.layer_size):
            x1, x2 = self.abcnn[i](x1, x2)
#             print('ap', self.conv[i](x1))
#             print('size', self.conv[i](x1).size())
            x1 = self.conv[i](x1)
            x2 = self.conv[i](x2)
            sim.append(self.distance(self.ap[i+1](x1), self.ap[i+1](x2)))
#             print('ap',  self.wp[i](x1))
#             print('size',  self.wp[i](x1).size())
            x1 = self.wp[i](x1)
            x2 = self.wp[i](x2)
        sim_fc = torch.cat(sim, dim=1)
        output = self.fc(sim_fc)
        return F.log_softmax(output)

In [15]:
model = Abcnn1(TEXT.vocab.stoi, EMBED_DIM, STR_MAXLEN, 3,50, 2, 'cosine', False)
model.to(DEVICE)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = StepLR(optimizer, step_size=DECAY_STEP, gamma=DECAY_GAMMA)

NameError: name 'embed_dim' is not defined

In [37]:
print_every = 50
best_state = None
max_metric = 0
def predict_on(model, data_dl, loss_func, device, model_state_path=None):
#     if model_state_path:
#         model.load_state_dict(torch.load(model_state_path))
#         print('Start predicting...')
    model.eval()
    res_list = []
    label_list = []
    loss = 0
    for text1, text2, label in data_dl:
        text1 = pad_seq(text1, STR_MAXLEN)
        text2 = pad_seq(text2, STR_MAXLEN)
        y_pred = model(text1, text2)
        loss += loss_func(y_pred, label).data.cpu()
        y_pred = y_pred.data.max(1)[1].cpu().numpy()
        res_list.extend(y_pred)
        label_list.extend(label.data.cpu().numpy())
    acc = accuracy_score(res_list, label_list)
    Precision = precision_score(res_list, label_list)
    Recall = recall_score(res_list, label_list)
    F1 = f1_score(res_list, label_list)
    return loss, (acc, Precision, Recall, F1)

def evaluate(model, txt1, txt2, y):
    pred = model(txt1, txt2)
    F1 = f1_score(pred.data.max(1)[1].cpu(), y)
    return F1

def training_termination(valid_result):
    if len(valid_result) >= 4:
        if valid_result[-1] < valid_result[-2] and \
            valid_result[-2] < valid_result[-3] and \
            valid_result[-3] < valid_result[-4]:
            return True
    return False


print('start train...')
valid_iter.create_batches()
valid_batch_num = len(list(valid_iter.batches))
train_iter.create_batches()
batch_num = len(list(train_iter.batches))
valid_result = []
print('batch number ', batch_num)
for epoch in range(EPOCHES):
    for param_group in optimizer.param_groups:
        print('learning rate: %.6f'% param_group['lr'])
    epoch_begin = time()
    total_loss = 0.0
    train_iter.init_epoch()
    batch_count = 0
    batch_begin_time = time()
    for text1, text2, label in train_dl:
        text1 = pad_seq(text1, STR_MAXLEN)
        text2 = pad_seq(text2, STR_MAXLEN)
        y_pred = model(text1, text2)
#         print(text1)
        loss = criterion(y_pred, label)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batch_count += 1
        if batch_count % print_every == 0:
            metric = evaluate(model.eval(), text1, text2, label)
            print('[%d %d] loss: %.6f metric: %.6f time: %.1f s' %
                  (epoch + 1, batch_count, total_loss / print_every, metric, time() - batch_begin_time))
            total_loss = 0.0
            batch_begin_time = time()
    scheduler.step()
    print("Evaluating....")
    loss, (acc, Precision, Recall, F1) = predict_on(model, valid_dl, criterion, DEVICE)
    valid_result.append(F1)
    print_flush('*'*50)
    print_flush('[epoch %d]. loss: %.6f acc: %.6f f1: %.6f time: %.1f s'%(epoch+1, loss/valid_batch_num, acc, F1, time()-epoch_begin))
    print_flush('*'*50)
    if F1 > max_metric:
        best_state = model.state_dict()
        max_metric = F1
        print_flush("save model...")
#         torch.save(best_state, './datasets/baseline_LSTM.pth')
    epoch_begin = time()
    if training_termination(valid_result):
        print_flush("early stop at [%d] epoch!" % (epoch+1))
        break

start train...
batch number  337
learning rate: 0.000676




[1 50] loss: 0.127723 metric: 0.909091 time: 1.4 s
[1 100] loss: 0.125935 metric: 0.910891 time: 1.4 s
[1 150] loss: 0.117511 metric: 0.948454 time: 1.4 s
[1 200] loss: 0.119179 metric: 0.894118 time: 1.4 s
[1 250] loss: 0.124890 metric: 0.883117 time: 1.4 s
[1 300] loss: 0.124726 metric: 0.919540 time: 1.4 s
Evaluating....
**************************************************
[epoch 1]. loss: 0.614114 acc: 0.773983 f1: 0.394246 time: 10.1 s
**************************************************
save model...
learning rate: 0.000669


KeyboardInterrupt: 

In [85]:
b.size()

torch.Size([1, 2, 1, 3])

In [130]:
?nn.AvgPool2d

In [128]:
a = torch.randn(1, 1, 3, 5)
avg = nn.AvgPool2d((3, 1), stride=1)
a

tensor([[[[ 2.1606, -1.2174,  0.3667,  1.2651,  1.2977],
          [ 0.4916, -0.0915, -0.3557,  1.4138, -1.3672],
          [ 1.2788, -2.1588,  1.0595,  1.6671, -0.6631]]]])

In [129]:
b = avg(a)
b

tensor([[[[ 1.3103, -1.1559,  0.3569,  1.4487, -0.2442]]]])