In [40]:
import sys
import jieba
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from torchtext import data
from time import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# hyper parameters
# Maximum sequence length for pad_seq; only referenced by commented-out calls in this notebook.
STR_MAXLEN = 30
# Batch size used by both iterators and to size the identity matrix of the attention penalty.
BATCH_SIZE = 256
# First CUDA device when available, otherwise CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Width of the word embeddings (also the LSTM input size).
EMBED_DIM = 200
# LSTM hidden size per direction; the LSTM is bidirectional, so its outputs are 2 * HIDDEN_DIM wide.
HIDDEN_DIM = 100
# Output width of the first attention projection (ws1).
ATTEN_UNIT = 350
# Output width of the second attention projection (ws2), i.e. number of attention rows per sequence.
ATTEN_HOPS = 4
# Only DEEP_LAYERS[0] is read: the width of the hidden linear layer of the classifier head.
DEEP_LAYERS = [200]
# Learning rate passed to Adam.
LEARNING_RATE = 1e-3
# Maximum number of training epochs (early stopping may end training sooner).
EPOCHES = 20
# StepLR schedule: multiply the learning rate by DECAY_GAMMA every DECAY_STEP scheduler steps.
DECAY_STEP = 2
DECAY_GAMMA = 0.99
# Per-class weights handed to NLLLoss (index 0 and 1).
# NOTE(review): presumably derived from class frequencies of the training set -- confirm.
CLASS_WEIGHT = [0.6116, 2.7397]
def print_flush(data, args=None):
    """Print ``data`` (followed by ``args`` when given) and flush stdout at once.

    Args:
        data: first object to print.
        args: optional second object, printed after ``data`` separated by a space.
            Only ``None`` itself means "absent"; the identity check avoids calling
            ``__eq__`` on array-like or custom objects.
    """
    if args is None:
        print(data, flush=True)
    else:
        print(data, args, flush=True)
    
def pad_seq(seq, max_length, pad_value=1):
    """Truncate or right-pad a 2-D batch of token ids to exactly ``max_length`` columns.

    Args:
        seq: tensor indexed as ``[row, position]``.
        max_length: number of columns of the result.
        pad_value: id written into the padded positions (default 1, the value the
            original implementation hard-coded via ``torch.ones``).

    Returns:
        ``seq[:, :max_length]`` when the input is already long enough, otherwise
        ``seq`` with ``pad_value`` columns appended on the right.
    """
    length = seq.size(1)
    if length >= max_length:
        return seq[:, :max_length]
    # Build the padding on the same device as the input so the concat cannot hit a
    # device mismatch, whatever the global device setting is.
    padding = torch.full((seq.size(0), max_length - length), pad_value,
                         dtype=torch.long, device=seq.device)
    return torch.cat([seq, padding], dim=1)

def Frobenius(mat):
    """Return the batch-averaged Frobenius norm of a stack of matrices.

    Args:
        mat: 3-D tensor; the first dimension indexes the matrices.

    Returns:
        Scalar tensor: sum over the batch of ``sqrt(sum(m ** 2) + 1e-10)`` divided by
        the batch size.

    Raises:
        Exception: if ``mat`` does not have exactly three dimensions.
    """
    dims = mat.size()
    if len(dims) != 3:
        raise Exception('matrix for computing Frobenius norm should be with 3 dims')
    # Sum of squares per matrix; the epsilon keeps the square root away from exactly 0.
    squared = mat.pow(2).sum(2).sum(1).squeeze()
    norms = (squared + 1e-10).pow(0.5)
    return norms.sum() / dims[0]
        
def getIdentityMatrix(batch_size, attention_hops):
    """Return a ``[batch_size, attention_hops, attention_hops]`` stack of identity matrices.

    The tensor is created on ``DEVICE``. ``repeat`` (rather than ``expand``) is used so
    every matrix owns its memory, as with the original element-by-element construction.
    """
    eye = torch.eye(attention_hops, device=DEVICE)
    return eye.unsqueeze(0).repeat(batch_size, 1, 1)

In [4]:
# prepare data

def tokenizer(text):
    """Tokenize by splitting ``text`` into its individual elements (characters for a str)."""
    return list(text)
class BatchWrapper:
    """Adapt a batch iterable so each step yields selected attributes of the batch.

    Args:
        dl: iterable of batch objects that also supports ``len()``.
        iter_columns: attribute names to read from every batch, in the order they
            should be yielded (here: the inputs x and the target y).
    """

    def __init__(self, dl, iter_columns):
        # we pass in the list of attributes for x and y
        self.dl, self.iter_columns = dl, iter_columns

    def __iter__(self):
        for batch in self.dl:
            # Yield a tuple, not a lazy generator: a generator would look up `batch`
            # only when consumed, so items collected first and read later would all
            # resolve to the last batch.
            yield tuple(getattr(batch, attr) for attr in self.iter_columns)

    def __len__(self):
        return len(self.dl)
    
# Build the torchtext fields, load the CSV splits and construct the vocabulary.
print_flush('process raw data...')
# Text field shared by both sentences: tokenized with `tokenizer`, wrapped in <BOS>/<EOS>,
# padded with <PAD>, and batched with the batch dimension first.
TEXT = data.Field(sequential=True, use_vocab=True, eos_token='<EOS>', init_token='<BOS>',pad_token='<PAD>', 
                  batch_first=True, tokenize=tokenizer)
# Label field: not sequential and used without a vocabulary.
# NOTE(review): presumably the label column already holds integer class ids -- confirm.
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

# Column layout of the CSV files, in file order.
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("txt1", TEXT), ("txt2", TEXT),
                 ("label", LABEL)]

# Both splits are read from CSV with the header row skipped.
train = data.TabularDataset(path='../datasets/train.csv', format='csv', skip_header=True, fields=tv_datafields)
valid = data.TabularDataset(path='../datasets/valid.csv', format='csv', skip_header=True, fields=tv_datafields)

# NOTE(review): the vocabulary is built from train AND valid, with a minimum token
# frequency of 3 -- confirm that including the validation split is intended.
TEXT.build_vocab(train, valid, min_freq=3)
print_flush('Building vocabulary Finished.')

process raw data...
Building vocabulary Finished.


In [5]:
# Number of entries in the vocabulary's string-to-index mapping.
len(TEXT.vocab.stoi)

1517

In [6]:
# Training batches: shuffled, with the combined length of both texts as the sort key.
# NOTE(review): presumably so that BucketIterator groups examples of similar length -- confirm.
train_iter = data.BucketIterator(dataset=train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.txt1) + len(x.txt2), shuffle=True, device=DEVICE, repeat=False)
# Validation batches: fixed order (no shuffling), single pass.
valid_iter = data.Iterator(dataset=valid, batch_size=BATCH_SIZE, device=DEVICE, shuffle=False, repeat=False)

# Wrap the iterators so every step yields (txt1, txt2, label).
train_dl = BatchWrapper(train_iter, ["txt1", "txt2", "label"])
valid_dl = BatchWrapper(valid_iter, ["txt1", "txt2", "label"])

print_flush('prepare data done!')

prepare data done!


In [50]:
class SelfAttentiveEncoder(nn.Module):
    """Multi-hop self-attention over a sequence of hidden states.

    For an input ``H`` of shape ``[bsz, seq_len, 2 * hidden_dim]`` the module computes
    ``A = softmax(ws2(tanh(ws1(H))))`` with the softmax taken over ``seq_len`` and
    returns ``(A @ H, A)``.

    Args:
        hidden_dim: half of the input feature size (``ws1`` expects ``2 * hidden_dim``).
        atten_unit: width of the intermediate projection ``ws1``.
        atten_hops: number of attention rows produced per sequence.

    NOTE(review): no mask is applied, so every time step passed in (including any
    padding positions) receives attention weight.
    """

    def __init__(self, hidden_dim, atten_unit, atten_hops):
        super(SelfAttentiveEncoder, self).__init__()
        self.ws1 = nn.Linear(hidden_dim * 2, atten_unit, bias=False)
        self.ws2 = nn.Linear(atten_unit, atten_hops, bias=False)
        self.tanh = nn.Tanh()
        # Explicit dim: the argument-less nn.Softmax() relies on a deprecated implicit
        # dimension and warns on every call. The last dim is seq_len in forward().
        self.softmax = nn.Softmax(dim=-1)
        self.attention_hops = atten_hops

    def init_weights(self, init_range=0.1):
        """Re-initialise both projections uniformly in ``[-init_range, init_range]``."""
        self.ws1.weight.data.uniform_(-init_range, init_range)
        self.ws2.weight.data.uniform_(-init_range, init_range)

    def forward(self, hidden_txt1):
        """Attend over ``hidden_txt1``.

        Args:
            hidden_txt1: tensor of shape ``[bsz, seq_len, 2 * hidden_dim]``.

        Returns:
            Tuple ``(context, alphas)`` where ``alphas`` is ``[bsz, hops, seq_len]``
            (each row sums to 1) and ``context = bmm(alphas, hidden_txt1)`` is
            ``[bsz, hops, 2 * hidden_dim]``.
        """
        bsz, seq_len, feat_dim = hidden_txt1.size()
        flat_hidden = hidden_txt1.contiguous().view(-1, feat_dim)  # [bsz*seq_len, feat_dim]
        hbar = self.tanh(self.ws1(flat_hidden))                    # [bsz*seq_len, atten_unit]
        scores = self.ws2(hbar).view(bsz, seq_len, -1)             # [bsz, seq_len, hops]
        scores = torch.transpose(scores, 1, 2).contiguous()        # [bsz, hops, seq_len]
        alphas = self.softmax(scores)                              # normalise over seq_len
        return torch.bmm(alphas, hidden_txt1), alphas

#     def init_hidden(self, bsz):
#         return self.bilstm.init_hidden(bsz)


In [51]:
class baseline_LSTM_self_attention(nn.Module):
    """Sentence-pair classifier: shared embedding + BiLSTM, per-sentence self-attention.

    Both sentences go through the same embedding and bidirectional LSTM. Each then
    gets its own ``SelfAttentiveEncoder``, whose hop outputs are max-pooled over the
    hop dimension. The two sentence vectors ``u`` and ``v`` are combined as
    ``[u, v, |u - v|, u * v]`` and classified into two classes.

    Args:
        vocab: mapping from token to index; its length sizes the embedding table and
            ``vocab['<PAD>']`` is used as the padding index.
        embed_dim: embedding width.
        hidden_dim: LSTM hidden size per direction.
        atten_unit: intermediate width of the attention modules.
        atten_hops: number of attention hops.
        deep_layers: sequence whose first element is the hidden width of the head.
    """

    def __init__(self, vocab, embed_dim, hidden_dim, atten_unit, atten_hops, deep_layers):
        super(baseline_LSTM_self_attention, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.deep_layers = deep_layers
        self.atten_unit = atten_unit
        self.atten_hops = atten_hops
        self.word_embed = nn.Embedding(len(vocab), embed_dim, padding_idx=vocab['<PAD>'])
        self.lstm_embed = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.self_attention1 = SelfAttentiveEncoder(hidden_dim, atten_unit, atten_hops)
        self.self_attention2 = SelfAttentiveEncoder(hidden_dim, atten_unit, atten_hops)
        # Four feature groups, each 2 * hidden_dim wide (BiLSTM output size).
        self.deep_layer_1 = nn.Linear(4 * hidden_dim * 2, deep_layers[0])
        self.deep_out = nn.Linear(deep_layers[0], 2)

    def forward(self, txt1, txt2, hidden=None):
        """Score a batch of sentence pairs.

        Args:
            txt1: token ids of the first sentences, ``[bsz, len1]``.
            txt2: token ids of the second sentences, ``[bsz, len2]``.
            hidden: optional initial LSTM state, passed to both LSTM calls.

        Returns:
            Tuple ``(log_probs, mat1, mat2)``: ``log_probs`` is ``[bsz, 2]`` and
            ``mat1`` / ``mat2`` are the attention matrices of the two sentences.
        """
        embed_txt1 = self.word_embed(txt1)
        embed_txt2 = self.word_embed(txt2)
        lstm_txt1 = self.lstm_embed(embed_txt1, hidden)[0]
        lstm_txt2 = self.lstm_embed(embed_txt2, hidden)[0]
        atten_txt1, mat1 = self.self_attention1(lstm_txt1)  # [bsz, hops, 2*hidden]
        atten_txt2, mat2 = self.self_attention2(lstm_txt2)

        # Max over the hop dimension. This equals max-pooling with a (hops, 1) kernel,
        # but keeps the batch dimension when bsz == 1; an argument-less squeeze() would
        # drop it and break the concatenation below.
        atten_txt1 = atten_txt1.max(dim=1)[0]  # [bsz, 2*hidden]
        atten_txt2 = atten_txt2.max(dim=1)[0]

        txt_substract = torch.abs(atten_txt1 - atten_txt2)
        txt_multiply = torch.mul(atten_txt1, atten_txt2)
        features = torch.cat([atten_txt1, atten_txt2, txt_substract, txt_multiply], 1)
        # NOTE(review): the two linear layers are applied back-to-back with no
        # activation in between -- confirm this is intended.
        output = self.deep_layer_1(features)
        output = self.deep_out(output)
        # Explicit dim avoids the deprecated implicit-dimension behaviour (class axis).
        return F.log_softmax(output, dim=1), mat1, mat2

In [52]:
# Instantiate the pair classifier; the token-to-index mapping supplies the vocabulary size and pad index.
model = baseline_LSTM_self_attention(TEXT.vocab.stoi, EMBED_DIM, HIDDEN_DIM, ATTEN_UNIT, ATTEN_HOPS, DEEP_LAYERS)
model.to(DEVICE)
# Class-weighted negative log-likelihood; the model's forward() returns log-probabilities.
criterion = nn.NLLLoss(weight=torch.tensor(CLASS_WEIGHT).float().to(DEVICE))
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Learning rate is multiplied by DECAY_GAMMA every DECAY_STEP calls to scheduler.step().
scheduler = StepLR(optimizer, step_size=DECAY_STEP, gamma=DECAY_GAMMA)

In [53]:
# Report the running training loss and a batch-level F1 every `print_every` batches.
print_every = 50
# State dict of the epoch with the best validation F1 so far (None until the first improvement).
best_state = None
# Best validation F1 seen so far.
max_metric = 0
# Identity matrices for the attention penalty, sized for a full batch; the training
# loop slices this to the actual batch size.
I = getIdentityMatrix(BATCH_SIZE, ATTEN_HOPS)
# model.to(DEVICE)
def predict(model, data_dl, loss_func, device):
    """Run ``model`` over ``data_dl`` and compute the summed loss and binary metrics.

    Args:
        model: module called as ``model(text1, text2)`` returning a tuple whose first
            element holds per-class scores of shape ``[bsz, n_classes]``.
        data_dl: iterable yielding ``(text1, text2, label)`` batches.
        loss_func: callable ``loss_func(y_pred, label)`` returning a scalar tensor.
        device: unused; kept for backward compatibility with existing callers.

    Returns:
        ``(loss, (acc, Precision, Recall, F1))`` where ``loss`` is the float sum of the
        per-batch losses (divide by the number of batches for a mean).
    """
    model.eval()
    res_list = []
    label_list = []
    loss = 0.0
    # No gradients are needed for evaluation; this avoids building autograd graphs.
    with torch.no_grad():
        for text1, text2, label in data_dl:
            y_pred, _, _ = model(text1, text2)
            loss += loss_func(y_pred, label).item()
            res_list.extend(y_pred.max(1)[1].cpu().numpy())
            label_list.extend(label.cpu().numpy())
    # sklearn metrics take (y_true, y_pred); passing them reversed swaps precision
    # and recall.
    acc = accuracy_score(label_list, res_list)
    Precision = precision_score(label_list, res_list)
    Recall = recall_score(label_list, res_list)
    F1 = f1_score(label_list, res_list)

    return loss, (acc, Precision, Recall, F1)
        
def evaluate(model, txt1, txt2, y):
    """Return the binary F1 score of ``model`` on a single batch.

    Args:
        model: module called as ``model(txt1, txt2)`` returning a tuple whose first
            element holds per-class scores of shape ``[bsz, n_classes]``. The caller
            is responsible for putting it in eval mode.
        txt1, txt2: inputs forwarded to the model.
        y: ground-truth labels, either a tensor (any device) or an array-like.

    Returns:
        F1 score as computed by ``f1_score(y_true, y_pred)``.
    """
    with torch.no_grad():
        pred, _, _ = model(txt1, txt2)
    out_batch = pred.max(1)[1].cpu().numpy()
    # Move labels to host memory before handing them to sklearn, which works on
    # array-likes and cannot read a tensor that lives on an accelerator.
    y_true = y.detach().cpu().numpy() if torch.is_tensor(y) else y
    return f1_score(y_true, out_batch)

def training_termination(valid_result):
    """Early-stopping test: True when the last four scores are strictly decreasing.

    Args:
        valid_result: sequence of validation scores, oldest first.

    Returns:
        ``True`` if there are at least four scores and each of the last three is
        strictly lower than the one before it; ``False`` otherwise.
    """
    if len(valid_result) < 4:
        return False
    recent = valid_result[-4:]
    return all(newer < older for older, newer in zip(recent, recent[1:]))

# Count the batches of each split once, for loss averaging and logging.
valid_iter.create_batches()
valid_batch_num = len(list(valid_iter.batches))
print_flush('start train...')
train_iter.create_batches()
batch_num = len(list(train_iter.batches))
print_flush('batch number %d '%batch_num)

# Validation F1 per epoch, consumed by the early-stopping check.
valid_result = []
for epoch in range(EPOCHES):
    for param_group in optimizer.param_groups:
        print_flush('learning rate: %.6f'% param_group['lr'])
    epoch_begin = time()
    total_loss = 0.0
    train_iter.init_epoch()
    batch_count = 0
    batch_begin_time = time()
    for text1, text2, label in train_dl:
        # Re-enter train mode every step: the periodic evaluation below switches the
        # model to eval mode.
        model.train()
        y_pred, attention1, attention2 = model(text1, text2)

        # Attention penalty ||A A^T - I||_F for each sentence. I is sliced because the
        # last batch may be smaller than BATCH_SIZE.
        # NOTE(review): presumably meant to push the hops towards different positions -- confirm.
        attention1T = torch.transpose(attention1, 1, 2).contiguous()
        attention2T = torch.transpose(attention2, 1, 2).contiguous()
        extra_loss1 = Frobenius(torch.bmm(attention1, attention1T) - I[:attention1.size(0)])
        extra_loss2 = Frobenius(torch.bmm(attention2, attention2T) - I[:attention2.size(0)])
        loss = criterion(y_pred, label) + extra_loss1 + extra_loss2

        model.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batch_count += 1
        if batch_count % print_every == 0:
            # Quick F1 on the batch that was just trained on (not a held-out estimate).
            metric = evaluate(model.eval(), text1, text2, label)
            print_flush('[%d %d] loss: %.6f metric: %.6f time: %.1f s' %
                  (epoch + 1, batch_count, total_loss / print_every, metric, time() - batch_begin_time))
            total_loss = 0.0
            batch_begin_time = time()
    scheduler.step()

    print_flush("Evaluating....")
    loss, (acc, Precision, Recall, F1) = predict(model, valid_dl, criterion, DEVICE)
    valid_result.append(F1)
    print_flush('*'*50)
    print_flush('[epoch %d]. loss: %.6f acc: %.6f f1: %.6f time: %.1f s'%(epoch+1, loss/valid_batch_num, acc, F1, time()-epoch_begin))
    print_flush('*'*50)
    if F1 > max_metric:
        # state_dict() hands back references to the live parameters, so clone every
        # tensor; otherwise later optimizer steps silently overwrite the "best" weights.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        max_metric = F1
        # The message is printed, but writing to disk is currently disabled (see below).
        print_flush("save model...")
#         torch.save(best_state, '../datasets/models/baseline_LSTM.pth')
    if training_termination(valid_result):
        print_flush("early stop at [%d] epoch!" % (epoch+1))
        break

start train...
batch number 337 
learning rate: 0.001000




[1 50] loss: 3.474555 metric: 0.336634 time: 1.6 s
[1 100] loss: 2.637081 metric: 0.268293 time: 1.3 s
[1 150] loss: 2.137220 metric: 0.355556 time: 1.4 s
[1 200] loss: 0.764659 metric: 0.322917 time: 1.4 s
[1 250] loss: 0.694769 metric: 0.363636 time: 1.5 s
[1 300] loss: 0.671105 metric: 0.470588 time: 1.4 s
Evaluating....
**************************************************
[epoch 1]. loss: 0.608093 acc: 0.658718 f1: 0.420704 time: 10.6 s
**************************************************
save model...
learning rate: 0.001000
[2 50] loss: 0.639672 metric: 0.444444 time: 1.5 s
[2 100] loss: 0.609722 metric: 0.376623 time: 1.4 s
[2 150] loss: 0.602120 metric: 0.535032 time: 1.5 s
[2 200] loss: 0.591416 metric: 0.426230 time: 1.4 s
[2 250] loss: 0.589789 metric: 0.437500 time: 1.4 s
[2 300] loss: 0.584310 metric: 0.419355 time: 1.4 s
Evaluating....
**************************************************
[epoch 2]. loss: 0.581097 acc: 0.742087 f1: 0.471044 time: 10.6 s
*************************

In [None]:
# Scratch cell: small bidirectional LSTM (input size 5, hidden size 5) for shape experiments.
lstm = nn.LSTM(5, 5, batch_first=True, bidirectional=True)

In [84]:
# Random input with batch 1, sequence length 2, feature size 5 (batch_first layout).
inputs = torch.randn(1, 2, 5)
# `out` is the tuple returned by nn.LSTM; out[0] holds the per-step outputs.
out = lstm(inputs)

In [None]:
# NOTE(review): nn.AvgPool2d requires a kernel_size argument, so this call raises a
# TypeError as written. The variable is named max_pool but the layer is average
# pooling -- confirm which was intended.
max_pool = nn.AvgPool2d()

In [85]:
# unsqueeze_ is the in-place variant: it inserts a size-1 dimension at index 1 and
# also changes the tensor held in out[0].
output = out[0].unsqueeze_(1)

In [86]:
# Inspect the shape after inserting the extra dimension.
output.size()

torch.Size([1, 1, 2, 10])