## process

In [None]:
# -*- coding: utf-8 -*-
import pickle
import json
import jieba
from tqdm import tqdm

def seg_line(line):
    """Segment ``line`` with jieba and return the resulting tokens as a list."""
    tokens = jieba.cut(line)
    return [token for token in tokens]


def seg_data(path):
    """Read a JSON-lines dataset file and word-segment every record.

    Each non-blank line of ``path`` is parsed as one JSON object that must
    provide the keys ``query``, ``passage``, ``alternatives`` and
    ``query_id``.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        A list with one entry per record, each of the form
        ``[question_tokens, passage_tokens, alternatives, query_id]`` where
        the first two items come from ``seg_line`` and ``alternatives`` is
        the raw ``alternatives`` string split on ``'|'``.
    """
    print('start process ', path)
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is what the original ``encoding='utf-8'`` hint intended.
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # ``json.loads`` no longer accepts an ``encoding`` keyword
            # (removed in Python 3.9); the text is already decoded here.
            dic = json.loads(line)
            question = dic['query']
            doc = dic['passage']
            alternatives = dic['alternatives']
            data.append([seg_line(question), seg_line(doc), alternatives.split('|'), dic['query_id']])
    return data


def build_word_count(data):
    """Count token occurrences over the first three fields of every record.

    Args:
        data: Iterable of records; ``record[0:3]`` must each be an iterable
            of tokens (as produced by ``seg_data``: question tokens, passage
            tokens and the list of alternatives).

    Returns:
        A plain ``dict`` mapping each token to its number of occurrences.
        Keys appear in first-seen order.
    """
    from collections import Counter

    counts = Counter()
    for record in data:
        for field in record[0:3]:
            counts.update(field)
    print('word type size ', len(counts))
    # Return a plain dict so pickled output keeps the same type as before.
    return dict(counts)


def build_word2id(wordCount, threshold=10):
    """Assign integer ids to frequent words and to the characters of rare ones.

    Ids 0 and 1 are reserved for ``'<PAD>'`` and ``'<UNK>'``. A word whose
    count is at least ``threshold`` gets its own id; any other word is broken
    into characters and each unseen character gets an id instead.

    Args:
        wordCount: Mapping from word to occurrence count.
        threshold: Minimum count for a word to be kept whole.

    Returns:
        The ``dict`` mapping tokens (words or single characters) to ids.
    """
    word2id = {'<PAD>': 0, '<UNK>': 1}
    for word, count in wordCount.items():
        # Frequent words are registered whole, rare words character by character.
        tokens = [word] if count >= threshold else list(word)
        for token in tokens:
            if token not in word2id:
                word2id[token] = len(word2id)
    print('processed word size ', len(word2id))
    return word2id




In [None]:
def transform_data_to_id(raw_data, word2id):
    """Convert segmented records into id sequences using ``word2id``.

    A word found in ``word2id`` maps to its single id. Any other word is
    mapped character by character, with unknown characters replaced by the
    ``'<UNK>'`` id. The candidates of one record are right-padded with the
    ``'<PAD>'`` id so they all share the length of the longest candidate.

    Args:
        raw_data: Iterable of records ``[question_tokens, passage_tokens,
            candidate_words, ..., last_item]``; the last item is copied
            through unchanged.
        word2id: Mapping from token to id. ``'<PAD>'`` and ``'<UNK>'`` are
            looked up in it and default to 0 and 1 when absent.

    Returns:
        A list of ``[question_ids, passage_ids, candidate_ids, last_item]``.
    """
    pad_id = word2id.get('<PAD>', 0)
    unk_id = word2id.get('<UNK>', 1)

    def map_word_to_id(word):
        if word in word2id:
            return [word2id[word]]
        # Out-of-vocabulary word: fall back to its characters.
        return [word2id.get(char, unk_id) for char in word]

    def map_sent_to_id(sent):
        return [idx for word in sent for idx in map_word_to_id(word)]

    data = []
    for one in raw_data:
        question = map_sent_to_id(one[0])
        doc = map_sent_to_id(one[1])
        candidates = [map_word_to_id(x) for x in one[2]]
        # Always pad to the longest candidate. The former ``max_length > 1``
        # guard left candidates ragged when one of them mapped to no ids
        # (e.g. lengths [1, 0, 1]); ``default=0`` covers an empty list.
        max_length = max((len(x) for x in candidates), default=0)
        candidates = [x + [pad_id] * (max_length - len(x)) for x in candidates]
        data.append([question, doc, candidates, one[-1]])
    return data


def process_data(data_path, threshold):
    """Preprocess the validation, training and test-A files under ``data_path``.

    The three source files are handed to ``_process_data`` together with the
    pickle paths they are written to: validation set -> ``dev.pickle``,
    training set -> ``train.pickle``, test-A set -> ``testa.pickle``.

    Args:
        data_path: Directory containing the dataset folders; the pickles are
            written into the same directory.
        threshold: Minimum word count, forwarded to ``_process_data``.

    Returns:
        Whatever ``_process_data`` returns (the size of the vocabulary).
    """
    import os

    # ``os.path.join`` keeps working when ``data_path`` has no trailing
    # separator; with one, the result equals plain concatenation.
    # The variables are named after the file they actually hold (they were
    # previously swapped), in the same order as ``output_path`` below.
    validation_file_path = os.path.join(
        data_path, 'ai_challenger_oqmrc_validationset_20180816/ai_challenger_oqmrc_validationset.json')
    training_file_path = os.path.join(
        data_path, 'ai_challenger_oqmrc_trainingset_20180816/ai_challenger_oqmrc_trainingset.json')
    test_a_file_path = os.path.join(
        data_path, 'ai_challenger_oqmrc_testa_20180816/ai_challenger_oqmrc_testa.json')
    # test_b_file_path = data_path + 'ai_challenger_oqmrc_testb_20180816/ai_challenger_oqmrc_testb.json'
    path_lst = [validation_file_path, training_file_path, test_a_file_path]
    output_path = [os.path.join(data_path, x) for x in ['dev.pickle', 'train.pickle', 'testa.pickle']]
    return _process_data(path_lst, threshold, output_path)


def _process_data(path_lst, word_min_count=5, output_file_path=None):
    """Segment the given files, build the vocabulary and pickle id-encoded data.

    Side effects: writes the word counts to ``../inputs//word-count.obj``,
    the vocabulary to ``../inputs//word2id.obj`` and one pickle per
    ``(input, output)`` pair.

    Args:
        path_lst: Paths of the JSON-lines files to process.
        word_min_count: Minimum count for a word to get its own id.
        output_file_path: Output pickle paths, paired positionally with
            ``path_lst``. ``None`` (the default) means no data pickles are
            written. Pairs are formed with ``zip``, so surplus entries on
            either side are ignored.

    Returns:
        The number of entries in the vocabulary.
    """
    # Avoid a mutable default argument shared between calls.
    if output_file_path is None:
        output_file_path = []
    raw_data = [seg_data(path) for path in path_lst]
    # The vocabulary is built over all files together.
    word_count = build_word_count([y for x in raw_data for y in x])
    with open('../inputs//word-count.obj', 'wb') as f:
        pickle.dump(word_count, f)
    word2id = build_word2id(word_count, word_min_count)
    with open('../inputs//word2id.obj', 'wb') as f:
        pickle.dump(word2id, f)
    for one_raw_data, one_output_file_path in zip(raw_data, output_file_path):
        # Transform first so a failure does not leave a truncated output file.
        one_data = transform_data_to_id(one_raw_data, word2id)
        with open(one_output_file_path, 'wb') as f:
            pickle.dump(one_data, f)
    return len(word2id)

In [None]:
 # Run the preprocessing on the local dataset directory with a threshold of 5.
 process_data("/home/xq/data/aichallenger/Opinion Questions Machine Reading Comprehension/", 5)

In [1]:
import numpy as np
import os

def pad_answer(batch):
    """Right-pad every candidate in ``batch`` with zeros to a common length.

    Args:
        batch: List of samples, each a list of candidate id lists.

    Returns:
        A new nested list with the same structure in which every candidate
        has the length of the longest candidate in the whole batch. An empty
        batch yields an empty list.
    """
    # Use the longest candidate overall rather than only each sample's first
    # candidate, so the result is rectangular even if a sample is ragged.
    max_length = max((len(candidate) for sample in batch for candidate in sample), default=0)
    return [
        [candidate + [0] * (max_length - len(candidate)) for candidate in sample]
        for sample in batch
    ]


def get_model_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Only parameters with ``requires_grad`` set are counted.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    return sum(param.numel() for param in trainable)


def padding(sequence, pads=0, max_len=None, dtype='int32', return_matrix_for_size=False):
    """Pad or truncate a list of sequences into a rectangular array.

    Only nested input (a sequence of sequences) is supported. The previous
    branch for flat input was unreachable and has been removed.

    Args:
        sequence: List of id sequences.
        pads: Fill value for positions beyond a sequence's length.
        max_len: Upper bound on the output width. The width is the longest
            sequence length, capped at ``max_len`` when given; longer
            sequences are truncated.
        dtype: dtype of the returned padded array (and of the mask).
        return_matrix_for_size: If true, return a 0/1 mask instead of the
            length vector.

    Returns:
        ``(x, lengths)`` where ``x`` has shape ``(len(sequence), width)`` and
        ``lengths`` is an int32 array of the (clipped) sequence lengths, or
        ``(x, mask)`` when ``return_matrix_for_size`` is true, with
        ``mask[i, j] == 1`` exactly when ``j < lengths[i]``.
    """
    v_length = [len(x) for x in sequence]  # every sequence length
    # An empty batch yields a (0, 0) array rather than failing in max().
    seq_max_len = max(v_length) if v_length else 0
    if (max_len is None) or (max_len > seq_max_len):
        max_len = seq_max_len
    v_length = [min(length, max_len) for length in v_length]
    x = np.full((len(sequence), max_len), pads, dtype=dtype)
    for idx, s in enumerate(sequence):
        trunc = s[:max_len]
        x[idx, :len(trunc)] = trunc
    if return_matrix_for_size:
        # Broadcast positions against lengths. The former list of ``map``
        # objects cannot be converted to an array on Python 3.
        positions = np.arange(max_len)[None, :]
        lengths = np.asarray(v_length, dtype='int64')[:, None]
        v_matrix = (positions < lengths).astype(dtype)
        return x, v_matrix
    return x, np.asarray(v_length, dtype='int32')


def shuffle_data(data, axis=1):
    """Shuffle ``data`` while keeping samples of equal length next to each other.

    Samples are bucketed by ``len(sample[axis])``. Each bucket is shuffled in
    place with ``np.random.shuffle``, the bucket order is shuffled as well,
    and the buckets are then concatenated.

    Args:
        data: Iterable of indexable samples.
        axis: Index of the field whose length defines the bucket.

    Returns:
        A new list with all samples, grouped by bucket.
    """
    buckets = {}
    for sample in data:
        buckets.setdefault(len(sample[axis]), []).append(sample)
    for bucket in buckets.values():
        np.random.shuffle(bucket)
    bucket_order = list(buckets)
    np.random.shuffle(bucket_order)
    shuffled = []
    for length in bucket_order:
        shuffled.extend(buckets[length])
    return shuffled



In [2]:
import torch
from torch import nn
from torch.nn import functional as F


class MwAN(nn.Module):
    """Candidate-answer scorer built from several attention functions.

    Query and passage are embedded and encoded with bidirectional GRUs, then
    matched with the attention variants labelled concat, bilinear, dot and
    minus below, plus a self-attention. The matched representations are
    aggregated by another GRU, pooled into one vector and compared against
    an encoding of every candidate answer.

    Args:
        vocab_size: Vocabulary size; the embedding table holds
            ``vocab_size + 1`` rows.
        embedding_size: Embedding dimension (also the size of the candidate
            encodings and of the final prediction vector).
        encoder_size: Hidden size of each GRU direction.
        drop_out: Dropout probability, applied only while the module is in
            training mode.
    """

    def __init__(self, vocab_size, embedding_size, encoder_size, drop_out=0.2):
        super(MwAN, self).__init__()
        self.drop_out = drop_out
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim=embedding_size)
        self.q_encoder = nn.GRU(input_size=embedding_size, hidden_size=encoder_size, batch_first=True,
                                bidirectional=True)
        self.p_encoder = nn.GRU(input_size=embedding_size, hidden_size=encoder_size, batch_first=True,
                                bidirectional=True)
        self.a_encoder = nn.GRU(input_size=embedding_size, hidden_size=embedding_size // 2, batch_first=True,
                                bidirectional=True)
        self.a_attention = nn.Linear(embedding_size, 1, bias=False)
        # Concat Attention
        self.Wc1 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.Wc2 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vc = nn.Linear(encoder_size, 1, bias=False)
        # Bilinear Attention
        self.Wb = nn.Linear(2 * encoder_size, 2 * encoder_size, bias=False)
        # Dot Attention
        self.Wd = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vd = nn.Linear(encoder_size, 1, bias=False)
        # Minus Attention
        self.Wm = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vm = nn.Linear(encoder_size, 1, bias=False)
        # Self attention
        self.Ws = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vs = nn.Linear(encoder_size, 1, bias=False)

        # Aggregates the six concatenated representations (6 * 2 * encoder_size).
        self.gru_agg = nn.GRU(12 * encoder_size, encoder_size, batch_first=True, bidirectional=True)
        # Prediction layer
        self.Wq = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vq = nn.Linear(encoder_size, 1, bias=False)
        self.Wp1 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.Wp2 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
        self.vp = nn.Linear(encoder_size, 1, bias=False)
        self.prediction = nn.Linear(2 * encoder_size, embedding_size, bias=False)
        self.initiation()

    def initiation(self):
        """Initialise the embedding uniformly and all linear layers with Xavier (gain 0.1)."""
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight, gain=0.1)

    def _dropout(self, x):
        """Apply dropout only in training mode (``F.dropout`` defaults to always-on)."""
        return F.dropout(x, self.drop_out, training=self.training)

    def forward(self, inputs):
        """Score the candidate answers of a batch.

        Args:
            inputs: ``[query, passage, answer, is_train]`` where ``query``
                and ``passage`` are 2-D id tensors (batch first), ``answer``
                is a 3-D id tensor ``(batch, candidates, answer_len)`` and
                ``is_train`` selects the return value.

        Returns:
            If ``is_train`` is true, the scalar mean negative log-likelihood
            of candidate 0; otherwise the index of the best-scoring
            candidate for every sample.
        """
        [query, passage, answer, is_train] = inputs
        q_embedding = self.embedding(query)
        p_embedding = self.embedding(passage)
        a_embeddings = self.embedding(answer)

        # Encode every candidate and pool it with a learned attention.
        a_embedding, _ = self.a_encoder(a_embeddings.view(-1, a_embeddings.size(2), a_embeddings.size(3)))
        a_score = F.softmax(self.a_attention(a_embedding), 1)
        # squeeze(1) removes only the pooled axis, so a batch of one stays 2-D.
        a_output = a_score.transpose(2, 1).bmm(a_embedding).squeeze(1)
        # Use the actual number of candidates instead of a hard-coded 3.
        a_embedding = a_output.view(a_embeddings.size(0), a_embeddings.size(1), -1)

        # NOTE(review): ``q_encoder`` is fed the passage and ``p_encoder`` the
        # query, so ``hq`` holds passage states and ``hp`` query states. This
        # is kept as-is because changing it would alter the model and break
        # existing checkpoints -- confirm whether the swap is intentional.
        hq, _ = self.q_encoder(p_embedding)
        hq = self._dropout(hq)
        hp, _ = self.p_encoder(q_embedding)
        hp = self._dropout(hp)

        # Concat attention. Every squeeze below names the trailing score axis
        # so that length-1 sequences or batches are not collapsed as well.
        _s1 = self.Wc1(hq).unsqueeze(1)
        _s2 = self.Wc2(hp).unsqueeze(2)
        sjt = self.vc(torch.tanh(_s1 + _s2)).squeeze(-1)
        ait = F.softmax(sjt, 2)
        qtc = ait.bmm(hq)
        # Bilinear attention
        _s1 = self.Wb(hq).transpose(2, 1)
        sjt = hp.bmm(_s1)
        ait = F.softmax(sjt, 2)
        qtb = ait.bmm(hq)
        # Dot attention
        _s1 = hq.unsqueeze(1)
        _s2 = hp.unsqueeze(2)
        sjt = self.vd(torch.tanh(self.Wd(_s1 * _s2))).squeeze(-1)
        ait = F.softmax(sjt, 2)
        qtd = ait.bmm(hq)
        # Minus attention
        sjt = self.vm(torch.tanh(self.Wm(_s1 - _s2))).squeeze(-1)
        ait = F.softmax(sjt, 2)
        qtm = ait.bmm(hq)
        # Self attention over ``hp``
        _s1 = hp.unsqueeze(1)
        _s2 = hp.unsqueeze(2)
        sjt = self.vs(torch.tanh(self.Ws(_s1 * _s2))).squeeze(-1)
        ait = F.softmax(sjt, 2)
        qts = ait.bmm(hp)

        aggregation = torch.cat([hp, qts, qtc, qtd, qtb, qtm], 2)
        aggregation_representation, _ = self.gru_agg(aggregation)

        # Attention-pool ``hq`` and use the result to pool the aggregated states.
        sj = self.vq(torch.tanh(self.Wq(hq))).transpose(2, 1)
        rq = F.softmax(sj, 2).bmm(hq)
        sj = F.softmax(self.vp(self.Wp1(aggregation_representation) + self.Wp2(rq)).transpose(2, 1), 2)
        rp = sj.bmm(aggregation_representation)
        encoder_output = self._dropout(F.leaky_relu(self.prediction(rp)))

        # One logit per candidate: (batch, candidates).
        logits = a_embedding.bmm(encoder_output.transpose(2, 1)).squeeze(-1)
        if not is_train:
            # argmax over logits equals argmax over their softmax.
            return logits.argmax(1)
        # log_softmax is the numerically stable form of log(softmax(.)).
        loss = -F.log_softmax(logits, 1)[:, 0].mean()
        return loss

In [4]:
import pickle
import torch

class Config:
    """Settings for the training script.

    Any attribute can be overridden by keyword, e.g.
    ``Config(batch_size=32, cuda=False)``. Unknown names raise ``TypeError``.
    """

    def __init__(self, **overrides):
        # Directory holding the preprocessed pickles.
        self.data = '/home/xq/data/aichallenger/Opinion Questions Machine Reading Comprehension/'
        # Word-count threshold for preprocessing.
        self.threshold = 5
        # Number of training epochs.
        self.epoch = 50
        # Embedding size.
        self.emsize = 128
        # Encoder hidden size.
        self.nhid = 128
        self.batch_size = 64
        # Number of batches between two training log lines.
        self.log_interval = 300
        self.dropout = 0.2
        # Use the GPU only when one is present, so the script also runs on
        # CPU-only machines (this was hard-coded to True).
        self.cuda = torch.cuda.is_available()
        # Path the best model is saved to.
        self.save = "../ckpt/model.pt"

        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError('unknown config option: {}'.format(name))
            setattr(self, name, value)


# Global configuration shared by train(), test() and the training loop.
args = Config()

# The vocabulary size is hard-coded instead of being recomputed by the call
# below -- presumably the value returned by an earlier preprocessing run;
# TODO confirm it matches the pickled vocabulary.
# vocab_size = process_data(args.data, args.threshold)
vocab_size = 96973

# Build the model and report its number of trainable parameters.
model = MwAN(vocab_size=vocab_size, embedding_size=args.emsize, encoder_size=args.nhid, drop_out=args.dropout)
print('Model total parameters:', get_model_parameters(model))
if args.cuda:
    model.cuda()
# Adamax with its default hyper-parameters.
optimizer = torch.optim.Adamax(model.parameters())

# Load the preprocessed, id-encoded datasets.
# NOTE(review): pickle.load must only be used on trusted files.
with open(args.data + 'train.pickle', 'rb') as f:
    train_data = pickle.load(f)
with open(args.data + 'dev.pickle', 'rb') as f:
    dev_data = pickle.load(f)
# Order dev samples by the length of field 1 (the passage ids in test()).
dev_data = sorted(dev_data, key=lambda x: len(x[1]))

print('train data size {:d}, dev data size {:d}'.format(len(train_data), len(dev_data)))


def train(epoch):
    """Run one training epoch over ``train_data``.

    The data is reshuffled with ``shuffle_data`` and cut into batches of
    ``args.batch_size``. Queries are padded/truncated to at most 50 ids and
    passages to at most 350. The mean loss is printed every
    ``args.log_interval`` batches.

    Args:
        epoch: Epoch number, used only in the log line.
    """
    model.train()
    samples = shuffle_data(train_data, 1)
    num_samples = len(samples)
    running_loss = 0.0
    for step, start in enumerate(range(0, num_samples, args.batch_size), 1):
        batch = samples[start:start + args.batch_size]
        query_ids, _ = padding([sample[0] for sample in batch], max_len=50)
        passage_ids, _ = padding([sample[1] for sample in batch], max_len=350)
        answer_ids = pad_answer([sample[2] for sample in batch])
        tensors = [torch.LongTensor(ids) for ids in (query_ids, passage_ids, answer_ids)]
        if args.cuda:
            tensors = [tensor.cuda() for tensor in tensors]
        query, passage, answer = tensors

        optimizer.zero_grad()
        loss = model([query, passage, answer, True])
        loss.backward()
        running_loss += loss.item()
        optimizer.step()

        if step % args.log_interval != 0:
            continue
        # Report the mean loss since the last log line and the share of the
        # data consumed so far.
        print('|------epoch {:d} train error is {:f}  eclipse {:.2f}%------|'.format(
            epoch, running_loss / args.log_interval, start * 100.0 / num_samples))
        running_loss = 0


def test():
    """Evaluate the model on ``dev_data`` and return the accuracy in percent.

    A prediction counts as correct when the model selects candidate index 0.
    Queries are padded/truncated to at most 50 ids and passages to at most 500.
    """
    model.eval()
    correct, seen = 0.0, 0.0
    with torch.no_grad():
        for start in range(0, len(dev_data), args.batch_size):
            batch = dev_data[start:start + args.batch_size]
            query_ids, _ = padding([sample[0] for sample in batch], max_len=50)
            passage_ids, _ = padding([sample[1] for sample in batch], max_len=500)
            answer_ids = pad_answer([sample[2] for sample in batch])
            tensors = [torch.LongTensor(ids) for ids in (query_ids, passage_ids, answer_ids)]
            if args.cuda:
                tensors = [tensor.cuda() for tensor in tensors]
            query, passage, answer = tensors
            prediction = model([query, passage, answer, False])
            correct += torch.eq(prediction, 0).sum().item()
            seen += len(batch)
    return correct * 100.0 / seen


Model total parameters: 14524288
train data size 250000, dev data size 30000


In [None]:
import numpy as np
import os

# Create the checkpoint directory before training starts; otherwise the
# first save would fail only after a whole epoch has been trained.
checkpoint_dir = os.path.dirname(args.save)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Train for args.epoch epochs and keep the model with the best dev accuracy.
best = 0.0
for epoch in range(args.epoch):
    train(epoch)
    acc = test()
    if acc > best:
        best = acc
        # NOTE(review): this pickles the whole module object, which ties the
        # checkpoint to this class definition; saving model.state_dict()
        # would be more portable but changes the checkpoint format.
        with open(args.save, 'wb') as f:
            torch.save(model, f)
    print('epcoh {:d} dev acc is {:f}, best dev acc {:f}'.format(epoch, acc, best))

|------epoch 0 train error is 0.936349  eclipse 7.65%------|
|------epoch 0 train error is 0.929350  eclipse 15.33%------|
|------epoch 0 train error is 0.876534  eclipse 23.01%------|
|------epoch 0 train error is 0.881147  eclipse 30.69%------|
|------epoch 0 train error is 0.818682  eclipse 38.37%------|
|------epoch 0 train error is 0.820160  eclipse 46.05%------|
|------epoch 0 train error is 0.787513  eclipse 53.73%------|
|------epoch 0 train error is 0.804186  eclipse 61.41%------|
|------epoch 0 train error is 0.806545  eclipse 69.09%------|
|------epoch 0 train error is 0.777462  eclipse 76.77%------|
|------epoch 0 train error is 0.768388  eclipse 84.45%------|
|------epoch 0 train error is 0.719827  eclipse 92.13%------|
|------epoch 0 train error is 0.750242  eclipse 99.81%------|


  "type " + obj.__name__ + ". It won't be checked "


epcoh 0 dev acc is 66.206667, best dev acc 66.206667
|------epoch 1 train error is 0.775766  eclipse 7.65%------|
|------epoch 1 train error is 0.685328  eclipse 15.33%------|
|------epoch 1 train error is 0.690590  eclipse 23.01%------|
|------epoch 1 train error is 0.709426  eclipse 30.69%------|
|------epoch 1 train error is 0.744376  eclipse 38.37%------|
|------epoch 1 train error is 0.665399  eclipse 46.05%------|
|------epoch 1 train error is 0.714687  eclipse 53.73%------|
|------epoch 1 train error is 0.665559  eclipse 61.41%------|
|------epoch 1 train error is 0.710789  eclipse 69.09%------|
|------epoch 1 train error is 0.665269  eclipse 76.77%------|
|------epoch 1 train error is 0.739994  eclipse 84.45%------|
|------epoch 1 train error is 0.711733  eclipse 92.13%------|
|------epoch 1 train error is 0.685748  eclipse 99.81%------|
epcoh 1 dev acc is 68.736667, best dev acc 68.736667
|------epoch 2 train error is 0.654842  eclipse 7.65%------|
|------epoch 2 train error i

In [None]:
# Drop the notebook's reference to the model (presumably to release its
# memory -- other references would keep it alive).
del model