## 1.任务描述
迁移学习是自然语言处理中的重要一环，其主要目的是通过从已学习的相关任务中转移知识来改进新任务的学习效果，从而提高模型的泛化能力。
本次评测任务的主要目标是针对中文的疾病问答数据，进行病种间的迁移学习。具体而言，给定来自5个不同病种的问句对，要求判定两个句子语义是否相同或者相近。该评测任务发布于CHIP2019会议(http://cips-chip.org.cn/)。

## 2.任务说明
category表示问句对的病种名称，分别对应：diabetes-糖尿病，hypertension-高血压，hepatitis-乙肝，aids-艾滋病，breast_cancer-乳腺癌。label表示问句之间的语义是否相同。若相同，标为1，若不相同，标为0。
### 标注示例如下：
category: diabetes
问句1：糖尿病吃什么？
问句2：糖尿病的食谱？
label:1

category: hepatitis
问句1：乙肝小三阳的危害？
问句2：乙肝大三阳的危害？
label:0
## 3.评测指标
同CHIP-CTC任务，本任务的评价指标使用宏观F1值(Macro-F1，或称Average-F1)。

## 4.评测数据
本评测开放训练集数据16000条，验证集数据4000条，测试集数据10000条（注：榜单的训练数据和验证集来自原CHIP评测任务的训练集，榜单的测试数据10000条来自CHIP评测任务的B榜）。

## 说明
处理步骤和KUAKE-QTR基本相同，不再赘述。

In [1]:
# split text into single characters
def tokenize(text):
    """Return the individual characters of ``text`` as a list.

    Iterating a string yields one character at a time, so the text is
    tokenized at character level; no word segmentation is performed.
    """
    return [char for char in text]

In [2]:
# word to sequence
UNK_TAG = "UNK"
PAD_TAG = "PAD"


class Word2Sequence():
    """Vocabulary that maps tokens to integer indices and back.

    Index ``UNK`` (0) is returned for tokens missing from the vocabulary and
    index ``PAD`` (1) is used to pad sequences to a fixed length. Typical
    use: call :meth:`fit` once per tokenized sentence, then
    :meth:`build_vocab` once, then the ``*_transform`` methods.
    """
    UNK = 0
    PAD = 1

    def __init__(self):
        self.word2index_dict = {
            UNK_TAG: self.UNK,
            PAD_TAG: self.PAD,
        }
        # Defined up front so reverse lookups also work before build_vocab().
        self.index2word_dict = {
            self.UNK: UNK_TAG,
            self.PAD: PAD_TAG,
        }
        # token -> number of occurrences seen by fit()
        self.count = {}

    def fit(self, sentence):
        """Accumulate token frequencies from one tokenized sentence.

        Args:
            sentence: iterable of hashable tokens.
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=0, max=None, max_features=None):
        """Build the token/index mappings from the collected frequencies.

        The frequency table ``self.count`` is filtered in place. The
        mappings are rebuilt from scratch on every call, so calling this
        method repeatedly yields a consistent vocabulary.

        Args:
            min: keep only tokens whose frequency is strictly greater.
            max: if given, keep only tokens whose frequency is strictly less.
            max_features: if given, keep only this many most frequent tokens.
        """
        self.count = {word: value for word, value in self.count.items() if value > min}
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value < max}
        if max_features is not None:
            ranked = sorted(self.count.items(), key=lambda item: item[-1], reverse=True)
            self.count = dict(ranked[:max_features])

        # Start again from the reserved entries so a second call cannot
        # assign colliding indices to words that are already present.
        self.word2index_dict = {
            UNK_TAG: self.UNK,
            PAD_TAG: self.PAD,
        }
        for word in self.count:
            # A token spelled like a reserved tag must not steal its index.
            if word not in self.word2index_dict:
                self.word2index_dict[word] = len(self.word2index_dict)
        self.index2word_dict = {index: word for word, index in self.word2index_dict.items()}

    def words2index_transform(self, sentence, max_len=None):
        """Convert a tokenized sentence into a list of indices.

        Args:
            sentence: sequence of tokens (any iterable is accepted).
            max_len: if given, the result is padded with ``PAD`` or truncated
                to exactly this length.

        Returns:
            List of integer indices; unknown tokens map to ``UNK``.
        """
        tokens = list(sentence)
        if max_len is not None:
            if max_len > len(tokens):
                tokens = tokens + [PAD_TAG] * (max_len - len(tokens))
            else:
                tokens = tokens[:max_len]
        return [self.word2index_dict.get(word, self.UNK) for word in tokens]

    def index2words_transform(self, sentence):
        """Convert a sequence of indices back into tokens.

        Returns:
            List of tokens; an index that is not in the vocabulary yields
            ``None`` at that position.
        """
        return [self.index2word_dict.get(index) for index in sentence]

    def __len__(self):
        """Return the vocabulary size, including the UNK and PAD entries."""
        return len(self.word2index_dict)

In [3]:
# dictionary build
import pickle
from tqdm import tqdm
import json
import os

# Forward slashes are accepted on Windows as well as POSIX, so the data
# locations no longer depend on the operating system.
train_data_path = "data/CHIP-STS/CHIP-STS_train.json"
test_data_path = "data/CHIP-STS/CHIP-STS_test.json"
dev_data_path = "data/CHIP-STS/CHIP-STS_dev.json"
vocab_cache_path = "models/CHIP-STS_Word2Sequence.pkl"

if not os.path.exists(vocab_cache_path):
    # No cached vocabulary yet: count the characters of both questions of
    # every training pair, then build and cache the vocabulary.
    word_index_tranformer = Word2Sequence()
    with open(train_data_path, encoding="utf-8") as f:
        for data in tqdm(json.load(f)):
            word_index_tranformer.fit(tokenize(data['text1']))
            word_index_tranformer.fit(tokenize(data['text2']))
    word_index_tranformer.build_vocab()
    # Create the cache directory first so the dump cannot fail after the
    # vocabulary has already been computed.
    os.makedirs(os.path.dirname(vocab_cache_path), exist_ok=True)
    with open(vocab_cache_path, 'wb') as cache_file:
        pickle.dump(word_index_tranformer, cache_file)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were produced by this notebook.
    with open(vocab_cache_path, 'rb') as cache_file:
        word_index_tranformer = pickle.load(cache_file)
print('\n' + str(len(word_index_tranformer)))


2148


In [4]:
# dataset
import torch
from torch.utils.data import Dataset
import json

# Every question is padded or truncated to this many characters.
max_sentece_length = 23


class RosDataset(Dataset):
    """Question-pair dataset backed by a JSON file.

    The file is loaded fully into memory as a list of records. Each record
    is read through its "text1" and "text2" keys and, when ``train`` is
    true, its "label" key.

    Args:
        data_path: path of the JSON file to load.
        train: whether items carry a label (True) or only the two texts.
    """

    def __init__(self, data_path, train=True):
        with open(data_path, encoding="utf-8") as f:
            self.data_list = json.load(f)
        self.train = train

    def __getitem__(self, index):
        """Return the encoded record stored at position ``index``.

        Returns:
            ``(label, text1, text2)`` when ``train`` is true, otherwise
            ``(text1, text2)``. The texts are ``LongTensor`` index sequences
            of length ``max_sentece_length``; the label is an ``int``.
        """
        record = self.data_list[index]
        encoded_pair = tuple(
            torch.LongTensor(
                word_index_tranformer.words2index_transform(
                    tokenize(record[key]), max_len=max_sentece_length
                )
            )
            for key in ("text1", "text2")
        )
        if not self.train:
            return encoded_pair
        return (int(record["label"]), *encoded_pair)

    def __len__(self):
        """Return the total number of records."""
        return len(self.data_list)

# The train and dev files are loaded with labels; the test file is loaded
# without them.
train_dataset = RosDataset(train_data_path, train=True)
dev_dataset = RosDataset(dev_data_path, train=True)
test_dataset = RosDataset(test_data_path, train=False)

# Sanity check: show the first encoded training sample and the dataset size.
first_sample = train_dataset[0]
print(first_sample, len(train_dataset))

(0, tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  1,  1,  1,  1,
         1,  1,  1,  1,  1]), tensor([16, 17, 11, 12, 18, 19, 20, 21, 22, 23, 22,  2,  3,  4,  1,  1,  1,  1,
         1,  1,  1,  1,  1])) 16000


In [5]:
# dataloader
from torch.utils.data import DataLoader
import torch

# Training uses shuffled mini-batches of 128; the dev and test loaders
# yield one sample per batch, and the test loader keeps the file order.
train_data_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=1, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

# Print only the first training batch as a sanity check.
for index, batch in enumerate(train_data_loader):
    if index != 0:
        break
    label, indexed_text1, indexed_text2 = batch
    print(f"{index}:{label},{indexed_text1},{indexed_text2}")

0:tensor([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
        1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 1]),tensor([[175, 176, 442,  ...,   1,   1,   1],
        [ 26,  27,   4,  ...,   1,   1,   1],
        [ 47,  48,  49,  ...,   1,   1,   1],
        ...,
        [210, 211, 542,  ...,   1,   1,   1],
        [ 26,  27,   4,  ...,   1,   1,   1],
        [210, 211, 212,  ...,   1,   1,   1]]),tensor([[ 26,  27,   4,  ...,   1,   1,   1],
        [ 26,  27,   4,  ...,   1,   1,   1],
        [344, 142,  47,  ...,   1,   1,   1],
        ...,
        [210, 211, 542,  ...,   1,   1,   1],
        [ 26,  27,   4,  ...,   1,   1,   1],
        [452,   7,  41,  ...,   

In [6]:
# Siamese network
import torch.nn as nn
import torch.nn.functional as F
import torch

class SiameseNetwork(nn.Module):
    """Sentence-pair classifier with shared encoders for both inputs.

    Pipeline: embedding -> 2-layer BiGRU -> soft attention between the two
    sentences -> GRU -> average/max pooling -> fully connected layers ->
    log-softmax over 2 classes.

    Args:
        vocab_size: number of rows of the embedding table. Defaults to
            ``len(word_index_tranformer)`` (the module-level vocabulary).
        pad_index: index of the padding token. Defaults to
            ``word_index_tranformer.PAD``.
    """

    def __init__(self, vocab_size=None, pad_index=None):
        super().__init__()
        # Fall back to the module-level vocabulary so SiameseNetwork() keeps
        # working exactly as before.
        if vocab_size is None:
            vocab_size = len(word_index_tranformer)
        if pad_index is None:
            pad_index = word_index_tranformer.PAD
        self.pad_index = pad_index

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=300, padding_idx=pad_index)
        # Bidirectional with hidden size 256 -> 512 features per position.
        self.gru1 = nn.GRU(input_size=300, hidden_size=256, num_layers=2, batch_first=True, bidirectional=True)
        # Input is the BiGRU output concatenated with its aligned counterpart
        # (512 + 512 = 256 * 4 features).
        self.gru2 = nn.GRU(input_size=256 * 4, hidden_size=256, num_layers=1, batch_first=True, bidirectional=False)
        # Input is avg + max pooling (2 * 256) for each of the two sentences.
        self.dnn = nn.Sequential(
            nn.Linear(256 * 4, 256),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 256),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 2)
        )

    def forward(self, input1, input2):
        """Score a batch of sentence pairs.

        Args:
            input1: integer tensor of token indices, (batch, length).
            input2: integer tensor of token indices, (batch, length); the
                concatenations below require the same length as ``input1``.

        Returns:
            Tensor of shape (batch, 2) holding log-probabilities.
        """
        # True at padding positions.
        mask1 = input1.eq(self.pad_index)
        mask2 = input2.eq(self.pad_index)
        input1 = self.embedding(input1)
        input2 = self.embedding(input2)
        output1, _ = self.gru1(input1)
        output2, _ = self.gru1(input2)

        output1_align, output2_align = self.soft_attention_align(output1, output2, mask1, mask2)
        output1 = torch.cat([output1, output1_align], 2)
        output2 = torch.cat([output2, output2_align], 2)

        gru2_output1, _ = self.gru2(output1)
        gru2_output2, _ = self.gru2(output2)

        output1_pooled = self.apply_pooling(gru2_output1)
        output2_pooled = self.apply_pooling(gru2_output2)
        out = torch.cat([output1_pooled, output2_pooled], dim=-1)
        out = self.dnn(out)

        return F.log_softmax(out, dim=-1)

    def apply_pooling(self, output):
        """Pool over the whole sequence dimension.

        Args:
            output: tensor of shape (batch, length, features).

        Returns:
            Tensor of shape (batch, 2 * features): average pooling followed
            by max pooling, each taken over all ``length`` positions.
        """
        channels_first = output.transpose(1, 2)
        # kernel_size equals the sequence length, i.e. global pooling.
        window = output.size(1)
        avg_pooled = F.avg_pool1d(channels_first, kernel_size=window).squeeze(-1)
        max_pooled = F.max_pool1d(channels_first, kernel_size=window).squeeze(-1)
        return torch.cat([avg_pooled, max_pooled], dim=-1)

    def soft_attention_align(self, x1, x2, mask1, mask2):
        """Attend between two encoded sentences, ignoring padded positions.

        Args:
            x1: tensor of shape (batch, len1, dim).
            x2: tensor of shape (batch, len2, dim).
            mask1: bool tensor (batch, len1), True where ``x1`` is padding.
            mask2: bool tensor (batch, len2), True where ``x2`` is padding.

        Returns:
            ``(x1_output, x2_output)`` where ``x1_output`` has shape
            (batch, len2, dim) and ``x2_output`` has shape (batch, len1, dim).
        """
        attention_weight = x1.bmm(x2.transpose(1, 2))
        # A large finite value instead of -inf: a sentence consisting only of
        # padding would otherwise produce softmax(all -inf) = NaN. For rows
        # with at least one real token the masked weights are still exactly 0.
        fill_value = torch.finfo(attention_weight.dtype).min

        x1_weight = F.softmax(attention_weight.masked_fill(mask2.unsqueeze(1), fill_value), dim=-1)
        x2_output = x1_weight.bmm(x2)

        x2_weight = F.softmax(
            attention_weight.transpose(1, 2).masked_fill(mask1.unsqueeze(1), fill_value), dim=-1
        )
        x1_output = x2_weight.bmm(x1)

        # NOTE(review): x1_output is indexed by the positions of x2, yet
        # forward() concatenates it with the x1 encoding (and vice versa).
        # This looks swapped compared with the usual ESIM-style alignment;
        # it is kept as is because changing it alters the trained model.
        return x1_output, x2_output

## Siamese Network
- embedding
- BiGRU
- attention
- GRU
- Pooling
- FC+softmax
## 说明
- 该任务和KUAKE-QQR使用完全相同的模型
- 该任务和KUAKE-QTR使用的模型大致相同
    - 该任务使用的模型规模小的多
        - 第二层GRU不再是双向的
        - 第一层GRU为2层，而第二层GRU只有一层；相较之下KUAKE-QTR为6+3
        - 增加了pooling理论上应该是缩小计算量，但实际上kernel size等于序列长度（即全局池化），且avg与max的结果做了cat，送入全连接层的特征维度翻倍，反而增加了计算量
    - 完全有理由相信，采用和KUAKE-QTR模型相同的增加网络深度的方式，可以得到十分明显的性能提升
    - 但相应的代价是训练时间由几分钟涨到几十分钟
- 需要特别说明的是，原任务要求利用领域迁移知识而给出了问题类别的标签，但这里完全没有用到
    - 其实没有理解有了这个标签要怎么利用迁移知识
    - 但或许可以在使用专门设计的loss function时派上一定用处

In [7]:
from tqdm import tqdm
from torch.optim import Adam


def train(epochs, model, model_path=None, optimizer_path=None, device=None):
    """Train ``model`` on the module-level ``train_data_loader``.

    Optimizes the negative log-likelihood of the model output with Adam
    (learning rate 0.001).

    Args:
        epochs: number of full passes over ``train_data_loader``.
        model: network called as ``model(text1, text2)``; its output is fed
            to ``F.nll_loss`` together with the batch labels.
        model_path: if given, the model ``state_dict`` is saved here after
            the last epoch.
        optimizer_path: if given, the optimizer ``state_dict`` is saved here
            after the last epoch.
        device: if given, every batch is moved to this device.
    """
    model = model.to(device)
    model.train()
    optimizer = Adam(model.parameters(), lr=0.001)
    move_batches = device is not None

    for _epoch in tqdm(range(epochs), desc="Train"):
        for label, text1, text2 in train_data_loader:
            if move_batches:
                label, text1, text2 = (t.to(device) for t in (label, text1, text2))
            optimizer.zero_grad()
            log_probs = model(text1, text2)
            batch_loss = F.nll_loss(log_probs, label)
            batch_loss.backward()
            optimizer.step()

    # Checkpoints are written once, after training has finished.
    if model_path is not None:
        torch.save(model.state_dict(), model_path)
    if optimizer_path is not None:
        torch.save(optimizer.state_dict(), optimizer_path)

In [8]:
def evaluation_accuracy(model, test_data_loader, device=None):
    """Compute the classification accuracy of ``model`` on a labelled loader.

    Args:
        model: network called as ``model(text1, text2)``; the class scores
            are expected along the last dimension of its output.
        test_data_loader: iterable yielding ``(label, text1, text2)``
            batches of any batch size.
        device: if given, every batch is moved to this device.

    Returns:
        Fraction of correctly classified samples, or 0.0 when the loader
        yields no samples.
    """
    count_correct = 0
    count_total = 0
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        # Iterate the loader that was passed in, not a module-level one.
        for label, text1, text2 in tqdm(test_data_loader, desc="Evaluation"):
            if device is not None:
                label = label.to(device)
                text1 = text1.to(device)
                text2 = text2.to(device)
            # Per-sample argmax so batches larger than one are scored correctly.
            predicted = model(text1, text2).argmax(dim=-1)
            count_correct += (predicted == label).sum().item()
            count_total += label.numel()
    print(f"\n{count_correct}/{count_total}")
    if count_total == 0:
        return 0.0
    return count_correct / count_total

In [9]:
# Set to False to load previously saved weights instead of training.
train_mode = True
model = SiameseNetwork()
# Forward slashes avoid the invalid "\S" escape sequence and are accepted on
# Windows as well as POSIX.
model_path = "models/STS_SiameseNetwork.pth"
optimizer_path = "models/STS_SiameseNetwork_optim.pth"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if train_mode:
    train(epochs=35, model=model, model_path=model_path, optimizer_path=optimizer_path, device=device)
else:
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))

print('\n' + str(evaluation_accuracy(model, dev_data_loader, device=device)))

Train: 100%|██████████| 35/35 [04:26<00:00,  7.62s/it]
Evaluation: 100%|██████████| 4000/4000 [00:45<00:00, 88.51it/s]
3244/4000

0.811



In [10]:
import os

# Forward slashes avoid the invalid "\C" escape sequence and are accepted on
# Windows as well as POSIX.
dump_file_path = "result/CHIP-STS_test.json"
with open(test_data_path, 'r', encoding="utf-8") as source:
    data = json.load(source)

model = model.to(device)
model.eval()
with torch.no_grad():
    # test_data_loader is created with batch_size=1 and shuffle=False, so the
    # batch index is also the position of the record in the test file.
    for index, (text1, text2) in tqdm(enumerate(test_data_loader), desc="Evaluation", total=len(test_data_loader)):
        if device is not None:
            text1 = text1.to(device)
            text2 = text2.to(device)
        # NOTE(review): the label is stored as an int; confirm whether the
        # submission format expects a string instead.
        data[index]["label"] = model(text1, text2).argmax().item()

# Serialize once after all predictions are filled in; doing this inside the
# loop re-encoded the whole dataset for every sample.
json_result = json.dumps(data, ensure_ascii=False)

os.makedirs(os.path.dirname(dump_file_path), exist_ok=True)
with open(dump_file_path, 'w', encoding="utf-8") as destination:
    destination.write(json_result)

Evaluation: 100%|██████████| 10000/10000 [04:53<00:00, 34.06it/s]
