## 1.任务描述
查询词之间的相关性是评估两个Query所表述主题的匹配程度，即判断Query-A和Query-B是否发生转义，以及转义的程度。Query即搜索词，包括用户在搜索框中输入的词、数字、符号等内容；Query的主题是指Query的专注点，用户在输入Query时希望找到与Query主题相关的网页。判定两个查询词之间的相关性是一项重要的任务，常用于长尾Query的搜索质量优化场景，本任务数据集就是在这样的背景下产生的。

## 2.任务说明
Query-A和Query-B的相关度共分为3档（0-2），0分表示相关性最差，2分表示相关性最好。

2分：表示A与B等价，表述完全一致。

1分： B为A的语义子集，B指代范围小于A。

0分：B为A的语义父集，B指代范围大于A； 或者A与B语义毫无关联。

## 3.评测指标
本任务的评价指标使用准确率Accuracy来评估，即：
准确率(Accuracy) = #预测正确的条目数 / #预测总条目数

## 4.评测数据
本评测开放训练集数据15000条，验证集数据1600条，测试集数据1596条。

## 说明
处理过程与KUAKE-QTR基本相同，不再赘述。

In [1]:
# cut by character (each character of the query is one token)

def tokenize(text):
    """Split ``text`` into a list holding each of its characters in order."""
    return [*text]

In [2]:
# word to sequence
UNK_TAG = "UNK"
PAD_TAG = "PAD"
class Word2Sequence():
    """Vocabulary that maps tokens to integer indices and back.

    Index 0 is reserved for unknown tokens (``UNK_TAG``) and index 1 for
    padding (``PAD_TAG``). Typical usage is ``fit`` on every tokenized
    sentence, one ``build_vocab`` call, then ``words2index_transform``.
    """
    UNK = 0
    PAD = 1

    def __init__(self):
        self.word2index_dict = {
            UNK_TAG : self.UNK,
            PAD_TAG : self.PAD,
        }
        # Created here (not only in build_vocab) so the reverse lookup is
        # usable even before the vocabulary has been built.
        self.index2word_dict = {
            index: word for word, index in self.word2index_dict.items()
        }
        # token -> number of occurrences seen by fit()
        self.count = {}


    def fit(self, sentence):
        """Add the tokens of one tokenized ``sentence`` to the frequency table."""
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1


    def build_vocab(self,min=0,max=None,max_features=None):
        """Build the token/index mappings from the collected frequencies.

        The parameter names shadow builtins but are kept for backward
        compatibility with existing callers.

        Args:
            min: keep only tokens whose count is strictly greater than this.
            max: if given, keep only tokens whose count is strictly less
                than this.
            max_features: if given, keep only the this-many most frequent
                tokens.

        The frequency table itself is filtered in place. The mappings are
        rebuilt from scratch on every call so that calling this method more
        than once never hands the same index to two different tokens.
        """
        self.count = {word:value for word,value in self.count.items() if value > min}
        if max is not None:
            self.count = {word:value for word,value in self.count.items() if value < max}
        if max_features is not None:
            self.count = dict(sorted(self.count.items(), key = lambda x:x[-1], reverse=True)[:max_features])

        self.word2index_dict = {
            UNK_TAG : self.UNK,
            PAD_TAG : self.PAD,
        }
        for word in self.count:
            # setdefault keeps the reserved UNK/PAD indices untouched should a
            # token ever be spelled exactly like one of the special tags.
            self.word2index_dict.setdefault(word, len(self.word2index_dict))
        self.index2word_dict = {
            index: word for word, index in self.word2index_dict.items()
        }


    def words2index_transform(self, sentence, max_len=None):
        """Convert a tokenized sentence into a list of vocabulary indices.

        Args:
            sentence: sequence of tokens (any iterable; it is copied).
            max_len: if given, the result is truncated or right-padded with
                the PAD index to exactly this length.

        Returns:
            A list of ints; tokens missing from the vocabulary map to ``UNK``.
        """
        sentence = list(sentence)
        if max_len is not None:
            sentence = sentence[:max_len]
            # A non-positive multiplier yields an empty list, so this only
            # pads when the sentence is shorter than max_len.
            sentence = sentence + [PAD_TAG] * (max_len - len(sentence))
        return [self.word2index_dict.get(word, self.UNK) for word in sentence]


    def index2words_transform(self, sentence):
        """Convert a sequence of indices back to tokens (``None`` if unknown)."""
        return [self.index2word_dict.get(index) for index in sentence]


    def __len__(self):
        """Return the vocabulary size, including the two special tokens."""
        return len(self.word2index_dict)

In [3]:
# dictionary build
import pickle
from tqdm import tqdm
import json
import os

# Dataset locations. Forward slashes work on Windows and POSIX alike and match
# the style of the vocabulary path below.
train_data_path = "data/KUAKE-QQR/KUAKE-QQR_train.json"
test_data_path = "data/KUAKE-QQR/KUAKE-QQR_test.json"
dev_data_path = "data/KUAKE-QQR/KUAKE-QQR_dev.json"
vocab_path = "models/KUAKE-QQR_Word2Sequence.pkl"

if not os.path.exists(vocab_path):
    # No cached vocabulary yet: count characters over both queries of every
    # training sample, build the vocabulary and cache it on disk.
    word_index_tranformer = Word2Sequence()
    with open(train_data_path, encoding="utf-8") as f:
        for data in tqdm(json.load(f)):
            word_index_tranformer.fit(tokenize(data['query1']))
            word_index_tranformer.fit(tokenize(data['query2']))
    word_index_tranformer.build_vocab()
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(word_index_tranformer, vocab_file)
else:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(vocab_path, 'rb') as vocab_file:
        word_index_tranformer = pickle.load(vocab_file)
print('\n' + str(len(word_index_tranformer)))

100%|██████████| 15000/15000 [00:00<00:00, 130441.80it/s]
2047



In [4]:
# dataset
import torch
from torch.utils.data import Dataset
import json

# Every query is truncated or padded to this many characters.
max_sentece_length = 12
class RosDataset(Dataset):
    """Dataset over a KUAKE-QQR JSON file containing a list of query pairs.

    With ``train=True`` each item is ``(label, query1_ids, query2_ids)``;
    otherwise it is ``(query1_ids, query2_ids)``.
    """

    def __init__(self, data_path, train=True):
        self.train = train
        with open(data_path, encoding="utf-8") as handle:
            self.data_list = json.load(handle)

    def __len__(self):
        # Total number of samples loaded from the file.
        return len(self.data_list)

    def __getitem__(self, index):
        # Fetch one record and encode both of its queries the same way.
        record = self.data_list[index]
        encoded = tuple(
            torch.LongTensor(
                word_index_tranformer.words2index_transform(
                    tokenize(record[field]), max_len=max_sentece_length
                )
            )
            for field in ("query1", "query2")
        )
        if not self.train:
            return encoded
        return (int(record["label"]), *encoded)

# Train and dev items carry labels; the test split is loaded without them.
train_dataset = RosDataset(train_data_path, train=True)
dev_dataset = RosDataset(dev_data_path, train=True)
test_dataset = RosDataset(test_data_path, train=False)

# Sanity check: show the first encoded training sample and the split size.
first_sample = train_dataset[0]
print(first_sample, len(train_dataset))

(0, tensor([2, 3, 4, 5, 6, 1, 1, 1, 1, 1, 1, 1]), tensor([4, 5, 3, 7, 1, 1, 1, 1, 1, 1, 1, 1])) 15000


In [5]:
# dataloader
from torch.utils.data import DataLoader
import torch

# Training uses shuffled mini-batches; dev and test are served one sample at a time.
train_data_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=1, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

# Print only the first training batch as a smoke test of the pipeline.
for index, batch in enumerate(train_data_loader):
    if index != 0:
        break
    label, indexed_text1, indexed_text2 = batch
    print(f"{index}:{label},{indexed_text1},{indexed_text2}")

0:tensor([1, 1, 2, 1, 0, 1, 0, 2, 0, 2, 0, 0, 1, 0, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0,
        0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 2, 0, 2, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 0, 0, 0, 2, 2, 1, 0, 0, 0,
        2, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 1]),tensor([[ 847,  717,  537,  ...,    1,    1,    1],
        [ 148,  149,  150,  ...,    1,    1,    1],
        [  67,   43,  654,  ...,  735,  517,    1],
        ...,
        [1454,    2,  165,  ...,  133,  462,    1],
        [ 238,   81,  239,  ...,  159,    1,    1],
        [ 421, 1163,   14,  ...,    1,    1,    1]]),tensor([[847, 717, 537,  ...,   1,   1,   1],
        [148, 149, 150,  ...,   1,   1,   1],
        [ 67,  43, 430,  ..., 735, 517,   1],
        ...,
        [690,  28, 974,  ..., 128, 109, 129],
        [260, 261, 242,  ...,   1,   1,   

In [6]:
# Siamese network
import torch.nn as nn
import torch.nn.functional as F
import torch

class SiameseNetwork(nn.Module):
    """Siamese GRU matcher with soft attention alignment for query pairs.

    Both queries go through the same embedding and bidirectional GRU, are
    enriched with an attention-aligned view, composed by a second GRU,
    pooled (average + max) and classified into 3 classes.

    Args:
        vocab_size: number of embedding rows. Defaults to
            ``len(word_index_tranformer)`` (the module-level vocabulary).
        pad_index: index of the padding token. Defaults to
            ``word_index_tranformer.PAD``.
    """

    def __init__(self, vocab_size=None, pad_index=None):
        super().__init__()
        # Fall back to the notebook-level vocabulary so SiameseNetwork()
        # keeps working exactly as before.
        if vocab_size is None:
            vocab_size = len(word_index_tranformer)
        if pad_index is None:
            pad_index = word_index_tranformer.PAD
        self.pad_index = pad_index

        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=300,padding_idx=pad_index)
        # Bidirectional encoder: output feature size is 2 * 256.
        self.gru1 = nn.GRU(input_size=300,hidden_size=256,num_layers=2,batch_first=True,bidirectional=True)
        # Composition GRU over [encoded ; aligned] features (2 * 512 = 256 * 4).
        self.gru2 = nn.GRU(input_size=256*4,hidden_size=256,num_layers=1,batch_first=True,bidirectional=False)
        # Classifier over the pooled features of both sides (2 * (256 + 256)).
        self.dnn = nn.Sequential(
            nn.Linear(256*4,256),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256,256),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 3)
        )


    def forward(self, input1, input2):
        """Return log-probabilities over the 3 classes, shape (batch, 3).

        ``input1`` and ``input2`` are (batch, seq_len) tensors of token
        indices. Both must share the same ``seq_len`` because the aligned
        features are concatenated position-wise with the encoder outputs.
        """
        # True at padded positions.
        mask1 = input1.eq(self.pad_index)
        mask2 = input2.eq(self.pad_index)
        input1 = self.embedding(input1)
        input2 = self.embedding(input2)
        output1,_ = self.gru1(input1)
        output2,_ = self.gru1(input2)

        output1_align, output2_align = self.soft_attention_align(output1, output2, mask1, mask2)
        output1 = torch.cat([output1, output1_align], 2)
        output2 = torch.cat([output2, output2_align], 2)

        gru2_output1,_ = self.gru2(output1)
        gru2_output2,_ = self.gru2(output2)

        output1_pooled = self.apply_pooling(gru2_output1)
        output2_pooled = self.apply_pooling(gru2_output2)
        out = torch.cat([output1_pooled, output2_pooled], dim=-1)
        out = self.dnn(out)

        return F.log_softmax(out, dim=-1)


    def apply_pooling(self, output):
        """Average- and max-pool ``output`` over the sequence dimension.

        ``output`` is (batch, seq_len, features); the result is
        (batch, 2 * features) with the average part first.
        """
        avg_pooled = F.avg_pool1d(output.transpose(1,2), kernel_size=output.size(1)).squeeze(-1)
        max_pooled = F.max_pool1d(output.transpose(1,2), kernel_size=output.size(1)).squeeze(-1)
        return torch.cat([avg_pooled, max_pooled], dim=-1)


    def soft_attention_align(self, x1, x2, mask1, mask2):
        """Compute dot-product attention between the two sequences.

        Args:
            x1: (batch, len1, dim) encoded first sequence.
            x2: (batch, len2, dim) encoded second sequence.
            mask1: (batch, len1) mask, True at padded positions of ``x1``.
            mask2: (batch, len2) mask, True at padded positions of ``x2``.

        Returns:
            ``(x1_output, x2_output)`` where ``x1_output`` is
            (batch, len2, dim), a mix of ``x1`` rows for every ``x2``
            position, and ``x2_output`` is (batch, len1, dim), a mix of
            ``x2`` rows for every ``x1`` position.
        """
        # NOTE(review): forward() concatenates x1_output (indexed by x2
        # positions) with the x1 encoding, and vice versa. That is only
        # shape-compatible because both inputs are padded to one length, and
        # it looks swapped relative to the usual ESIM wiring. Left unchanged
        # because existing checkpoints were trained with it; confirm before
        # retraining.
        attention_weight = x1.bmm(x2.transpose(1, 2))
        # Use the most negative finite value rather than -inf: padded
        # positions still get zero weight, but a fully padded (empty) query
        # no longer yields an all -inf row, which softmax turns into NaN.
        fill_value = torch.finfo(attention_weight.dtype).min

        x1_weight = F.softmax(attention_weight.masked_fill(mask2.bool().unsqueeze(1), fill_value), dim=-1)
        x2_output = x1_weight.bmm(x2)

        x2_weight = F.softmax(attention_weight.transpose(1, 2).masked_fill(mask1.bool().unsqueeze(1), fill_value), dim=-1)
        x1_output = x2_weight.bmm(x1)

        return x1_output, x2_output

## 说明
- 详见CHIP-STS

In [7]:
from tqdm import tqdm
from torch.optim import Adam


def train(epochs, model, model_path=None, optimizer_path=None, device=None, data_loader=None, lr=0.001):
    """Train ``model`` with Adam and NLL loss, optionally saving the result.

    Args:
        epochs: number of passes over the training data.
        model: network called as ``model(text1, text2)`` and returning
            log-probabilities.
        model_path: if given, the model ``state_dict`` is saved here after
            the last epoch.
        optimizer_path: if given, the optimizer ``state_dict`` is saved here
            after the last epoch.
        device: if given, the model and every batch are moved to it.
        data_loader: iterable of ``(label, text1, text2)`` batches. Defaults
            to the module-level ``train_data_loader``.
        lr: Adam learning rate.
    """
    if data_loader is None:
        data_loader = train_data_loader
    # Guarded like the per-batch moves below, so device=None leaves the
    # model where it is.
    if device is not None:
        model = model.to(device)
    model.train()
    optimizer = Adam(model.parameters(), lr=lr)
    for _ in tqdm(range(epochs), desc="Train"):
        for label, text1, text2 in data_loader:
            if device is not None:
                label = label.to(device)
                text1 = text1.to(device)
                text2 = text2.to(device)
            optimizer.zero_grad()
            output = model(text1, text2)
            # The model already outputs log-probabilities, hence nll_loss.
            loss = F.nll_loss(output, label)
            loss.backward()
            optimizer.step()

    if model_path is not None:
        torch.save(model.state_dict(), model_path)
    if optimizer_path is not None:
        torch.save(optimizer.state_dict(), optimizer_path)

In [8]:
def evaluation_accuracy(model, test_data_loader, device=None):
    """Compute the classification accuracy of ``model`` on a labelled loader.

    Args:
        model: network called as ``model(text1, text2)`` and returning
            per-class scores of shape (batch, classes).
        test_data_loader: iterable of ``(label, text1, text2)`` batches; any
            batch size is supported.
        device: if given, the model and every batch are moved to it.

    Returns:
        Fraction of correctly classified samples, or 0.0 if the loader
        yields no samples. Also prints ``correct/total``.
    """
    count_correct = 0
    count_total = 0
    if device is not None:
        model = model.to(device)
    model.eval()
    with torch.no_grad():
        # Iterate the loader that was passed in, not a module-level one.
        for label, text1, text2 in tqdm(test_data_loader, desc="Evaluation"):
            if device is not None:
                label = label.to(device)
                text1 = text1.to(device)
                text2 = text2.to(device)
            # argmax over the class dimension keeps one prediction per sample,
            # so batches larger than one are scored correctly.
            prediction = model(text1, text2).argmax(dim=-1)
            count_correct += int((prediction == label).sum().item())
            count_total += int(label.size(0))
    print(f"\n{count_correct}/{count_total}")
    if count_total == 0:
        return 0.0
    return count_correct / count_total

In [9]:
# Set to True to retrain from scratch; otherwise the saved checkpoint is loaded.
train_mode = False
model = SiameseNetwork()
# Forward slashes avoid the invalid "\Q" escape sequence of the previous
# literals and work on both Windows and POSIX.
model_path = "models/QQR_SiameseNetwork.pth"
optimizer_path = "models/QQR_SiameseNetwork_optim.pth"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if train_mode:
    train(epochs=35, model=model, model_path=model_path, optimizer_path=optimizer_path, device=device)
else:
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))

print('\n' + str(evaluation_accuracy(model, dev_data_loader, device=device)))

Evaluation: 100%|██████████| 1600/1600 [00:16<00:00, 98.61it/s] 
1080/1600

0.675



In [10]:
import os

# Forward slashes avoid the invalid "\K" escape sequence of the previous
# literal and work on both Windows and POSIX.
dump_file_path = "result/KUAKE-QQR_test.json"
with open(test_data_path,'r',encoding="utf-8") as source:
    data = json.load(source)

new_data = list()
model = model.to(device)
model.eval()
with torch.no_grad():
    # Relies on test_data_loader yielding one sample per batch in file order,
    # so that `index` lines up with the raw records in `data`.
    for index, (text1, text2) in tqdm(enumerate(test_data_loader), desc="Evaluation", total=len(test_data_loader)):
        if device is not None:
            text1 = text1.to(device)
            text2 = text2.to(device)
        # NOTE(review): records are written with "query"/"title" keys although
        # the input records use "query1"/"query2" - confirm this matches the
        # expected submission schema.
        t_dict = dict()
        t_dict["id"] = data[index]["id"]
        t_dict["query"] = data[index]["query1"]
        t_dict["title"] = data[index]["query2"]
        t_dict["label"] = str(model(text1, text2).argmax().item())
        new_data.append(t_dict)

# Serialize once after the loop instead of re-encoding the growing list on
# every iteration; this also defines json_result when the test set is empty.
json_result = json.dumps(new_data, ensure_ascii=False)

os.makedirs(os.path.dirname(dump_file_path), exist_ok=True)
with open(dump_file_path,'w',encoding="utf-8") as destination:
    destination.write(json_result)

Evaluation: 100%|██████████| 1596/1596 [00:17<00:00, 93.14it/s]
