# 『2022信通院兴智杯：深度学习模型可解释竞赛』- 文本相似度可解释性评测
## 1、项目介绍
深度学习模型在很多NLP任务上已经取得巨大成功，但其常被当作一个黑盒使用，内部预测机制对使用者是不透明的。这使得深度学习模型结果不被人信任，增加落地难度，尤其是在医疗、法律等特殊领域。同时，当模型出现效果不好或鲁棒性差等问题时，由于不了解其内部机制，导致很难对模型进行优化。近期，深度学习模型的可解释性被越来越多的人关注。但模型的可解释性评估还不够完善，本基线提供了文本相似度任务的评测数据和相关评测指标，旨在评估模型的可解释性。
## 2、基线运行


### 依赖安装
安装运行本基线所必需的依赖包。

In [None]:
!pip3 install paddlepaddle-gpu
!pip3 install -U paddlenlp==2.3.0

### 数据准备
#### 模型训练数据
中文文本相似度任务中，我们使用LCQMC数据集进行模型训练，使用paddlenlp框架自动缓存。
#### 下载预训练模型
使用paddlenlp框架自动缓存模型文件。
#### 其他数据下载
暂无。

### 一些初始化工作
初始化工作包括了模型选择及加载、训练数据集选择、模型存储路径设定、抽取证据的长度占原文本长度的比例设定等。可按需更改。

In [2]:
import sys
import json
import numpy as np
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
sys.path.append("..")
sys.path.append("../../")

# Name of the pretrained ERNIE checkpoint; used below for both the model and
# the tokenizer so the two always match.
MODEL_NAME = "ernie-3.0-base-zh" # choose from ["ernie-1.0", "ernie-1.0-base-zh", "ernie-1.0-large-zh-cw", "ernie-2.0-base-zh", "ernie-2.0-large-zh", "ernie-3.0-xbase-zh", "ernie-3.0-base-zh", "ernie-3.0-medium-zh", "ernie-3.0-mini-zh", "ernie-3.0-micro-zh", "ernie-3.0-nano-zh"]
# Dataset identifier handed to paddlenlp's load_dataset in the training cell.
DATASET_NAME = 'lcqmc'
# Directory passed to training_model as save_dir; the interpretation cells
# later load the trained parameters from inside this directory.
MODEL_SAVE_PATH = f'../assets/{DATASET_NAME}-{MODEL_NAME}'
# Fraction of each segment's tokens kept as the rationale: the evaluation-data
# cell keeps ceil(segment_length * RATIONALE_RATIO) tokens per segment.
RATIONALE_RATIO = 0.7


# Init model and tokenizer
# num_classes=2: text similarity is treated as binary classification.
model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[2022-06-28 12:59:01,224] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh.pdparams and saved to /home/aistudio/.paddlenlp/models/ernie-3.0-base-zh
[2022-06-28 12:59:01,226] [    INFO] - Downloading ernie_3.0_base_zh.pdparams from https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh.pdparams
100%|██████████| 452M/452M [00:06<00:00, 73.4MB/s] 
W0628 12:59:07.819039   182 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0628 12:59:07.823046   182 device_context.cc:465] device: 0, cuDNN Version: 7.6.
[2022-06-28 12:59:12,908] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt and saved to /home/aistudio/.paddlenlp/models/ernie-3.0-base-zh
[2022-06-28 12:59:12,911] [    INFO] - Downloading ernie_3.0_base_zh_vocab.txt from https://bj.bcebos.com/paddlenlp/models/transf

### 训练模型
这里以ERNIE-3.0为例训练一个文本相似度模型。

In [4]:
from paddlenlp.datasets import load_dataset
from assets.utils import training_model

# Load dataset
# Fetch the train/dev/test splits of DATASET_NAME through paddlenlp.
# NOTE(review): test_ds is loaded here but is not referenced by any other
# visible cell of this notebook.
train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"])

# Start training
# Fine-tune `model` with train_ds/dev_ds, using MODEL_SAVE_PATH as save_dir.
# training_model lives in assets.utils; presumably it evaluates on dev_ds and
# writes checkpoints under save_dir -- confirm against that helper.
training_model(model, tokenizer, train_ds, dev_ds, save_dir=MODEL_SAVE_PATH)

[2022-06-21 19:11:14,877] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh.pdparams
[2022-06-21 19:11:17,019] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh_vocab.txt


dataset labels: ['0', '1']
dataset examples:
{'query': '喜欢打篮球的男生喜欢什么样的女生', 'title': '爱打篮球的男生喜欢什么样的女生', 'label': 1}
{'query': '我手机丢了，我想换个手机', 'title': '我想买个新手机，求推荐', 'label': 1}
{'query': '大家觉得她好看吗', 'title': '大家觉得跑男好看吗？', 'label': 0}
{'query': '求秋色之空漫画全集', 'title': '求秋色之空全集漫画', 'label': 1}
{'query': '晚上睡觉带着耳机听音乐有什么害处吗？', 'title': '孕妇可以戴耳机听音乐吗?', 'label': 0}
Training Starts:
global step 100, epoch: 1, batch: 100, loss: 0.57603, acc: 0.55437
global step 200, epoch: 1, batch: 200, loss: 0.55437, acc: 0.63703
global step 300, epoch: 1, batch: 300, loss: 0.31157, acc: 0.70583
global step 400, epoch: 1, batch: 400, loss: 0.23469, acc: 0.74852
global step 500, epoch: 1, batch: 500, loss: 0.24344, acc: 0.77219
global step 600, epoch: 1, batch: 600, loss: 0.37564, acc: 0.78682
global step 700, epoch: 1, batch: 700, loss: 0.44287, acc: 0.79746
global step 800, epoch: 1, batch: 800, loss: 0.42520, acc: 0.80582
global step 900, epoch: 1, batch: 900, loss: 0.37588, acc: 0.81118
global step 1000, ep

[2022-06-21 19:36:33,288] [    INFO] - tokenizer config file saved in ../assets/lcqmc-ernie-3.0-base-zh/tokenizer_config.json
[2022-06-21 19:36:33,291] [    INFO] - Special tokens file saved in ../assets/lcqmc-ernie-3.0-base-zh/special_tokens_map.json


global step 7500, epoch: 2, batch: 38, loss: 0.24792, acc: 0.89720
global step 7600, epoch: 2, batch: 138, loss: 0.05826, acc: 0.89855
global step 7700, epoch: 2, batch: 238, loss: 0.28852, acc: 0.89916
global step 7800, epoch: 2, batch: 338, loss: 0.26902, acc: 0.89858
global step 7900, epoch: 2, batch: 438, loss: 0.20164, acc: 0.89947
global step 8000, epoch: 2, batch: 538, loss: 0.29403, acc: 0.90027
global step 8100, epoch: 2, batch: 638, loss: 0.12226, acc: 0.90125
global step 8200, epoch: 2, batch: 738, loss: 0.18106, acc: 0.90058
global step 8300, epoch: 2, batch: 838, loss: 0.12764, acc: 0.89898
global step 8400, epoch: 2, batch: 938, loss: 0.13077, acc: 0.89829
global step 8500, epoch: 2, batch: 1038, loss: 0.25547, acc: 0.89800
global step 8600, epoch: 2, batch: 1138, loss: 0.19892, acc: 0.89859
global step 8700, epoch: 2, batch: 1238, loss: 0.41702, acc: 0.89812
global step 8800, epoch: 2, batch: 1338, loss: 0.18125, acc: 0.89822
global step 8900, epoch: 2, batch: 1438, loss

[2022-06-21 20:01:19,573] [    INFO] - tokenizer config file saved in ../assets/lcqmc-ernie-3.0-base-zh/tokenizer_config.json
[2022-06-21 20:01:19,575] [    INFO] - Special tokens file saved in ../assets/lcqmc-ernie-3.0-base-zh/special_tokens_map.json


global step 15000, epoch: 3, batch: 76, loss: 0.23220, acc: 0.91118
global step 15100, epoch: 3, batch: 176, loss: 0.07116, acc: 0.91033
global step 15200, epoch: 3, batch: 276, loss: 0.12253, acc: 0.91214
global step 15300, epoch: 3, batch: 376, loss: 0.33100, acc: 0.91240
global step 15400, epoch: 3, batch: 476, loss: 0.18419, acc: 0.91170
global step 15500, epoch: 3, batch: 576, loss: 0.31150, acc: 0.91151
global step 15600, epoch: 3, batch: 676, loss: 0.07493, acc: 0.91207
global step 15700, epoch: 3, batch: 776, loss: 0.13670, acc: 0.91096
global step 15800, epoch: 3, batch: 876, loss: 0.22541, acc: 0.91067
global step 15900, epoch: 3, batch: 976, loss: 0.18129, acc: 0.91028
global step 16000, epoch: 3, batch: 1076, loss: 0.20111, acc: 0.91081
global step 16100, epoch: 3, batch: 1176, loss: 0.19327, acc: 0.91013
global step 16200, epoch: 3, batch: 1276, loss: 0.18196, acc: 0.91022
global step 16300, epoch: 3, batch: 1376, loss: 0.20761, acc: 0.91022
global step 16400, epoch: 3, ba

[2022-06-21 20:26:03,057] [    INFO] - tokenizer config file saved in ../assets/lcqmc-ernie-3.0-base-zh/tokenizer_config.json
[2022-06-21 20:26:03,060] [    INFO] - Special tokens file saved in ../assets/lcqmc-ernie-3.0-base-zh/special_tokens_map.json


best accuracy is 0.902977!


### 重要度分数获取
从这一步开始，我们对训练好的模型进行可解释性分析。首先需要获取评测数据集中每条数据的重要性分数。这里先将训练好的模型参数及评测数据的存储路径（MODEL_PATH 和 DATA_PATH）修改为实际路径，然后加载训练好的模型和评测数据集，并对数据做一些预处理。

In [6]:
import json
import numpy as np
from trustai.interpretation.token_level import IntGradInterpreter
from assets.utils import convert_example, load_data
from paddlenlp.data import Stack, Tuple, Pad
from trustai.interpretation import get_word_offset
from assets.utils import predict, load_data

# Correct MODEL_PATH and DATA_PATH before executing
# MODEL_PATH: parameter file restored with paddle.load further down; by
# default it points inside the directory used as save_dir during training.
MODEL_PATH = MODEL_SAVE_PATH + '/model_state.pdparams'
# DATA_PATH: evaluation file read by load_data. This absolute path is specific
# to the environment the notebook was recorded in and must be adapted.
DATA_PATH = '/home/aistudio/TrustAI/tutorials/assets/sim_interpretation_A.txt'

# Function to process data
def preprocess_fn(data):
    """Tokenize raw examples and batch them into padded paddle tensors.

    Args:
        data: An iterable of examples, or a dict whose values are the
            examples. Each example is forwarded unchanged to
            ``convert_example`` together with the module-level ``tokenizer``.

    Returns:
        A ``(input_ids, segment_ids)`` tuple of paddle tensors built from the
        padded batch, both created with ``stop_gradient=False``.
    """
    # A dict is reduced to its values (in insertion order); keys are dropped.
    if isinstance(data, dict):
        data = list(data.values())

    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(text, tokenizer, max_seq_length=128, is_test=True)
        examples.append((input_ids, segment_ids))

    # Pad each field with its own pad value: token ids with pad_token_id and
    # segment (token type) ids with pad_token_type_id. The original padded
    # both with pad_token_id, which is only correct while the two coincide.
    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    input_ids, segment_ids = batchify_fn(examples)
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(segment_ids, stop_gradient=False)

# Restore the fine-tuned weights into the model built earlier
state_dict = paddle.load(MODEL_PATH)
model.set_dict(state_dict)

# Read the evaluation examples
data = load_data(DATA_PATH)
print("Num of data:", len(data))

# For every example build two views of the same "[CLS] query [SEP] title [SEP]"
# sequence: the raw concatenated string and the reference (standard) token split
contexts = []
standard_split = []
for example in data.values():
    joined_text = "[CLS]" + example['query'] + "[SEP]" + example['title'] + "[SEP]"
    reference_tokens = ["[CLS]"] + example['text_q_seg'] + ["[SEP]"] + example['text_t_seg'] + ["[SEP]"]
    contexts.append(joined_text)
    standard_split.append(reference_tokens)

# Offset maps of the tokenizer's own tokens and of the reference split tokens,
# computed per example; both are needed later to align importance scores
ori_offset_maps = []
standard_split_offset_maps = []
for context, reference_tokens in zip(contexts, standard_split):
    ori_offset_maps.append(tokenizer.get_offset_mapping(context))
    standard_split_offset_maps.append(get_word_offset(context, reference_tokens))

Num of data: 1712


接下来，我们可以选取任何一种解释方法，去获取重要性分数。这里提供attention，IG，以及LIME这三种作为示例。实际使用时选择其中一种运行即可。

#### Attention Interpreter获取重要性分数
用attention获取评测数据集上数据的重要性分数。

In [5]:
from trustai.interpretation.token_level.common import attention_predict_fn_on_paddlenlp
from trustai.interpretation.token_level import AttentionInterpreter
from assets.utils import create_dataloader_from_scratch

# Build the attention-based interpreter around the fine-tuned model
att = AttentionInterpreter(model, device="gpu", predict_fn=attention_predict_fn_on_paddlenlp)

# Interpret the evaluation data batch by batch and chain the per-batch results
interp_results = None
eval_loader = create_dataloader_from_scratch(list(data.values()), tokenizer, 8)
for batch in eval_loader:
    batch_results = att(batch)
    if not interp_results:
        interp_results = batch_results
    else:
        interp_results += batch_results

# Project the scores onto the reference split tokens so the later evaluation
# step indexes them consistently
align_res = att.alignment(
    interp_results,
    contexts,
    standard_split,
    standard_split_offset_maps,
    ori_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

#### IG Interpreter获取重要性分数
用IG的方法获取评测数据集上数据的重要性分数，这一步会消耗相对长的时间。

In [8]:
from trustai.interpretation.token_level import IntGradInterpreter
from assets.utils import create_dataloader_from_scratch
# Hyperparameters
# Value forwarded to the interpreter as `steps` on every call
IG_STEP = 100

# Build the integrated-gradients interpreter around the fine-tuned model
ig = IntGradInterpreter(model, device="gpu")

# Interpret the evaluation data batch by batch and chain the per-batch results
interp_results = None
eval_loader = create_dataloader_from_scratch(list(data.values()), tokenizer, 8)
for batch in eval_loader:
    batch_results = ig(batch, steps=IG_STEP)
    if not interp_results:
        interp_results = batch_results
    else:
        interp_results += batch_results

# Project the scores onto the reference split tokens so the later evaluation
# step indexes them consistently
align_res = ig.alignment(
    interp_results,
    contexts,
    standard_split,
    standard_split_offset_maps,
    ori_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

#### LIME Interpreter获取重要性分数
用LIME的方法获取评测数据集上数据的重要性分数，这一步会消耗相对长的时间。

In [4]:

from trustai.interpretation.token_level import LIMEInterpreter
from assets.utils import create_dataloader_from_scratch
# Hyperparameters
# Value forwarded to the interpreter as `num_samples` on every call
LIME_SAMPLES = 1000

# Build the LIME interpreter; it is given the ids of the [UNK] and [PAD] tokens
unk_token_id = tokenizer.convert_tokens_to_ids('[UNK]')
pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')
lime = LIMEInterpreter(model, device="gpu", unk_id=unk_token_id, pad_id=pad_token_id)

# Interpret the evaluation data batch by batch and chain the per-batch results
interp_results = None
eval_loader = create_dataloader_from_scratch(list(data.values()), tokenizer, 8)
for batch in eval_loader:
    batch_results = lime(batch, num_samples=LIME_SAMPLES)
    if not interp_results:
        interp_results = batch_results
    else:
        interp_results += batch_results

# Project the scores onto the reference split tokens so the later evaluation
# step indexes them consistently
align_res = lime.alignment(
    interp_results,
    contexts,
    standard_split,
    standard_split_offset_maps,
    ori_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

### 生成用于评估的数据

In [5]:
import math

# Re-sort the token index according to their importance scores
def resort(index_array, importance_score):
    """Order the given indices from most to least important.

    Args:
        index_array: Iterable of indices into ``importance_score``.
        importance_score: Indexable container of scores.

    Returns:
        A list with the elements of ``index_array`` sorted by descending
        score; indices with equal scores keep their incoming order.
    """
    return sorted(index_array, key=lambda position: importance_score[position], reverse=True)

# Post-prepare the result data so that it can be used for the evaluation directly
def _top_rationale_indices(scores, num_tokens, ratio):
    """Return the positions of the highest-scoring tokens, best first."""
    top_k = max(0, math.ceil(num_tokens * ratio))
    # A stable sort on the negated scores orders by descending score and
    # breaks ties by the lower index, so the output is deterministic.
    ranking = np.argsort(-np.asarray(scores, dtype=float), kind="stable")
    # Slicing clamps top_k to the number of available scores and yields an
    # empty list for an empty segment instead of raising.
    return ranking[:top_k].tolist()


def prepare_eval_data(data, results, paddle_model, rationale_ratio=None):
    """Turn aligned interpretation results into per-example rationale records.

    Each result's ``word_attributions`` is expected to follow the layout
    ``[CLS] query-tokens [SEP] title-tokens [SEP]``; the query and title
    scores are sliced out of it using the length of ``text_q_seg``.

    Args:
        data: Mapping from example id to a dict with at least the keys
            ``'text_q_seg'`` and ``'text_t_seg'`` (the reference token lists).
        results: Iterable of interpretation results, in the same order as the
            keys of ``data``. Each item exposes ``word_attributions`` and
            ``pred_label``. Pairing uses ``zip``, so surplus items on either
            side are ignored.
        paddle_model: Unused; kept so existing callers keep working.
        rationale_ratio: Fraction of each segment's tokens to keep. Defaults
            to the module-level ``RATIONALE_RATIO`` when ``None``.

    Returns:
        A dict mapping each example id to a dict with the keys ``'id'``,
        ``'pred_label'``, ``'rationale_q'`` and ``'rationale_t'``. The two
        rationale entries are lists of plain ``int`` token positions (relative
        to their own segment), ordered from most to least important and
        containing ``ceil(segment_length * rationale_ratio)`` entries.
    """
    if rationale_ratio is None:
        rationale_ratio = RATIONALE_RATIO

    res = {}
    for data_id, inter_res in zip(data, results):
        example = data[data_id]
        num_query_tokens = len(example['text_q_seg'])
        num_title_tokens = len(example['text_t_seg'])

        attributions = inter_res.word_attributions
        # Skip the leading [CLS]; the query occupies the next num_query_tokens slots.
        query_scores = attributions[1:num_query_tokens + 1]
        # Skip the [SEP] after the query and drop the trailing [SEP].
        title_scores = attributions[num_query_tokens + 2:-1]

        res[data_id] = {
            'id': data_id,
            'pred_label': inter_res.pred_label,
            'rationale_q': _top_rationale_indices(query_scores, num_query_tokens, rationale_ratio),
            'rationale_t': _top_rationale_indices(title_scores, num_title_tokens, rationale_ratio),
        }
    return res

# Generate results for evaluation
predicts = prepare_eval_data(data, align_res, model)

# Write one tab-separated line per example:
#   id <TAB> predicted label <TAB> query rationale <TAB> title rationale
# where each rationale is a comma-separated list of token indices.
# The context manager closes the file even if a write fails, and ','.join
# copes with an empty rationale (the manual "[:-1] then [-1]" pattern raised
# IndexError in that case). Output for non-empty rationales is unchanged.
with open('./sim_rationale.txt', 'w', encoding='utf-8') as out_file:
    for eval_data in predicts.values():
        fields = (
            str(eval_data['id']),
            str(eval_data['pred_label']),
            ','.join(str(idx) for idx in eval_data['rationale_q']),
            ','.join(str(idx) for idx in eval_data['rationale_t']),
        )
        out_file.write('\t'.join(fields) + '\n')