In [1]:
import sys

# Put the two ancestor directories at the front of the import search path so
# that project-local packages (e.g. `assets`, `trustai`) can be imported below.
# The deeper ancestor ends up first, exactly as with two successive front-inserts.
sys.path[:0] = ["../../../", "../.."]

# Load the Pretrained Model and the dataset

In [2]:
import numpy as np
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

# Identifier of the pretrained ERNIE checkpoint, shared by the classifier and
# the tokenizer created below.
MODEL_NAME = "ernie-1.0"

# Sequence classifier with a two-class output, followed by its tokenizer.
model = ErnieForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_classes=2,
)
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-04-25 16:26:44,563] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams[0m
W0425 16:26:44.565753 14179 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.4, Runtime API Version: 10.2
W0425 16:26:44.571143 14179 device_context.cc:465] device: 0, cuDNN Version: 8.2.
[32m[2022-04-25 16:26:49,199] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/vocab.txt[0m


In [3]:
from paddlenlp.datasets import load_dataset

# Name of the dataset to load; also reused later to build the checkpoint path.
DATASET_NAME = 'lcqmc'

# Request the three splits in one call and unpack them as train / dev / test.
train_ds, dev_ds, test_ds = load_dataset(
    DATASET_NAME,
    splits=["train", "dev", "test"],
)

# Prepare the Model

## Train the model

In [None]:
# Fine-tune the model and write the result under the shared assets directory.
# This only has to be executed once; afterwards the saved checkpoint can be
# loaded instead.

from assets.utils import training_model

# Output directory is derived from the dataset and model names.
trained_model_dir = f'../../assets/{DATASET_NAME}-{MODEL_NAME}'
training_model(model, tokenizer, train_ds, dev_ds, save_dir=trained_model_dir)

## Or Load the trained model


In [4]:
# Load the trained model.
# Download the released checkpoint archive. `-c` resumes a partial download and
# `--no-check-certificate` skips TLS certificate verification.
# NOTE(review): the archive is fetched without certificate verification and is
# then deserialized by paddle.load below -- confirm the host is trusted.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/lcqmc-ernie-1.0.tar
# Unpack into the shared assets directory, then delete the downloaded archive.
!tar -xvf ./lcqmc-ernie-1.0.tar -C ../../assets/
!rm ./lcqmc-ernie-1.0.tar

# Read the saved parameters and copy them into the in-memory model.
# The path matches the extracted `lcqmc-ernie-1.0/` folder only while
# DATASET_NAME and MODEL_NAME keep their current values.
state_dict = paddle.load(f'../../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

--2022-04-25 16:26:49--  https://trustai.bj.bcebos.com/lcqmc-ernie-1.0.tar
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399595520 (381M) [application/x-tar]
Saving to: ‘lcqmc-ernie-1.0.tar’


2022-04-25 16:26:53 (120 MB/s) - ‘lcqmc-ernie-1.0.tar’ saved [399595520/399595520]

lcqmc-ernie-1.0/
lcqmc-ernie-1.0/tokenizer_config.json
lcqmc-ernie-1.0/vocab.txt
lcqmc-ernie-1.0/model_state.pdparams
lcqmc-ernie-1.0/model_config.json


# See the prediction results

In [5]:
from assets.utils import predict

# Three query/title pairs together with their gold labels.
data = [
    {"query": '手机运行内存能扩展吗，怎么扩？', "title": '手机运行内存能扩展吗？如果有怎么扩展？', "label": 1},
    {"query": '宝宝起名五行缺什么？', "title": '测五行缺什么。起名？', "label": 1},
    {"query": '真空压缩袋怎么样？', "title": '真空压缩袋怎么卖？', "label": 0},
]

# Mapping from class id to the readable name handed to `predict`.
label_map = {0: 'negative', 1: 'positive'}
# Gold labels in the same order as `data`; used by the visualization cells.
true_labels = [1, 1, 0]
batch_size = 32
results = predict(model, data, tokenizer, label_map, batch_size=batch_size)

# Show every input example next to the prediction at the same position.
for idx, example in enumerate(data):
    predicted = results[idx]
    print('Data: {} \t Label: {}'.format(example, predicted))

Data: {'query': '手机运行内存能扩展吗，怎么扩？', 'title': '手机运行内存能扩展吗？如果有怎么扩展？', 'label': 1} 	 Label: positive
Data: {'query': '宝宝起名五行缺什么？', 'title': '测五行缺什么。起名？', 'label': 1} 	 Label: positive
Data: {'query': '真空压缩袋怎么样？', 'title': '真空压缩袋怎么卖？', 'label': 0} 	 Label: negative


# Prepare for Interpretations


In [6]:
from paddlenlp.data import Stack, Tuple, Pad

from assets.utils import convert_example
from trustai.interpretation.token_level.data_processor import VisualizationTextRecord, visualize_text
from trustai.interpretation.token_level.common import get_rationales_and_non_ratioanles

def preprocess_fn(data):
    """Convert one example or a list of examples into padded model inputs.

    Args:
        data: A single example or a list of examples. Each example is passed
            unchanged to ``convert_example`` (in this notebook: dicts with
            ``query`` and ``title`` keys). A non-list value is treated as a
            batch of one.

    Returns:
        tuple: ``(input_ids, segment_ids)`` as paddle tensors created with
        ``stop_gradient=False``.
    """
    # Work on a local name so the caller-visible argument is never rebound.
    batch = data if isinstance(data, list) else [data]

    examples = []
    for example in batch:
        input_ids, segment_ids = convert_example(example, tokenizer, max_seq_length=128, is_test=True)
        examples.append((input_ids, segment_ids))

    # One Pad per field. Token ids are padded with the pad *token* id, while
    # segment ids are padded with the pad *token type* id; the previous code
    # used pad_token_id for both fields.
    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    input_ids, segment_ids = batchify_fn(examples)
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(segment_ids, stop_gradient=False)

In [7]:
import jieba

from trustai.interpretation import get_word_offset

# Full text of every example with the special markers written out literally.
contexts = [
    "[CLS]" + example['query'] + "[SEP]" + example['title'] + "[SEP]"
    for example in data
]

# Word-level segmentation of the same text: jieba splits query and title,
# and the special markers are kept as single words.
batch_words = [
    ["[CLS]"] + list(jieba.cut(example['query'])) + ["[SEP]"] + list(jieba.cut(example['title'])) + ["[SEP]"]
    for example in data
]

# Character offsets of every word and of every tokenizer subword per context.
word_offset_maps = [
    get_word_offset(context, words)
    for context, words in zip(contexts, batch_words)
]
subword_offset_maps = [tokenizer.get_offset_mapping(context) for context in contexts]

# Print both offset maps of the first example for inspection.
print("\nword_offset_map:")
for word, (start, end) in zip(batch_words[0], word_offset_maps[0]):
    print(word, start, end)
print("\nsubword_offset_map:")
for subword, (start, end) in zip(tokenizer._tokenize(contexts[0]), subword_offset_maps[0]):
    print(subword, start, end)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/zhangshuai/miniconda3/envs/zs_py39/lib/python3.9/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp_stck7rw' -> '/tmp/jieba.cache'
Loading model cost 0.806 seconds.
Prefix dict has been built successfully.

word_offset_map:
[CLS] 0 5
手机 5 7
运行 7 9
内存 9 11
能 11 12
扩展 12 14
吗 14 15
， 15 16
怎么 16 18
扩 18 19
？ 19 20
[SEP] 20 25
手机 25 27
运行 27 29
内存 29 31
能 31 32
扩展 32 34
吗 34 35
？ 35 36
如果 36 38
有 38 39
怎么 39 41
扩展 41 43
？ 43 44
[SEP] 44 49

subword_offset_map:
[ 0 1
cl 1 3
##s 3 4
[UNK] 4 5
手 5 6
机 6 7
运 7 8
行 8 9
内 9 10
存 10 11
能 11 12
扩 12 13
展 13 14
吗 14 15
， 15 16
怎 16 17
么 17 18
扩 18 19
？ 19 20
[ 20 21
sep 21 24
[UNK] 24 25
手 25 26
机 26 27
运 27 28
行 28 29
内 29 30
存 30 31

# IG Interpreter


In [8]:
from trustai.interpretation.token_level import IntGradInterpreter
# Run the integrated-gradients interpreter on the batch with steps=100.
ig = IntGradInterpreter(model, device="gpu")
model_inputs = preprocess_fn(data)
result = ig(model_inputs, steps=100)

# Map the interpreter output onto the jieba words using the offset maps.
align_res = ig.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

# Prepare the records for visualization, one per aligned example.
recs = [
    VisualizationTextRecord(aligned, true_label=true_labels[idx])
    for idx, aligned in enumerate(align_res)
]
html = visualize_text(recs)


Golden Label,Predicted Label (Prob),Important scores
1.0,1 (1.00),[CLS] 手机 运行 内存 能 扩展 吗 ， 怎么 扩 ？ [SEP] 手机 运行 内存 能 扩展 吗 ？ 如果 有 怎么 扩展 ？ [SEP]
,,
1.0,1 (0.91),[CLS] 宝宝 起名 五行 缺什么 ？ [SEP] 测 五行 缺什么 。 起名 ？ [SEP]
,,
0.0,0 (0.98),[CLS] 真空 压缩 袋 怎么样 ？ [SEP] 真空 压缩 袋 怎么 卖 ？ [SEP]
,,


## Print the interpretation results

In [9]:
# Dump every dataclass field of the first raw result and of the first aligned
# result, each under its own heading.
first_records = (
    ("interpret result of first example:", result[0]),
    ("alignment result of first example:", align_res[0]),
)
for heading, record in first_records:
    print(heading)
    for field in record.__dataclass_fields__:
        print(field, ":", getattr(record, field))


interpret result of first example:
attributions : [ 9.8212082e-03  1.3883565e-01  1.3197863e-01  7.5971864e-02
  6.5383144e-02  7.6201648e-02  7.2356835e-02  5.9253935e-02
  3.6213800e-02  1.3086946e-02  3.0525709e-02  9.0835625e-03
  1.0339208e-02  2.9540770e-03  4.7282107e-02 -6.3666585e-03
 -2.5567221e-02  6.7195721e-02  3.3217926e-02  6.7754209e-02
  5.0005946e-02  2.9551987e-02  4.5666732e-02  2.9116267e-02
  2.4607677e-02  1.6491744e-04 -3.8130442e-05 -1.6418841e-02
  2.5379380e-02  3.3628747e-03  1.4054486e-02 -2.9295105e-02
 -1.7831795e-02 -5.8200909e-05 -2.1824688e-02 -4.4304539e-02
 -1.2704406e-02]
pred_label : 1
pred_proba : [0.00307661 0.9969234 ]
error_percent : -2.2277641
alignment result of first example:
words : ['[CLS]', '手机', '运行', '内存', '能', '扩展', '吗', '，', '怎么', '扩', '？', '[SEP]', '手机', '运行', '内存', '能', '扩展', '吗', '？', '如果', '有', '怎么', '扩展', '？', '[SEP]']
word_attributions : [0.009821208193898201, 0.27081428468227386, 0.1413550078868866, 0.14855848252773285, 0.05925

# LIME Interpreter

In [10]:
from trustai.interpretation.token_level import LIMEInterpreter
# Look up the ids of the [UNK] and [PAD] tokens and pass them to the
# LIME interpreter.
unk_id = tokenizer.convert_tokens_to_ids('[UNK]')
pad_id = tokenizer.convert_tokens_to_ids('[PAD]')
lime = LIMEInterpreter(model, device="gpu", unk_id=unk_id, pad_id=pad_id)

# Interpret the batch with num_samples=1000, then align to word level.
model_inputs = preprocess_fn(data)
result = lime(model_inputs, num_samples=1000)
align_res = lime.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

# Prepare the records for visualization, one per aligned example.
recs = [
    VisualizationTextRecord(aligned, true_label=true_labels[idx])
    for idx, aligned in enumerate(align_res)
]
html = visualize_text(recs)

Golden Label,Predicted Label (Prob),Important scores
1.0,1 (1.00),[CLS] 手机 运行 内存 能 扩展 吗 ， 怎么 扩 ？ [SEP] 手机 运行 内存 能 扩展 吗 ？ 如果 有 怎么 扩展 ？ [SEP]
,,
1.0,1 (0.91),[CLS] 宝宝 起名 五行 缺什么 ？ [SEP] 测 五行 缺什么 。 起名 ？ [SEP]
,,
0.0,0 (0.98),[CLS] 真空 压缩 袋 怎么样 ？ [SEP] 真空 压缩 袋 怎么 卖 ？ [SEP]
,,


# Attention Interpreter

In [11]:
from trustai.interpretation.token_level.common import attention_predict_fn_on_paddlenlp
from trustai.interpretation.token_level import AttentionInterpreter

# Attention-based interpreter; predictions go through the PaddleNLP-specific
# predict function imported above.
att = AttentionInterpreter(
    model,
    device="gpu",
    predict_fn=attention_predict_fn_on_paddlenlp,
)

# Interpret the batch, then align the output to word level.
model_inputs = preprocess_fn(data)
result = att(model_inputs)
align_res = att.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

# Prepare the records for visualization, one per aligned example.
recs = [
    VisualizationTextRecord(aligned, true_label=true_labels[idx])
    for idx, aligned in enumerate(align_res)
]
html = visualize_text(recs)

Golden Label,Predicted Label (Prob),Important scores
1.0,1 (1.00),[CLS] 手机 运行 内存 能 扩展 吗 ， 怎么 扩 ？ [SEP] 手机 运行 内存 能 扩展 吗 ？ 如果 有 怎么 扩展 ？ [SEP]
,,
1.0,1 (0.91),[CLS] 宝宝 起名 五行 缺什么 ？ [SEP] 测 五行 缺什么 。 起名 ？ [SEP]
,,
0.0,0 (0.98),[CLS] 真空 压缩 袋 怎么样 ？ [SEP] 真空 压缩 袋 怎么 卖 ？ [SEP]
,,


# GradShap Interpreter

In [12]:
from trustai.interpretation.token_level import GradShapInterpreter
# GradShap interpreter configured with n_samples=50 and noise_amount=0.1.
gradshap = GradShapInterpreter(
    model,
    device="gpu",
    n_samples=50,
    noise_amount=0.1,
)

# Interpret the batch, then align the output to word level.
model_inputs = preprocess_fn(data)
result = gradshap(model_inputs)
align_res = gradshap.alignment(
    result,
    contexts,
    batch_words,
    word_offset_maps,
    subword_offset_maps,
    special_tokens=["[CLS]", '[SEP]'],
)

# Prepare the records for visualization, one per aligned example.
recs = [
    VisualizationTextRecord(aligned, true_label=true_labels[idx])
    for idx, aligned in enumerate(align_res)
]
html = visualize_text(recs)

Golden Label,Predicted Label (Prob),Important scores
1.0,1 (1.00),[CLS] 手机 运行 内存 能 扩展 吗 ， 怎么 扩 ？ [SEP] 手机 运行 内存 能 扩展 吗 ？ 如果 有 怎么 扩展 ？ [SEP]
,,
1.0,1 (0.91),[CLS] 宝宝 起名 五行 缺什么 ？ [SEP] 测 五行 缺什么 。 起名 ？ [SEP]
,,
0.0,0 (0.98),[CLS] 真空 压缩 袋 怎么样 ？ [SEP] 真空 压缩 袋 怎么 卖 ？ [SEP]
,,
