# Load the Pretrained Model and the dataset
We use ernie-3.0-base-zh as the model and chnsenticorp as the dataset for example. More models can be found in [PaddleNLP Model Zoo](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html#transformer).

Obviously, PaddleNLP is needed to run this notebook, which is easy to install:
```bash
pip install setuptools_scm 
pip install --upgrade paddlenlp
```

In [1]:
import paddle
import paddlenlp
from assets.ernie import ErnieForSequenceClassification
from paddlenlp.transformers import ErnieTokenizer

MODEL_NAME = "ernie-3.0-base-zh"

model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
[32m[2022-07-05 19:57:13,681] [    INFO][0m - Already cached /root/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh.pdparams[0m
W0705 19:57:13.684257 149389 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0705 19:57:13.688616 149389 gpu_context.cc:306] device: 0, cuDNN Version: 8.1.
[32m[2022-07-05 19:57:23,465] [    INFO][0m - Already cached /root/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh_vocab.txt[0m
[32m[2022-07-05 19:57:23,518] [    INFO][0m - tokenizer config file saved in /root/.paddlenlp/models/ernie-3.0-base-zh/tokenizer_config.json[0m
[32m[2022-07-05 19:57:23,520] [    INFO][0m - Special tokens file saved in /root/.paddlenlp/models/ernie-3.0-base-zh/special_tokens_map.json[0m


In [3]:
from paddlenlp.datasets import load_dataset
DATASET_NAME = 'chnsenticorp'
train_ds, dev_ds, test_ds = load_dataset(
    DATASET_NAME, splits=["train", "dev", "test"]
)

# Prepare the Model

## Train the model

In [3]:
# training the model and save to save_dir
# only needs to run once.
# total steps ~900 (3 epochs)

from assets.utils import training_model
training_model(model, tokenizer, train_ds, dev_ds, save_dir=f'assets/{DATASET_NAME}-{MODEL_NAME}')

# global step 900, epoch: 3, batch: 300, loss: 0.00739, acc: 0.98438
# eval loss: 0.19582, accu: 0.94750

dataset labels: ['0', '1']
dataset examples:
{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'label': 1, 'qid': ''}
{'text': '15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错', 'label': 1, 'qid': ''}
{'text': '房间太小。其他的都一般。。。。。。。。。', 'label': 0, 'qid': ''}
{'text': '1.接电源没有几分钟,电源适配器热的不行. 2.摄像头用不起来. 3.机盖的钢琴漆，手不能摸，一摸一个印. 4.硬盘分区不好办.', 'label': 0, 'qid': ''}
{'text': '今天才知道这书还有第6卷,真有点郁闷:为什么同一套书有两种版本呢?当当网是不是该跟出版社商量商量,单独出个第6卷,让我们的孩子不会有所遗憾。', 'label': 1, 'qid': ''}
Training Starts:
global step 100, epoch: 1, batch: 100, loss: 0.13706, acc: 0.80875
global step 200, epoch: 1, batch: 200, loss: 0.43083, acc: 0.85531
global step 300, epoch: 1, batch: 300, loss: 0.09329, acc: 0.87771
eval loss: 0.18998, accu: 0.93500


[32m[2022-07-04 19:10:15,094] [    INFO][0m - tokenizer config file saved in assets/chnsenticorp-ernie-3.0-base-zh/tokenizer_config.json[0m
[32m[2022-07-04 19:10:15,096] [    INFO][0m - Special tokens file saved in assets/chnsenticorp-ernie-3.0-base-zh/special_tokens_map.json[0m


## Or Load the trained model

In [4]:
# Load the trained model.
state_dict = paddle.load(f'assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

# Prepare for Interpretations

In [6]:
import interpretdl as it
import numpy as np
from assets.utils import convert_example, aggregate_subwords_and_importances
from paddlenlp.data import Stack, Tuple, Pad
from interpretdl.data_processor.visualizer import VisualizationTextRecord, visualize_text

def preprocess_fn(data):
    examples = []
    
    if not isinstance(data, list):
        data = [data]
    
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=128,
            is_test=True
        )
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)
    
    input_ids, segment_ids = batchify_fn(examples)
    return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(segment_ids, stop_gradient=False)

## BT Interpreter

### Token-wise

In [5]:
from assets.utils import predict

data = [
    {"text":'这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般'},
#     {"text":'怀着十分激动的心情放映，可是看着看着发现，在放映完毕后，出现一集米老鼠的动画片'},
#     {"text":'作为老的四星酒店，房间依然很整洁，相当不错。机场接机服务很好，可以在车上办理入住手续，节省时间。'},
]

label_map = {0: 'negative', 1: 'positive'}

batch_size = 32

results = predict(
    model, data, tokenizer, label_map, batch_size=batch_size)

for idx, text in enumerate(data):
    print('Data: {} \t Lable: {}'.format(text, results[idx]))

Data: {'text': '这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般'} 	 Lable: negative


In [6]:
bt = it.BTNLPInterpreter(model, device='gpu:0')

interp_class = 0
pred = model(*preprocess_fn(data))
subword_importances = bt.interpret(
    ap_mode="token",
    data=preprocess_fn(data),
    label=interp_class,
    start_layer=9)

true_label = 0
recs = []

subwords = " ".join(tokenizer._tokenize(data[0]['text'])).split(' ')

words, word_importances = aggregate_subwords_and_importances(subwords, subword_importances[0])
word_importances = np.array(word_importances) / np.linalg.norm(
        word_importances)
    
recs.append(
        VisualizationTextRecord(words, word_importances, true_label,
                               np.argmax(pred), pred[0, np.argmax(pred)].item(), interp_class)
    )

visualize_text(recs)
# The visualization is not available at github

True Label,Predicted Label (Prob),Target Label,Word Importance
0.0,0 (1.37),0.0,这 个 宾 馆 比 较 陈 旧 了 ， 特 价 的 房 间 也 很 一 般 。 总 体 来 说 一 般
,,,


### Head-wise

In [7]:
interp_class = 0
pred = model(*preprocess_fn(data))
subword_importances = bt.interpret(
    data=preprocess_fn(data),
    label=interp_class,
    start_layer=11)

true_label = 0
recs = []

subwords = " ".join(tokenizer._tokenize(data[0]['text'])).split(' ')

words, word_importances = aggregate_subwords_and_importances(subwords, subword_importances[0])
word_importances = np.array(word_importances) / np.linalg.norm(
        word_importances)
    
recs.append(
        VisualizationTextRecord(words, word_importances, true_label,
                               np.argmax(pred), pred[0, np.argmax(pred)].item(), interp_class)
    )

visualize_text(recs)
# The visualization is not available at github

True Label,Predicted Label (Prob),Target Label,Word Importance
0.0,0 (1.37),0.0,这 个 宾 馆 比 较 陈 旧 了 ， 特 价 的 房 间 也 很 一 般 。 总 体 来 说 一 般
,,,


## GA Interpreter

In [8]:
from assets.utils import predict

data = [
    {"text":'这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般'},
#     {"text":'怀着十分激动的心情放映，可是看着看着发现，在放映完毕后，出现一集米老鼠的动画片'},
#     {"text":'作为老的四星酒店，房间依然很整洁，相当不错。机场接机服务很好，可以在车上办理入住手续，节省时间。'},
]

label_map = {0: 'negative', 1: 'positive'}

batch_size = 32

results = predict(
    model, data, tokenizer, label_map, batch_size=batch_size)

for idx, text in enumerate(data):
    print('Data: {} \t Lable: {}'.format(text, results[idx]))

Data: {'text': '这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般'} 	 Lable: negative


In [9]:
bt = it.GANLPInterpreter(model, device='gpu:0')

interp_class = 0
pred = model(*preprocess_fn(data))
subword_importances = bt.interpret(
    data=preprocess_fn(data),
    label=interp_class,
    start_layer=11)

true_label = 0
recs = []

subwords = " ".join(tokenizer._tokenize(data[0]['text'])).split(' ')

words, word_importances = aggregate_subwords_and_importances(subwords, subword_importances[0])
word_importances = np.array(word_importances) / np.linalg.norm(
        word_importances)
    
recs.append(
        VisualizationTextRecord(words, word_importances, true_label,
                               np.argmax(pred), pred[0, np.argmax(pred)].item(), interp_class)
    )

visualize_text(recs)

True Label,Predicted Label (Prob),Target Label,Word Importance
0.0,0 (1.37),0.0,这 个 宾 馆 比 较 陈 旧 了 ， 特 价 的 房 间 也 很 一 般 。 总 体 来 说 一 般
,,,
