# Load the Pretrained Model and the Dataset

In [1]:
import sys

# Make parent directories importable for the later cells. "../.." is the
# directory that holds `assets` (later cells import `assets.utils` and unpack
# the checkpoint into ../../assets/). "../../../" is presumably the repository
# root that provides `trustai` -- TODO confirm.
sys.path.extend(["../..", "../../../"])

In [2]:
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

# Pretrained checkpoint name. It is shared by the model and the tokenizer and
# is also interpolated into the fine-tuned checkpoint directory further down.
MODEL_NAME = "ernie-1.0"

# Two output classes, matching the {0: negative, 1: positive} label map used
# by the prediction cells.
model = ErnieForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_classes=2,
)
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-05-17 16:55:14,795] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams[0m
W0517 16:55:14.798794 27221 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.4, Runtime API Version: 10.2
W0517 16:55:14.804426 27221 device_context.cc:465] device: 0, cuDNN Version: 8.2.
[32m[2022-05-17 16:55:20,053] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/vocab.txt[0m


In [3]:
from paddlenlp.datasets import load_dataset

# Dataset identifier. It is also interpolated into the fine-tuned checkpoint
# directory name used by the training and loading cells.
DATASET_NAME = "chnsenticorp"

# Request all three splits in one call and unpack them in the order requested.
train_ds, dev_ds, test_ds = load_dataset(
    DATASET_NAME,
    splits=["train", "dev", "test"],
)

# Prepare the Model
## Train the model

In [None]:
from assets.utils import training_model

# Run the project's training helper on the train/dev splits. The output goes to
# ../../assets/<dataset>-<model>, the same directory the "load the trained
# model" cell reads `model_state.pdparams` from.
training_model(
    model,
    tokenizer,
    train_ds,
    dev_ds,
    save_dir=f'../../assets/{DATASET_NAME}-{MODEL_NAME}',
)


## Or Load the trained model

In [4]:
# Load the trained model.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
!tar -xvf ./chnsenticorp-ernie-1.0.tar -C ../../assets/
!rm ./chnsenticorp-ernie-1.0.tar

state_dict = paddle.load(f'../../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

--2022-05-17 16:55:20--  https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399595520 (381M) [application/x-tar]
Saving to: ‘chnsenticorp-ernie-1.0.tar’


2022-05-17 16:55:24 (97.9 MB/s) - ‘chnsenticorp-ernie-1.0.tar’ saved [399595520/399595520]

chnsenticorp-ernie-1.0/
chnsenticorp-ernie-1.0/tokenizer_config.json
chnsenticorp-ernie-1.0/vocab.txt
chnsenticorp-ernie-1.0/model_state.pdparams
chnsenticorp-ernie-1.0/model_config.json


# See the prediction results

In [5]:
from assets.utils import predict

# A single review used as the running example; the interpretation cells
# further down reuse this same `test_data` list.
test_data = [{'text': '本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差'}]

# Class id -> human-readable label name, passed to `predict`.
label_map = {0: 'negative', 1: 'positive'}

batch_size = 32

results = predict(
    model, test_data, tokenizer, label_map, batch_size=batch_size)

# Each element of test_data is a {'text': ...} dict, so the whole record is
# printed next to its predicted label.
for idx, example in enumerate(test_data):
    print('Data: {} \t Label: {}'.format(example, results[idx]))

Data: {'text': '本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差'} 	 Lable: negative


# Prepare for Interpretations

In [6]:
from functools import partial

from paddlenlp.data import Stack, Tuple, Pad

from assets.utils import create_dataloader, convert_example


# One example per batch. Presumably required so the interpreter obtains one
# gradient per training example -- TODO confirm against GradientSimilarityModel.
batch_size = 1
max_seq_length = 128

# Pre-bind the tokenizer and length limit so the loader can apply the
# conversion to each raw example.
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
)


def batchify_fn(
    samples,
    fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64"),  # label
    ),
):
    """Collate a list of samples into a list of batched fields.

    The default `fn` pads the input and segment fields with the tokenizer's
    pad ids and stacks the labels as int64.
    """
    return list(fn(samples))


# shuffle=False keeps the loader in dataset order; the result-printing cells
# index `train_ds.data` with the example ids returned by the interpreter, so
# that order presumably has to match -- TODO confirm.
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func,
    shuffle=False,
)

# Gradient similarity Interpreter

In [7]:
from trustai.interpretation.example_level.method.gradient_similarity import GradientSimilarityModel

# `classifier_layer_name` is the name of the model's last output layer.
# No cached training gradients are supplied (cached_train_grad=None); the
# recorded output shows the constructor then reports "Extracting gradient for
# given dataloader", which can take a while.
grad_sim = GradientSimilarityModel(
    model,
    train_data_loader,
    classifier_layer_name='classifier',
    cached_train_grad=None,
)

Extracting gradient for given dataloader, it will take some time...


In [8]:
from assets.utils import create_dataloader_from_scratch, predict

label_map = {0: "negative", 1: "positive"}
test_predict_labels = predict(model, test_data, tokenizer, label_map)

# Store the model's own prediction on each test record as an integer label:
# "negative" -> 0, anything else -> 1.
for position, record in enumerate(test_data):
    record["label"] = 0 if test_predict_labels[position] == "negative" else 1

# process text to model input
test_dataloader = create_dataloader_from_scratch(test_data, tokenizer, with_label=True)


# Rank training examples by cosine similarity.
sim_fn = "cos"
sample_num = 3
predict_labels, most_sim_examples = grad_sim.interpret(
    test_dataloader,
    sample_num=sample_num,
    sim_fn=sim_fn,
)

# For each test record, most_sim_examples[position] is read as
# [0] -> most similar, [1] -> most dissimilar; its entries are used as
# indices into train_ds.data.
for position, record in enumerate(test_data):
    print("test data")
    print(f"text: {record['text']}\tpredict label: {predict_labels[position]}")
    for heading, slot in (("most similar examples", 0), ("most dissimilar examples", 1)):
        print(heading)
        for train_index in most_sim_examples[position][slot]:
            train_record = train_ds.data[train_index]
            print(
                f"text: {train_record['text']}\tgold label: {train_record['label']}"
            )
    print()

Extracting gradient for given dataloader, it will take some time...
test data
text: 本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差	predict label: 0
most similar examples
text: 我选分期付款，在上海扣的款，在上海开的发票，东西却从北京用快递发出，真是舍近求远，害的我等了一星期才收到货，真是脑残！	gold label: 0
text: 看到评价那么高，就买了，但女儿不喜欢，我看了一下，也不喜欢，不知所云，汽车什么的，都是过时的，或生活中没有的。不明白，为什么有这么高的评价。	gold label: 0
text: 看到评价那么高，就买了，但女儿不喜欢，我看了一下，也不喜欢，不知所云，汽车什么的，都是过时的，或生活中没有的。不明白，为什么有这么高的评价。	gold label: 0
most dissimilar examples
text: 单位用户千万别买，支付太不方便。 一定要支票到帐才发货，结果给了支票，居然两天不到财务那里，找了一天才找到支票在那里。 18号下的订单，28号才送到，而且还是支付的现金，支票依然没有到帐。不知道那个单位可以忍受这样的服务。	gold label: 1
text: 这款机子，我没发现任何值得说好的地方！等待我不停的投诉吧！。。。（下面的评价不让多写字，写了很多发不了）	gold label: 1
text: 已经评过了，可是还要求我来评价，这是什么玩意啊？当当的一些功能特别的差劲！难道我买基本就得评价几次么？希望有人处理下这个功能，一点都不人性和智能！！！	gold label: 1



In [9]:
from assets.utils import create_dataloader_from_scratch

label_map = {0: "negative", 1: "positive"}
test_predict_labels = predict(model, test_data, tokenizer, label_map)

# Store the model's own prediction on each test record as an integer label:
# "negative" -> 0, anything else -> 1.
for position, record in enumerate(test_data):
    record["label"] = 0 if test_predict_labels[position] == "negative" else 1

# process text to model input
test_dataloader = create_dataloader_from_scratch(test_data, tokenizer, with_label=True)

# Same procedure as the cosine cell, but ranking by dot product.
sim_fn = "dot"
sample_num = 3
predict_labels, most_sim_examples = grad_sim.interpret(
    test_dataloader,
    sample_num=sample_num,
    sim_fn=sim_fn,
)

# For each test record, most_sim_examples[position] is read as
# [0] -> most similar, [1] -> most dissimilar; its entries are used as
# indices into train_ds.data.
for position, record in enumerate(test_data):
    print("test data")
    print(f"text: {record['text']}\tpredict label: {predict_labels[position]}")
    for heading, slot in (("most similar examples", 0), ("most dissimilar examples", 1)):
        print(heading)
        for train_index in most_sim_examples[position][slot]:
            train_record = train_ds.data[train_index]
            print(
                f"text: {train_record['text']}\tgold label: {train_record['label']}"
            )
    print()

Extracting gradient for given dataloader, it will take some time...
test data
text: 本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差	predict label: 0
most similar examples
text: 交通还比较方便,价格相对便宜,但是设施有些陈旧,没有网络可上网,服务生的素质相对较低些,如果重新装修在这个位置应该还是不错.	gold label: 0
text: 看评论说有卖到4299还送瑞星杀毒，就是不知道是何时，如果消息属实，那确实会有点郁闷！	gold label: 0
text: 答应当初买电脑送包，可到消灾快一个月了，还没有送，准备退货了，不诚信，以后买东西必须东西全才可以付款	gold label: 0
most dissimilar examples
text: 太一般了 网络慢的跟蜗牛一个等级 还高速呢 真晕 补充点评 2008年4月2日 ： 也没早餐 点评	gold label: 1
text: 自己去松江配送点拿的货，一打看就看到显示屏下方的有一处黑漆掉色严重．是帮朋友买的，郁闷中．	gold label: 1
text: 笔记本不错，京东不够厚道，钻石抢的和现在促销价没差几分，这不是忽悠我们钻石一族吗？？？	gold label: 1

