# Load the Pretrained Model and the Dataset

In [1]:
import sys

# Prepend both parent directories to the module search path in a single step.
# The resulting order matches two successive insert(0, ...) calls:
# "../../../" is searched first, then "../..".
# Presumably this lets the `assets` and `trustai` imports in later cells
# resolve against this checkout -- confirm against the repository layout.
sys.path[:0] = ["../../../", "../.."]

In [2]:
import paddle
import paddlenlp
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer

# Identifier of the pretrained checkpoint; later cells also use it to build
# the path of the fine-tuned weights.
MODEL_NAME = "ernie-1.0"

# Sequence classifier created from the pretrained checkpoint with two classes.
model = ErnieForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_classes=2,
)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-06-27 00:31:37,791] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams[0m
W0627 00:31:37.793457 11209 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.4, Runtime API Version: 10.2
W0627 00:31:37.798871 11209 gpu_context.cc:306] device: 0, cuDNN Version: 8.2.
[32m[2022-06-27 00:31:48,109] [    INFO][0m - Already cached /home/zhangshuai/.paddlenlp/models/ernie-1.0/vocab.txt[0m


In [3]:
from paddlenlp.datasets import load_dataset

# Dataset identifier; later cells combine it with MODEL_NAME to locate the
# fine-tuned weights on disk.
DATASET_NAME = "chnsenticorp"

# Request the three splits in one call and unpack them in the order listed
# in `splits`.
train_ds, dev_ds, test_ds = load_dataset(
    DATASET_NAME,
    splits=["train", "dev", "test"],
)

INFO 2022-06-27 00:31:48,143 download.py:117] unique_endpoints {''}


# Prepare the Model
## Train the model

In [None]:
from assets.utils import training_model

# Optional step (the next section loads ready-made weights instead).
# `save_dir` points at ../../assets/<dataset>-<model>, the same directory the
# loading cell reads `model_state.pdparams` from.
training_model(
    model,
    tokenizer,
    train_ds,
    dev_ds,
    save_dir=f'../../assets/{DATASET_NAME}-{MODEL_NAME}',
)


## Or Load the trained model

In [4]:
# Load the trained model.
!wget --no-check-certificate -c https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
!tar -xvf ./chnsenticorp-ernie-1.0.tar -C ../../assets/
!rm ./chnsenticorp-ernie-1.0.tar

state_dict = paddle.load(f'../../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams')
model.set_dict(state_dict)

--2022-06-27 00:31:48--  https://trustai.bj.bcebos.com/chnsenticorp-ernie-1.0.tar
Resolving trustai.bj.bcebos.com (trustai.bj.bcebos.com)... 10.70.0.165
Connecting to trustai.bj.bcebos.com (trustai.bj.bcebos.com)|10.70.0.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399595520 (381M) [application/x-tar]
Saving to: ‘chnsenticorp-ernie-1.0.tar’


2022-06-27 00:31:51 (120 MB/s) - ‘chnsenticorp-ernie-1.0.tar’ saved [399595520/399595520]

chnsenticorp-ernie-1.0/
chnsenticorp-ernie-1.0/tokenizer_config.json
chnsenticorp-ernie-1.0/vocab.txt
chnsenticorp-ernie-1.0/model_state.pdparams
chnsenticorp-ernie-1.0/model_config.json


# See the prediction results

In [5]:
from assets.utils import predict

# One sample review to classify, wrapped in the {'text': ...} dict form that is
# passed to `predict` below.
test_data = [{'text': '本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差'}]

# Class index -> human-readable label name.
label_map = {0: 'negative', 1: 'positive'}

batch_size = 32

results = predict(model, test_data, tokenizer, label_map, batch_size=batch_size)

# Each element of test_data is the whole example dict (not just its text);
# results[idx] is the prediction for that example.
for idx, example in enumerate(test_data):
    print('Data: {} \t Label: {}'.format(example, results[idx]))

Data: {'text': '本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差'} 	 Lable: negative


# Prepare for Interpretations

In [6]:
from functools import partial

from paddlenlp.data import Stack, Tuple, Pad

from assets.utils import create_dataloader, convert_example


# Settings used to build the training-set dataloader for the interpreter.
batch_size = 32
max_seq_length = 128
# The next four values are assigned here but not referenced again in this cell.
learning_rate = 5e-5
epochs = 3
warmup_proportion = 0.1
weight_decay = 0.01

# Pre-bind the tokenizer and length limit so every example goes through the
# same conversion. is_test=True is passed; the collate function below pads
# exactly two fields (input ids and segment ids).
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    is_test=True,
)


def batchify_fn(samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
)):
    """Collate `samples` with the padding tuple and return the fields as a list."""
    return list(fn(samples))


# shuffle=False keeps the loader deterministic even though mode='train' is used.
# Presumably this keeps batch order aligned with train_ds indices so the
# interpreter's results can be mapped back to training examples -- TODO confirm.
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func,
    shuffle=False,
)

# Representer Point Interpreter

In [7]:
from trustai.interpretation.example_level.method.representer_point import RepresenterPointModel

# `classifier_layer_name` is the name of the model's last output layer.
# Building the object is slow: per the log printed by this cell, it extracts
# features from the dataloader and then trains the representer point model.
representer_model = RepresenterPointModel(
    model,
    train_data_loader,
    classifier_layer_name='classifier',
)

Extracting feature from given dataloader, it will take some time...
INFO 2022-06-27 00:33:10,949 representer_point.py:131] Eopch:   0	loss:[0.06501473]	phi_loss:[0.04564954]	grad:[0.00098725]
Training representer point model, it will take several minutes...
INFO 2022-06-27 00:33:13,883 representer_point.py:131] Eopch:1000	loss:[0.04877458]	phi_loss:[0.04610152]	grad:[4.310665e-05]
INFO 2022-06-27 00:33:15,629 representer_point.py:124] stopping criteria reached in epoch:1717
L1 difference between ground truth prediction and prediction by representer theorem decomposition
[0.00264857]
pearson correlation between ground truth  prediction and prediciton by representer theorem
0.9998619306573363


In [8]:
from assets.utils import create_dataloader_from_scratch, print_result

# Example to explain, in the same {'text': ...} dict form used for prediction.
test_data = [{'text': '本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差'}]

# process text to model input
test_dataloader = create_dataloader_from_scratch(test_data, tokenizer)

# Gather the interpretation results of every batch into one flat list.
res = []
for batch in test_dataloader:
    res.extend(representer_model.interpret(batch))

# Use DATASET_NAME (the name train_ds was loaded with) instead of repeating the
# literal, so the printed report follows the dataset if that constant changes.
print_result(test_data, train_ds, res, data_name=DATASET_NAME)

test data
text: 本来不想评价了，但为了携程的携粉们，还是说一下，这称不上是九点，细说就真没必要了，就一个字：差	predict label: 0
examples with positive influence
text: 感觉非常奇怪,这套书我明明都写了两次评论了,可我的当当始终提醒我对这套书写评论!晕啊!这是套很好的书,也不用我写几次评论吧!	gold label: 1	score: 0.03509486839175224
text: 1）背面少个螺丝钉,说是thinkpad都少，靠 2）键盘周围的壳不平整，按下去发现有：“滋啦滋啦”声音，我才意识到，那是个双面胶，按下去就不上来了，过会儿还是回弹上来，很明显仅靠双面胶是 粘不住的，你还不如拿502呢，起码这样粘得严实还能让我心里舒服（但是这样只是弥补质量问题），何必还弄个滋啦兹啦的声音，多闹心啊，（还有一地方用了双面胶，我换内存的时候发现键盘下部盖子左侧打不开，一直不敢用力	gold label: 1	score: 0.03008781559765339
text: 用了6年的THINKPAD,一直认为是笔记本中最好的! 现在这台新的让我......哎!!	gold label: 0	score: 0.029884016141295433
examples with negative influence
text: 是LINUX系统 相当及其恶心 不知道这狗 日 的是什么想法 要强行逼我们使用啊 买了两台电脑 一个事VISTA系统 一个 是 LINUX 就没见一个XP的 网上销售这东西 最重要的是打架尽量不要涉及到售后服务这块 尽量是都搞好了相安无事 其实网上的售后服务比没有售后服务还差劲 我的THINKPAD SL400就是因为换货期间以为是键盘小问题就懒得换了	gold label: 1	score: -0.07112713158130646
text: 盼了2周终于拿到本了，一开机就屏不亮，本人自己跑回总部退机，现在还在等着检测，说要等上15个工作日，呵呵，买个电脑容易吗？时间浪费的起吗？请问？	gold label: 0	score: -0.0723314955830574
text: 价格确实比较高，而且还没有早餐提供。 携程拿到的价格不好？还是自己保留起来不愿意让利给我们这些客户呢？