In [None]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. 
# This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

In [None]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. 
# All changes under this directory will be kept even after reset. 
# Please clean unnecessary files in time to speed up environment loading. 
!ls /home/aistudio/work

In [None]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, 
# you need to use the persistence path as the following: 
!mkdir /home/aistudio/external-libraries
!pip install beautifulsoup4 -t /home/aistudio/external-libraries

In [None]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可: 
# Also add the following code, 
# so that every time the environment (kernel) starts, 
# just run the following code: 
import sys 
sys.path.append('/home/aistudio/external-libraries')

In [1]:
from tqdm import tqdm 
import pandas as pd
import os
from functools import partial
import numpy as np
import time

# 导入paddle库
import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from paddle.io import DataLoader
from paddle.dataset.common import md5file
# 导入paddlenlp的库
import paddlenlp as ppnlp
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.transformers import BertTokenizer,BertPretrainedModel
from paddlenlp.data import Stack, Tuple, Pad, Dict
from paddlenlp.datasets import DatasetBuilder,get_path_from_url
# 导入所需要的py包
from paddle.io import Dataset

In [2]:
!unzip -o data/data110628/剧本角色情感识别.zip -d data

Archive:  data/data110628/剧本角色情感识别.zip
  inflating: data/submit_example.tsv  
  inflating: data/__MACOSX/._submit_example.tsv  
  inflating: data/test_dataset.tsv   
  inflating: data/__MACOSX/._test_dataset.tsv  
  inflating: data/train_dataset_v2.tsv  
  inflating: data/__MACOSX/._train_dataset_v2.tsv  


In [3]:
with open('data/train_dataset_v2.tsv', 'r', encoding='utf-8') as handler:
    lines = handler.read().split('\n')[1:-1]

    data = list()
    for line in tqdm(lines):
        sp = line.split('\t')
        if len(sp) != 4:
            print("ERROR:", sp)
            continue
        data.append(sp)

train = pd.DataFrame(data)
train.columns = ['id', 'content', 'character', 'emotions']

test = pd.read_csv('data/test_dataset.tsv', sep='\t')
submit = pd.read_csv('data/submit_example.tsv', sep='\t')
train = train[train['emotions'] != '']

100%|██████████| 42790/42790 [00:00<00:00, 704489.23it/s]


In [4]:
train['text'] = train[ 'content'].astype(str)  +'角色: ' + train['character'].astype(str)
test['text'] = test['content'].astype(str) + ' 角色: ' + test['character'].astype(str)

train['emotions'] = train['emotions'].apply(lambda x: [int(_i) for _i in x.split(',')])

train[['love', 'joy', 'fright', 'anger', 'fear', 'sorrow']] = train['emotions'].values.tolist()
test[['love', 'joy', 'fright', 'anger', 'fear', 'sorrow']] =[0,0,0,0,0,0]

train.to_csv('data/train.csv',columns=['id', 'content', 'character','text','love', 'joy', 'fright', 'anger', 'fear', 'sorrow'],
            sep='\t',
            index=False)

test.to_csv('data/test.csv',columns=['id', 'content', 'character','text','love', 'joy', 'fright', 'anger', 'fear', 'sorrow'],
            sep='\t',
            index=False)

In [5]:
target_cols=['love', 'joy', 'fright', 'anger', 'fear', 'sorrow']
# PRE_TRAINED_MODEL_NAME="bert-base-chinese"
# PRE_TRAINED_MODEL_NAME='macbert-base-chinese'

# 读者可以在这里切换语言模型


# 加载BERT的分词器
# PRE_TRAINED_MODEL_NAME='macbert-large-chinese'
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# base_model=ppnlp.transformers.BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# PRE_TRAINED_MODEL_NAME='bert-wwm-ext-chinese'
# base_model = ppnlp.transformers.BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# roberta
PRE_TRAINED_MODEL_NAME='roberta-wwm-ext'
tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
base_model = ppnlp.transformers.RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)  # 加载预训练模型
# model = ppnlp.transformers.BertForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)

[2022-04-13 16:19:40,619] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/vocab.txt and saved to /home/aistudio/.paddlenlp/models/roberta-wwm-ext
[2022-04-13 16:19:40,622] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/vocab.txt
100%|██████████| 107/107 [00:00<00:00, 1558.55it/s]
[2022-04-13 16:19:40,800] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/roberta_chn_base.pdparams and saved to /home/aistudio/.paddlenlp/models/roberta-wwm-ext
[2022-04-13 16:19:40,805] [    INFO] - Downloading roberta_chn_base.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/roberta_chn_base.pdparams
100%|██████████| 399505/399505 [00:07<00:00, 51719.84it/s]
W0413 16:19:48.635334   163 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0413 16:19:48.641345   163

In [6]:
class RoleDataset(Dataset):

    def __init__(self, mode='train',trans_func=None):

        super(RoleDataset, self).__init__()

        if mode == 'train':
            self.data = pd.read_csv('data/train.csv',sep='\t')
        else:
            self.data = pd.read_csv('data/test.csv',sep='\t')
        self.texts=self.data['text'].tolist()
        self.labels=self.data[target_cols].to_dict('records')
        self.trans_func=trans_func
    
    def __getitem__(self, index):

        text=str(self.texts[index])
        label=self.labels[index]
        sample = {
            'text': text
        }
        for label_col in target_cols:
            sample[label_col] =label[label_col]
        sample=self.trans_func(sample)
        return sample

    def __len__(self):

        return len(self.texts)

# 转换成id的函数
def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    # print(example)
    sample={}
    encoded_inputs = tokenizer(text=example["text"], max_seq_len=max_seq_length)
    sample['input_ids'] = encoded_inputs["input_ids"]
    sample['token_type_ids'] = encoded_inputs["token_type_ids"]

    sample['love'] = np.array(example["love"], dtype="float32")
    sample['joy'] = np.array(example["joy"], dtype="float32")
    sample['anger'] = np.array(example["anger"], dtype="float32")

    sample['fright'] = np.array(example["fright"], dtype="float32")
    sample['fear'] = np.array(example["fear"], dtype="float32")
    sample['sorrow'] = np.array(example["sorrow"], dtype="float32")

    return sample


max_seq_length=128
trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length)
train_ds=RoleDataset('train',trans_func)
test_ds=RoleDataset('test',trans_func)

In [7]:
print(test_ds[0])

{'input_ids': [101, 4959, 4708, 5520, 2552, 4638, 9338, 7008, 3341, 8024, 4692, 4692, 2797, 3322, 8024, 676, 4157, 749, 511, 6235, 5682, 131, 9338, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'love': array(0., dtype=float32), 'joy': array(0., dtype=float32), 'anger': array(0., dtype=float32), 'fright': array(0., dtype=float32), 'fear': array(0., dtype=float32), 'sorrow': array(0., dtype=float32)}


In [9]:
epochs=3
weight_decay=0.0
data_path='data'
warmup_proportion=0.0
init_from_ckpt=None
batch_size=32


learning_rate=5e-5


# # 把训练集合转换成id
# train_ds = train_ds.map(partial(convert_example, tokenizer=tokenizer))

# # 构建训练集合的dataloader
# train_batch_sampler = paddle.io.BatchSampler(dataset=train_ds, batch_size=32, shuffle=True)
# train_data_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, return_list=True)

In [10]:
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None):

    shuffle = True if mode == 'train' else False
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

In [11]:
def collate_func(batch_data):
    # 获取batch数据的大小
    batch_size = len(batch_data)
    # 如果batch_size为0，则返回一个空字典
    if batch_size == 0:
        return {}
    input_ids_list, attention_mask_list = [], []
    love_list,joy_list,anger_list=[],[],[]
    fright_list,fear_list,sorrow_list=[],[],[]
    # 遍历batch数据，将每一个数据，转换成tensor的形式
    for instance in batch_data:
        input_ids_temp = instance["input_ids"]
        attention_mask_temp = instance["token_type_ids"]

        love=instance['love'] 
        joy=instance['joy'] 
        anger=instance['anger'] 

        fright= instance['fright'] 
        fear=instance['fear'] 
        sorrow=instance['sorrow'] 

        input_ids_list.append(paddle.to_tensor(input_ids_temp, dtype="int64"))
        attention_mask_list.append(paddle.to_tensor(attention_mask_temp, dtype="int64"))

        love_list.append(love)
        joy_list.append(joy)
        anger_list.append(anger)

        fright_list.append(fright)
        fear_list.append(fear)
        sorrow_list.append(sorrow)

    # 对一个batch内的数据，进行padding
    return {"input_ids": Pad(pad_val=0, axis=0)(input_ids_list),
            "token_type_ids": Pad(pad_val=0, axis=0)(attention_mask_list),
            "love": Stack(dtype="int64")(love_list),
            "joy": Stack(dtype="int64")(joy_list),
            "anger": Stack(dtype="int64")(anger_list),
            "fright": Stack(dtype="int64")(fright_list),
            "fear": Stack(dtype="int64")(fear_list),
            "sorrow": Stack(dtype="int64")(sorrow_list),
            }

In [12]:
train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=batch_size,
        batchify_fn=collate_func)


In [13]:
class EmotionClassifier(nn.Layer):
    def __init__(self, bert,n_classes):
        super(EmotionClassifier, self).__init__()
        self.bert = bert
        self.out_love = nn.Linear(self.bert.config["hidden_size"], n_classes)
        self.out_joy = nn.Linear(self.bert.config["hidden_size"], n_classes)
        self.out_fright = nn.Linear(self.bert.config["hidden_size"], n_classes)
        self.out_anger = nn.Linear(self.bert.config["hidden_size"], n_classes)
        self.out_fear = nn.Linear(self.bert.config["hidden_size"], n_classes)
        self.out_sorrow = nn.Linear(self.bert.config["hidden_size"], n_classes)

    def forward(self, input_ids, token_type_ids):

        _, pooled_output = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids
        )
        love = self.out_love(pooled_output)
        joy = self.out_joy(pooled_output)
        fright = self.out_fright(pooled_output)
        anger = self.out_anger(pooled_output)
        fear = self.out_fear(pooled_output)
        sorrow = self.out_sorrow(pooled_output)
        return {
            'love': love, 'joy': joy, 'fright': fright,
            'anger': anger, 'fear': fear, 'sorrow': sorrow,
        }

class_names=[1]
model = EmotionClassifier(base_model,4)

In [14]:
num_train_epochs=3
num_training_steps = len(train_data_loader) * num_train_epochs

# 定义 learning_rate_scheduler，负责在训练过程中对 lr 进行调度
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, 0.0)

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

# 定义 Optimizer
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params)
# 交叉熵损失
criterion = paddle.nn.loss.CrossEntropyLoss()
# 评估的时候采用准确率指标
metric = paddle.metric.Accuracy()

In [16]:
def do_train( model, data_loader,  criterion,  optimizer, scheduler,  metric ):
    
    model.train()
    global_step = 0
    tic_train = time.time()
    log_steps=100
    for epoch in range(num_train_epochs):
        losses = []
        # optimizer .zero_gard()
        for step,sample in enumerate(data_loader):
            # print(sample)
            input_ids = sample["input_ids"]
            token_type_ids = sample["token_type_ids"]
            outputs = model(input_ids=input_ids,
                token_type_ids=token_type_ids)
            # print(outputs)

            loss_love = criterion(outputs['love'], sample['love'])
            loss_joy = criterion(outputs['joy'], sample['joy'])
            loss_fright = criterion(outputs['fright'], sample['fright'])

            loss_anger = criterion(outputs['anger'], sample['anger'])
            loss_fear = criterion(outputs['fear'], sample['fear'])
            loss_sorrow = criterion(outputs['sorrow'], sample['sorrow'])

            loss = loss_love + loss_joy + loss_fright + loss_anger + loss_fear + loss_sorrow

            for label_col in target_cols:
                correct = metric.compute(outputs[label_col], sample[label_col])
                metric.update(correct)

            acc = metric.accumulate()

            losses.append(loss.numpy())
            loss.backward()
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            global_step += 1
            # 每间隔 log_steps 输出训练指标
            if global_step % log_steps == 0:
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    log_steps / (time.time() - tic_train)))

            optimizer.step()
            scheduler.step()
            optimizer.clear_grad()

        metric.reset()
    return np.mean(losses)

do_train(model,train_data_loader,criterion,optimizer,lr_scheduler,metric)

global step 100, epoch: 0, batch: 99, loss: 2.75459, accuracy: 0.90583, speed: 5.66 step/s
global step 200, epoch: 0, batch: 199, loss: 1.56995, accuracy: 0.90971, speed: 2.83 step/s
global step 300, epoch: 0, batch: 299, loss: 1.81763, accuracy: 0.91165, speed: 1.89 step/s
global step 400, epoch: 0, batch: 399, loss: 1.46230, accuracy: 0.91280, speed: 1.41 step/s
global step 500, epoch: 0, batch: 499, loss: 1.36762, accuracy: 0.91340, speed: 1.13 step/s
global step 600, epoch: 0, batch: 599, loss: 1.39702, accuracy: 0.91343, speed: 0.93 step/s
global step 700, epoch: 0, batch: 699, loss: 1.63529, accuracy: 0.91405, speed: 0.80 step/s
global step 800, epoch: 0, batch: 799, loss: 1.49754, accuracy: 0.91436, speed: 0.70 step/s
global step 900, epoch: 0, batch: 899, loss: 2.63329, accuracy: 0.91436, speed: 0.62 step/s
global step 1000, epoch: 0, batch: 999, loss: 2.00582, accuracy: 0.91466, speed: 0.56 step/s
global step 1100, epoch: 0, batch: 1099, loss: 1.56080, accuracy: 0.91471, speed

1.1591964

In [17]:
from collections import defaultdict

test_data_loader = create_dataloader(
        test_ds,
        mode='test',
        batch_size=batch_size,
        batchify_fn=collate_func)
        
test_pred = defaultdict(list)
for step, batch in tqdm(enumerate(test_data_loader)):
    b_input_ids = batch['input_ids']
    token_type_ids = batch['token_type_ids']
    logits = model(input_ids=b_input_ids, token_type_ids=token_type_ids)
    for col in target_cols:
        out2 = paddle.argmax(logits[col], axis=1)
        test_pred[col].append(out2.numpy())
    print(test_pred)
    # print(logits)
    break

0it [00:00, ?it/s]

defaultdict(<class 'list'>, {'love': [array([0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)], 'joy': [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)], 'fright': [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)], 'anger': [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)], 'fear': [array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)], 'sorrow': [array([0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)]})





In [20]:
def predict(model,test_data_loader):
    val_loss = 0
    test_pred = defaultdict(list)
    model.eval()
    for step, batch in tqdm(enumerate(test_data_loader)):
        b_input_ids = batch['input_ids']
        token_type_ids = batch['token_type_ids']

        with paddle.no_grad():
            logits = model(input_ids=b_input_ids, token_type_ids=token_type_ids)
            for col in target_cols:
                out2 = paddle.argmax(logits[col], axis=1)
                test_pred[col].extend(out2.numpy().tolist())
    return test_pred

submit = pd.read_csv('data/submit_example.tsv', sep='\t')
test_pred = predict(model,test_data_loader)

668it [00:44, 15.00it/s]


In [21]:
print(test_pred['love'][:10])
print(len(test_pred['love']))

[0, 0, 0, 0, 0, 0, 0, 0, 2, 0]
21376


In [22]:
label_preds = []
for col in target_cols:
    preds = test_pred[col]
    label_preds.append(preds)
print(len(label_preds[0]))
sub = submit.copy()
sub['emotion'] = np.stack(label_preds, axis=1).tolist()
sub['emotion'] = sub['emotion'].apply(lambda x: ','.join([str(i) for i in x]))
sub.to_csv('baseline_{}.tsv'.format(PRE_TRAINED_MODEL_NAME), sep='\t', index=False)
sub.head()

21376


Unnamed: 0,id,emotion
0,34170_0002_A_12,0
1,34170_0002_A_14,0
2,34170_0003_A_16,0
3,34170_0003_A_17,0
4,34170_0003_A_18,0


请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 