# 处理数据
原本的 CpedDataset 每次使用时都需要重新处理数据；事先处理好并把结果保存下来可以节省时间。

In [6]:
import logging
# Dedicated logger for the CPED preprocessing steps; its own level is INFO.
logger = logging.getLogger('cped.data')
logger.setLevel(logging.INFO)

## 设置tokenizer
需要根据分词结果，为每个 token 标注对应的信息（说话人、人格、情感）。

In [7]:
# T5 is used here only as an example checkpoint.
from transformers import BertTokenizer, MT5ForConditionalGeneration, Text2TextGenerationPipeline
tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
    "t5/t5-v1_1-small-chinese-cluecorpussmall"
)
# model = MT5ForConditionalGeneration.from_pretrained("t5/t5-v1_1-small-chinese-cluecorpussmall")

In [8]:
from utils.cped_dataset import cped_get_data_from_dir, CpedDataset

data_path = '../data/CPED'
cache_path = './cache/cped_cache'

# Split name -> CSV file name handed to the loader.
cped_filenames = {
    "train": "train_split.csv",
    "valid": "valid_split.csv",
    "test": "test_split.csv",
}
# `data` is indexed by split name below; `sample` is not used in this section.
data, sample = cped_get_data_from_dir(
    dir_path=data_path,
    cache_path=cache_path,
    tokenizer=tokenizer,
    logger=logger,
    filenames=cped_filenames,
)


def _build_cped_dataset(split_data):
    """Wrap one split in a CpedDataset using the configuration shared by all splits.

    Keeping the options in a single place guarantees that train/valid/test are
    built identically. A fresh ``persona_type`` list is created on every call,
    so the datasets never share that list object.

    Args:
        split_data: The entry of ``data`` for one split (e.g. ``data["train"]``).

    Returns:
        The CpedDataset constructed for that split.
    """
    return CpedDataset(
        data=split_data,
        tokenizer=tokenizer,
        emotion_type='Emotion',
        da_type='DA',
        persona_type=[
            "Gender",
            "Neuroticism",
            "Extraversion",
            "Openness",
            "Agreeableness",
            "Conscientiousness",
            "Age",
        ],
        max_history=25,
        batch_first=True,
        lm_labels=True,
        with_current_speaker=True,
        with_current_persona=True,
        with_current_emotion=True,
        with_current_da=True,
        with_emotion=True,
        with_da=False,
        use_speaker_name_as_speaker_list=False,
        set_eda_in_speaker=False,
        set_current_speaker_mask=False,
    )


logger.info("读取CPED数据集")
train_data = data["train"]
valid_data = data["valid"]
train_dataset = _build_cped_dataset(train_data)
valid_dataset = _build_cped_dataset(valid_data)

test_data = data["test"]
test_dataset = _build_cped_dataset(test_data)

In [9]:
# Round-trip a sample sentence through the tokenizer to inspect how it is split.
probe_ids = tokenizer('深度学习真的烦')['input_ids']
tokenizer.decode(probe_ids)

'[CLS] 深 度 学 习 真 的 烦 [SEP]'

In [10]:
import json
# Persona attributes copied into every exported record; each name is
# lower-cased to form the record key and used as-is to index the sample.
persona_type = [
    "Gender",
    "Neuroticism",
    "Extraversion",
    "Openness",
    "Agreeableness",
    "Conscientiousness",
    "Age",
]

In [11]:
import os

# Export the training split as JSON Lines: one record per dataset sample.
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('data/t5', exist_ok=True)
with open('data/t5/train.jsonl', mode='w', encoding='utf8') as fp:
    for i in range(len(train_dataset)):
        # Index the dataset once and reuse the result for every field
        # (presumably each __getitem__ call rebuilds the sample; verify in CpedDataset).
        item = train_dataset[i]
        # A fresh dict per record, so no value can leak over from the previous sample.
        dialog = {
            'input': tokenizer.decode(item['input_ids']),
            'speaker': item['token_type_ids'],
        }
        for persona in persona_type:
            dialog[persona.lower()] = item[persona]
        dialog['emotion'] = item['emotion_ids']
        # ensure_ascii=False keeps the Chinese text readable in the output file.
        fp.write(json.dumps(dialog, ensure_ascii=False) + '\n')

In [12]:
import os

# Export the validation split as JSON Lines: one record per dataset sample.
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('data/t5', exist_ok=True)
with open('data/t5/valid.jsonl', mode='w', encoding='utf8') as fp:
    for i in range(len(valid_dataset)):
        # Index the dataset once and reuse the result for every field
        # (presumably each __getitem__ call rebuilds the sample; verify in CpedDataset).
        item = valid_dataset[i]
        # A fresh dict per record, so no value can leak over from the previous sample.
        dialog = {
            'input': tokenizer.decode(item['input_ids']),
            'speaker': item['token_type_ids'],
        }
        for persona in persona_type:
            dialog[persona.lower()] = item[persona]
        dialog['emotion'] = item['emotion_ids']
        # ensure_ascii=False keeps the Chinese text readable in the output file.
        fp.write(json.dumps(dialog, ensure_ascii=False) + '\n')

In [13]:
import os

# Can be used to check how tokens line up with their per-token labels.
# with open('test.csv', mode='w', encoding='gbk') as fp:
#     for i in range(len(test_dataset)):
#         fp.write((','.join(tokenizer.tokenize(tokenizer.decode(test_dataset[i]['input_ids'])))) + '\n')
#         fp.write(','.join(test_dataset[i]['token_type_ids']) + '\n')
#         for persona in persona_type:
#             fp.write(','.join(test_dataset[i][persona]) + '\n')
#         fp.write(','.join(test_dataset[i]['emotion_ids']) + '\n')

# Export the test split as JSON Lines: one record per dataset sample.
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('data/t5', exist_ok=True)
with open('data/t5/test.jsonl', mode='w', encoding='utf8') as fp:
    for i in range(len(test_dataset)):
        # Index the dataset once and reuse the result for every field
        # (presumably each __getitem__ call rebuilds the sample; verify in CpedDataset).
        item = test_dataset[i]
        # A fresh dict per record, so no value can leak over from the previous sample.
        dialog = {
            'input': tokenizer.decode(item['input_ids']),
            'speaker': item['token_type_ids'],
        }
        for persona in persona_type:
            dialog[persona.lower()] = item[persona]
        dialog['emotion'] = item['emotion_ids']
        # ensure_ascii=False keeps the Chinese text readable in the output file.
        fp.write(json.dumps(dialog, ensure_ascii=False) + '\n')

## 验证新的Dataset是否能用

In [15]:
from pec_dataset import PecDataset

def _show_head(dataset, title):
    """Print ``title`` followed by the records at indices 0, 1 and 2 of ``dataset``."""
    print(title)
    for idx in range(3):
        print(dataset[idx])


# Reload each exported JSONL file through PecDataset and eyeball the first records.
train_pec = PecDataset('data/t5/train.jsonl')
_show_head(train_pec, 'train set')

valid_pec = PecDataset('data/t5/valid.jsonl')
_show_head(valid_pec, 'valid set')

test_pec = PecDataset('data/t5/test.jsonl')
_show_head(test_pec, 'test set')

train set
{'input': '[CLS] 您 想 去 哪 [SEP] 你 来 这 几 年 了 [SEP] 5 年 [SEP]', 'speaker': ['[CLS]', '[speaker2]', '[speaker2]', '[speaker2]', '[speaker2]', '[speaker2]', '[speaker1]', '[speaker1]', '[speaker1]', '[speaker1]', '[speaker1]', '[speaker1]', '[speaker1]', '[speaker2]', '[speaker2]', '[speaker2]'], 'gender': ['[CLS]', 'female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'female', 'female', 'female'], 'neuroticism': ['[CLS]', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low'], 'extraversion': ['[CLS]', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high'], 'openness': ['[CLS]', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high'], 'agreeableness': ['[CLS]', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', '