In [1]:
import pickle
import json

# Per-split pickles holding review information keyed by conversation.
train_review_path = "/projects/prjs1158/KG/redail/MESE_review/DATA/dataset/redial/nltk/train_conv_idx_to_review_info.pkl"
valid_review_path = "/projects/prjs1158/KG/redail/MESE_review/DATA/dataset/redial/nltk/valid_conv_idx_to_review_info.pkl"
test_review_path = "/projects/prjs1158/KG/redail/MESE_review/DATA/dataset/redial/nltk/test_conv_idx_to_review_info.pkl"


def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _load_json(path):
    """Parse the JSON document stored at `path`, closing the file afterwards."""
    # Explicit UTF-8 so the result does not depend on the machine's locale.
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


train_review_data = _load_pickle(train_review_path)
valid_review_data = _load_pickle(valid_review_path)
test_review_data = _load_pickle(test_review_path)

# Vocabularies (name -> integer id) and their inverses (id -> name).
entity2id_path = "/projects/prjs1158/KG/redail/MESE_review/DATA/dataset/redial/nltk/entity2id.json"
token2id_path = "/projects/prjs1158/KG/redail/MESE_review/DATA/dataset/redial/nltk/token2id.json"
entity2id = _load_json(entity2id_path)
token2id = _load_json(token2id_path)
id2entity = {v: k for k, v in entity2id.items()}
id2token = {v: k for k, v in token2id.items()}

# Map each entity name to its review text. Splits are visited in the order
# train, valid, test; a later occurrence of an entity overwrites an earlier one.
entity_name_to_review = {}
for review_data in (train_review_data, valid_review_data, test_review_data):
    for conv_info in review_data.values():
        for cnt, entity_id in enumerate(conv_info['selected_entityIds']):
            # The cnt-th token-id list belongs to the cnt-th selected entity.
            token_ids = conv_info['selected_infoListListInt'][cnt]
            review_text = " ".join(id2token[token_id] for token_id in token_ids)
            entity_name_to_review[id2entity[int(entity_id)]] = review_text



In [2]:
import json
import re

import html
from tqdm.auto import tqdm

# Matches movie mentions of the form "@<digits>".
movie_pattern = re.compile(r'@\d+')


def process_utt(utt, movieid2name, replace_movieId):
    """Clean one utterance.

    Args:
        utt: raw utterance text.
        movieid2name: container looked up with the digits of each "@<digits>"
            mention; must support ``in`` and, for known ids, indexing.
        replace_movieId: when true, known mentions are replaced by the movie
            name (with its whitespace collapsed); unknown mentions are kept.

    Returns:
        The utterance with whitespace runs collapsed to single spaces and
        HTML entities unescaped (in that order).
    """
    def convert(match):
        mention = match.group(0)
        movie_key = mention[1:]  # drop the leading '@'
        if movie_key not in movieid2name:
            return mention
        return ' '.join(movieid2name[movie_key].split())

    text = movie_pattern.sub(convert, utt) if replace_movieId else utt
    collapsed = ' '.join(text.split())
    return html.unescape(collapsed)


def process(data_file, out_file, movie_set):
    """Flatten the dialogs of a JSONL file into one JSON record per speaker turn.

    Consecutive messages from the same sender are merged into a single turn.
    For every turn one line is written to `out_file` with the keys:
      - 'context': texts of all previous turns ([''] for the first turn),
      - 'resp':    text of the current turn,
      - 'rec':     de-duplicated movie + entity ids mentioned in the current turn,
      - 'entity':  de-duplicated ids mentioned in all previous turns.
    Every turn is emitted as a response, whichever participant sent it.

    Args:
        data_file: path of the input JSONL file (one dialog per line).
        out_file: path of the output JSONL file (overwritten).
        movie_set: set updated in place with every movie id encountered.

    Uses the module-level `entity2id` mapping; names not found in it are dropped.
    """
    with open(data_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        for line in tqdm(fin):
            dialog = json.loads(line)
            messages = dialog['messages']
            if len(messages) == 0:
                continue

            movieid2name = dialog['movieMentions']
            # Not used further down; the lookups still fail fast on dialogs
            # that lack either worker id.
            user_id, resp_id = dialog['initiatorWorkerId'], dialog['respondentWorkerId']

            context = []      # texts of the turns emitted so far
            entity_list = []  # one list of ids per emitted turn
            n_messages = len(messages)
            start = 0
            while start < n_messages:
                speaker = messages[start]['senderWorkerId']
                texts, entity_turn, movie_turn = [], [], []

                # Consume the run of consecutive messages sent by `speaker`.
                end = start
                while end < n_messages and messages[end]['senderWorkerId'] == speaker:
                    message = messages[end]
                    texts.append(process_utt(message['text'], movieid2name, replace_movieId=True))
                    entity_turn.extend(entity2id[name] for name in message['entity'] if name in entity2id)
                    movie_turn.extend(entity2id[name] for name in message['movie'] if name in entity2id)
                    end += 1

                resp = ' '.join(texts)

                # Ids from all earlier turns, flattened and de-duplicated.
                earlier_ids = []
                for turn_ids in entity_list:
                    earlier_ids.extend(turn_ids)
                earlier_ids = list(set(earlier_ids))

                if not context:
                    # The first turn gets a single empty string as its context.
                    context.append('')

                record = {
                    'context': context,
                    'resp': resp,
                    'rec': list(set(movie_turn + entity_turn)),
                    'entity': earlier_ids,
                }
                fout.write(json.dumps(record, ensure_ascii=False) + '\n')

                # The record is already serialised, so extending the history
                # now only affects later turns.
                context.append(resp)
                entity_list.append(movie_turn + entity_turn)
                movie_set.update(movie_turn)

                start = end



# Entity vocabulary that process() reads as a module-level name.
with open('entity2id.json', 'r', encoding='utf-8') as vocab_file:
    entity2id = json.load(vocab_file)

# Collects the movie ids seen across all splits.
item_set = set()
# The node2text_link_clean.json neighbour mapping is intentionally not loaded.

# Splits are processed in the order valid, test, train.
for src_path, dst_path in (
    ('valid_data_dbpedia.jsonl', 'valid_data_processed.jsonl'),
    ('test_data_dbpedia.jsonl', 'test_data_processed.jsonl'),
    ('train_data_dbpedia.jsonl', 'train_data_processed.jsonl'),
):
    process(src_path, dst_path, item_set)

with open('item_ids.json', 'w', encoding='utf-8') as items_file:
    json.dump(list(item_set), items_file, ensure_ascii=False)
print(f'#item: {len(item_set)}')


  from .autonotebook import tqdm as notebook_tqdm
1000it [00:00, 2549.04it/s]
1342it [00:00, 2828.35it/s]
9006it [00:02, 3094.39it/s]


#item: 6281


In [5]:
# Re-key the reviews by entity id, keeping only entity names present in
# `entity2id`. No variable is named `id`, so the builtin id() stays usable
# in the rest of the kernel.
# NOTE(review): `entity2id` is assigned in two earlier cells (from different
# paths); the one in effect here depends on which cell ran last - confirm.
id2review = {
    entity2id[entity_name]: review_text
    for entity_name, review_text in entity_name_to_review.items()
    if entity_name in entity2id
}

In [7]:
# Persist the entity-id -> review-text mapping as a pickle in the working directory.
import pickle
with open('id2review.pkl', 'wb') as pickle_file:
    pickle.dump(id2review, pickle_file)