In [1]:
import os
import random
import json
import pickle
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict

import numpy as np
from transformers import BertTokenizerFast

In [2]:
# Seed Python's global `random` generator so the random.shuffle calls in
# this notebook produce the same order on every fresh run.
random.seed(0)

In [30]:
# Dataset selection. `dataset` is used as the sub-directory name for both the
# raw input file and the generated output files; `data_name` is the file stem
# of the raw JSON-lines review dump. Switch both values together.
dataset = 'crime_book'  # options: crime_book, children
data_name = 'goodreads_reviews_mystery_thriller_crime' # options: goodreads_reviews_mystery_thriller_crime, goodreads_reviews_children
# Root directory under which `<output_dir>/<dataset>/` files are written.
# NOTE(review): 'xxx' looks like a placeholder -- set a real path before running.
output_dir='xxx'

# Generate Training Data (Edge Classification)

In [31]:
# read raw data: the input file holds one JSON object (one review) per line
data = []
# Iterate over the file handle rather than calling readlines(), so the raw
# text of every line is not kept in memory next to the parsed records.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open(f'{dataset}/{data_name}.json', encoding='utf-8') as f:
    for line in tqdm(f):
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        data.append(json.loads(line))
# In-place shuffle driven by the global `random` state.
random.shuffle(data)

100%|██████████| 1849236/1849236 [00:12<00:00, 144038.79it/s]


In [32]:
# Sanity check: number of review records parsed from the raw file.
len(data)

1849236

In [18]:
# Inspect a single parsed record to see which fields a review carries.
data[7]

{'user_id': 'a2d6dd1685e5aa0a72c9410f8f55e056',
 'book_id': '7327423',
 'review_id': '15708dad2f2dfb42043d5c87c58df57f',
 'rating': 0,
 'review_text': 'O',
 'date_added': 'Thu Sep 22 18:58:39 -0700 2016',
 'date_updated': 'Fri Jan 27 13:49:40 -0800 2017',
 'read_at': '',
 'started_at': '',
 'n_votes': 0,
 'n_comments': 0}

In [19]:
# text processing function
def text_process(text):
    """Flatten a review into one line with no ``$`` or ``*`` characters.

    Line-break sequences and tabs are each replaced by a single space, then
    every ``$`` and ``*`` is removed. Presumably ``$`` is stripped because the
    cells that write the split files join fields with a separator containing
    ``$`` -- confirm against the consumer of those files.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    p_text = text
    # Two-character line breaks come first so that CRLF / LFCR collapse to ONE
    # space. Each replacement works on the running result; splitting the
    # untouched input again would throw the earlier replacements away.
    # NOTE(review): '\rm' is a carriage return followed by the letter 'm'; it
    # is kept from the original, but it also removes that 'm' -- confirm intent.
    for sep in ('\r\n', '\n\r', '\n', '\t', '\rm', '\r'):
        p_text = p_text.replace(sep, ' ')
    # Drop the characters that must not appear in the output at all.
    for ch in ('$', '*'):
        p_text = p_text.replace(ch, '')

    return p_text

In [20]:
## rating histogram + contiguous integer ids for users and books

rate_dict = defaultdict(int)
user_id2idx = {}
item_id2idx = {}

for record in tqdm(data):
    rate_dict[record['rating']] += 1
    # setdefault inserts only ids that have not been seen yet; the new index
    # is the current size of the mapping, so ids are numbered 0, 1, 2, ...
    # in order of first appearance.
    user_id2idx.setdefault(record['user_id'], len(user_id2idx))
    item_id2idx.setdefault(record['book_id'], len(item_id2idx))

print(rate_dict)

100%|██████████| 734640/734640 [00:01<00:00, 503020.85it/s]

defaultdict(<class 'int'>, {3: 148210, 4: 253185, 5: 251400, 0: 31113, 2: 40006, 1: 10726})





In [21]:
## generate data samples
## one tuple per review: (cleaned text, user index, item index, rating)

samples = [
    (
        text_process(record['review_text']),
        user_id2idx[record['user_id']],
        item_id2idx[record['book_id']],
        record['rating'],
    )
    for record in tqdm(data)
]

100%|██████████| 734640/734640 [00:04<00:00, 175659.83it/s]


In [22]:
## split train/val/test as 7:1:2
### samples[:train_bound]          -> train (first 70%)
### samples[train_bound:val_bound] -> val   (next 10%)
### samples[val_bound:]            -> test  (remaining 20%)

# Re-seed so the shuffle below does not depend on earlier use of `random`.
random.seed(0)
sample_num = len(samples)

train_bound, val_bound = int(sample_num * 0.7), int(sample_num * 0.8)
# sizes of the train / val / test partitions
print(train_bound, val_bound - train_bound, sample_num - val_bound)

# In-place shuffle; the partitions are later taken as slices of this list.
random.shuffle(samples)

514247 73465 146928


In [23]:
# generate and save train file
# One sample per line: review text, user index, item index and integer rating,
# joined by the literal four-character separator \$\$ .

# Make sure the target directory exists before opening the file for writing.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)
# Raw string: produces exactly the same characters as the former '\$\$'
# literal, without relying on an invalid "\$" escape sequence.
field_sep = r'\$\$'
# Explicit encoding so arbitrary review text is written identically on every platform.
with open(f'{output_dir}/{dataset}/train.tsv', 'w', encoding='utf-8') as fout:
    for text, user_idx, item_idx, rating in tqdm(samples[:train_bound]):
        fout.write(field_sep.join((text, str(user_idx), str(item_idx), str(int(rating)))) + '\n')

100%|██████████| 514247/514247 [00:01<00:00, 367931.56it/s]


In [24]:
# generate and save val file
# One sample per line: review text, user index, item index and integer rating,
# joined by the literal four-character separator \$\$ .

# Make sure the target directory exists before opening the file for writing.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)
# Raw string: produces exactly the same characters as the former '\$\$'
# literal, without relying on an invalid "\$" escape sequence.
field_sep = r'\$\$'
# Explicit encoding so arbitrary review text is written identically on every platform.
with open(f'{output_dir}/{dataset}/val.tsv', 'w', encoding='utf-8') as fout:
    for text, user_idx, item_idx, rating in tqdm(samples[train_bound:val_bound]):
        fout.write(field_sep.join((text, str(user_idx), str(item_idx), str(int(rating)))) + '\n')

100%|██████████| 73465/73465 [00:00<00:00, 353659.34it/s]


In [25]:
# generate and save test file
# One sample per line: review text, user index, item index and integer rating,
# joined by the literal four-character separator \$\$ .

# Make sure the target directory exists before opening the file for writing.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)
# Raw string: produces exactly the same characters as the former '\$\$'
# literal, without relying on an invalid "\$" escape sequence.
field_sep = r'\$\$'
# Explicit encoding so arbitrary review text is written identically on every platform.
with open(f'{output_dir}/{dataset}/test.tsv', 'w', encoding='utf-8') as fout:
    for text, user_idx, item_idx, rating in tqdm(samples[val_bound:]):
        fout.write(field_sep.join((text, str(user_idx), str(item_idx), str(int(rating)))) + '\n')

100%|██████████| 146928/146928 [00:00<00:00, 359500.66it/s]


In [26]:
# save side files
# Persist the raw-id -> integer-index mappings and the node counts next to the
# train/val/test files.

side_dir = f'{output_dir}/{dataset}'
os.makedirs(side_dir, exist_ok=True)  # make sure the target directory exists

side_files = (
    ('user_id2idx.pkl', user_id2idx),
    ('item_id2idx.pkl', item_id2idx),
    # [number of users, number of items, 6]
    # NOTE(review): 6 looks like the number of distinct rating values (0-5)
    # -- confirm against the code that loads node_num.pkl.
    ('node_num.pkl', [len(user_id2idx), len(item_id2idx), 6]),
)
for file_name, payload in side_files:
    # `with` closes (and flushes) each file deterministically instead of
    # leaving the handle to the garbage collector.
    with open(f'{side_dir}/{file_name}', 'wb') as fout:
        pickle.dump(payload, fout)