In [1]:
import os
import random
import json
import pickle
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict

from transformers import BertTokenizerFast

In [2]:
random.seed(0)

In [3]:
dataset = 'children'  # crime_book
data_name = 'goodreads_reviews_children' # reviews_Electronics_5, 
output_dir='xxx'

# read review data

In [4]:
# read raw data
with open(f'{dataset}/{data_name}.json') as f:
    data = []
    readin = f.readlines()
    for line in tqdm(readin):
        data.append(json.loads(line))
random.shuffle(data)

100%|██████████| 734640/734640 [00:04<00:00, 161640.10it/s]


In [5]:
len(data)

734640

In [6]:
data[7]

{'user_id': '160d1c77ab7241b4439826555c9ba7bd',
 'book_id': '24396876',
 'review_id': '3daf5e7c037a0235c676921f02c47021',
 'rating': 5,
 'review_text': 'Great story about surviving middle school and really finding yourself. With a wonderful dose of science neatly holding it all together. I read this in one sitting, I could not put it down.',
 'date_added': 'Thu Oct 20 17:58:02 -0700 2016',
 'date_updated': 'Thu Oct 20 20:52:56 -0700 2016',
 'read_at': 'Thu Oct 20 00:00:00 -0700 2016',
 'started_at': 'Thu Oct 20 00:00:00 -0700 2016',
 'n_votes': 0,
 'n_comments': 0}

In [7]:
# text processing function
def text_process(text):
    p_text = ' '.join(text.split('\r\n'))
    p_text = ' '.join(text.split('\n\r'))
    p_text = ' '.join(text.split('\n'))
    p_text = ' '.join(p_text.split('\t'))
    p_text = ' '.join(p_text.split('\rm'))
    p_text = ' '.join(p_text.split('\r'))
    p_text = ''.join(p_text.split('$'))
    p_text = ''.join(p_text.split('*'))

    return p_text

In [8]:
## rate distribution

rate_dict = defaultdict(int)

for d in tqdm(data):
    rate_dict[d['rating']] += 1
    
print(rate_dict)

100%|██████████| 734640/734640 [00:00<00:00, 1290694.01it/s]

defaultdict(<class 'int'>, {4: 253185, 5: 251400, 0: 31113, 3: 148210, 1: 10726, 2: 40006})





In [9]:
## user/item statistics
### we see 5 score as positive edge(review), 1-4 as negative ones.
### user_pos_reviews/user_neg_reviews: key<-userID, value<-list(reviews)
### item_pos_reviews/item_neg_reviews: key<-productID, value<-list(reviews)
### user_reviews_dict/item_reviews_dict: key<-userID/productID, value<-list(tuple(reviews,p/n))

user_pos_reviews = defaultdict(list)
user_neg_reviews = defaultdict(list)
item_pos_reviews = defaultdict(list)
item_neg_reviews = defaultdict(list)
user_set = set()
item_set = set()

user_reviews_dict = defaultdict(list)
item_reviews_dict = defaultdict(list)

blank_review_cnt = 0

for d in tqdm(data):
    if 'review_text' not in d:
        blank_review_cnt += 1
        continue
    
    text = text_process(d['review_text'])
    user_set.add(d['user_id'])
    item_set.add(d['book_id'])
    if d['rating'] in [5.0, 4.0]:
        user_pos_reviews[d['user_id']].append(text)
        item_pos_reviews[d['book_id']].append(text)
        
        user_reviews_dict[d['user_id']].append((text,d['book_id'],1))
        item_reviews_dict[d['book_id']].append((text,d['user_id'],1))
    elif d['rating'] in [1,2,3,0]:
        user_neg_reviews[d['user_id']].append(text)
        item_neg_reviews[d['book_id']].append(text)
        
        user_reviews_dict[d['user_id']].append((text,d['book_id'],0))
        item_reviews_dict[d['book_id']].append((text,d['user_id'],0))
    else:
        raise ValueError('Error!')
        
print(f'Number of blank review:{blank_review_cnt}')
print(f'Number of user:{len(user_set)}, Number of item:{len(item_set)}')
print(f'user_pos_reviews.len:{len(user_pos_reviews)},user_neg_reviews.len:{len(user_neg_reviews)}')
print(f'item_pos_reviews.len:{len(item_pos_reviews)},item_neg_reviews.len:{len(item_neg_reviews)}')
print(f'user.avg.pos_review:{(rate_dict[5]+rate_dict[4])/len(user_set)},user.avg.neg_review:{(rate_dict[1]+rate_dict[2]+rate_dict[3])/len(user_set)}')
print(f'item.avg.pos_review:{(rate_dict[5]+rate_dict[4])/len(item_set)},item.avg.neg_review:{(rate_dict[1]+rate_dict[2]+rate_dict[3])/len(item_set)}')

100%|██████████| 734640/734640 [00:07<00:00, 101765.33it/s]

Number of blank review:0
Number of user:92667, Number of item:123946
user_pos_reviews.len:81071,user_neg_reviews.len:38237
item_pos_reviews.len:95458,item_neg_reviews.len:73154
user.avg.pos_review:5.44514228366085,user.avg.neg_review:2.1468483926316813
item.avg.pos_review:4.071006728736708,item.avg.neg_review:1.6050699498168557





In [10]:
## split train/val/test as 7:1:2 or 8:1:1
### user_pos_reviews/user_neg_reviews: key<-userID, value<-list(reviews)
### item_pos_reviews/item_neg_reviews: key<-productID, value<-list(reviews)
### train_user_neighbor: key<-userID, value<-list(tuple(reviews,p/n))
### train_item_neighbor: key<-userID, value<-list(tuple(reviews,p/n))

sample_num = len(data)
random.seed(0)

train_tuples = []
val_tuples = []
test_tuples = []
train_item_set = set()
user_id2idx = {}
item_id2idx = {}
train_user_pos_neighbor = defaultdict(list)
train_user_neg_neighbor = defaultdict(list)
train_item_pos_neighbor = defaultdict(list)
train_item_neg_neighbor = defaultdict(list)

c1 = 0
c2 = 0
c3 = 0

for uid in tqdm(user_reviews_dict):
    if uid not in user_id2idx:
        user_id2idx[uid] = len(user_id2idx)
    random.shuffle(user_reviews_dict[uid])
    
    for i in range(int(len(user_reviews_dict[uid])*0.7)):
    #for i in range(int(len(user_reviews_dict[uid])*0.8)):
        train_tuples.append((uid,user_reviews_dict[uid][i]))
        train_item_set.add(user_reviews_dict[uid][i][1])
        
        # add to item_id2idx
        if user_reviews_dict[uid][i][1] not in item_id2idx:
            item_id2idx[user_reviews_dict[uid][i][1]] = len(item_id2idx)

        # add to train_user_neighbor/train_item_neighbor
        if user_reviews_dict[uid][i][2] == 1:
            train_user_pos_neighbor[uid].append(user_reviews_dict[uid][i])
            train_item_pos_neighbor[user_reviews_dict[uid][i][1]].append((user_reviews_dict[uid][i][0],uid,user_reviews_dict[uid][i][2]))
        elif user_reviews_dict[uid][i][2] == 0:
            train_user_neg_neighbor[uid].append(user_reviews_dict[uid][i])
            train_item_neg_neighbor[user_reviews_dict[uid][i][1]].append((user_reviews_dict[uid][i][0],uid,user_reviews_dict[uid][i][2]))
        else:
            raise ValueError('Error!')
            
    for i in range(int(len(user_reviews_dict[uid])*0.7),int(len(user_reviews_dict[uid])*0.8)):
    #for i in range(int(len(user_reviews_dict[uid])*0.8),int(len(user_reviews_dict[uid])*0.9)):
        val_tuples.append((uid,user_reviews_dict[uid][i]))

    for i in range(int(len(user_reviews_dict[uid])*0.8),len(user_reviews_dict[uid])):
    #for i in range(int(len(user_reviews_dict[uid])*0.9),len(user_reviews_dict[uid])):
        test_tuples.append((uid,user_reviews_dict[uid][i]))

        
    #c1 += int(len(user_reviews_dict[uid])*0.8)
    #c2 += int(len(user_reviews_dict[uid])*0.9)-int(len(user_reviews_dict[uid])*0.8)
    #c3 += len(user_reviews_dict[uid])-int(len(user_reviews_dict[uid])*0.9)
    #print(c1,c2,c3)
        
print(f'Number of item appearing in train_set:{len(train_item_set)} or {len(item_id2idx)}')
print(f'Train/Val/Test size:{len(train_tuples)},{len(val_tuples)},{len(test_tuples)}')
#print(c1,c2,c3)

100%|██████████| 92667/92667 [00:02<00:00, 36986.90it/s]

Number of item appearing in train_set:99369 or 99369
Train/Val/Test size:463031,67801,203808





In [11]:
# generate and save train file
## user pos neighbor: 8, user neg neighbor: 8
## item pos neighbor: 10, item neg neighbor: 10

upos = 6
uneg = 3
ipos = 4
ineg = 2

random.seed(0)

with open(f'{output_dir}/{dataset}/train.tsv','w') as fout:
    for d in tqdm(train_tuples):
        
        # prepare sample pool for user and item
        user_pos_pool = set(deepcopy(train_user_pos_neighbor[d[0]]))
        user_neg_pool = set(deepcopy(train_user_neg_neighbor[d[0]]))
        item_pos_pool = set(deepcopy(train_item_pos_neighbor[d[1][1]]))
        item_neg_pool = set(deepcopy(train_item_neg_neighbor[d[1][1]]))
        
        if d[1][2] == 1:
            user_pos_pool.remove(d[1])
            item_pos_pool.remove((d[1][0],d[0],d[1][2]))
        elif d[1][2] == 0:
            user_neg_pool.remove(d[1])
            item_neg_pool.remove((d[1][0],d[0],d[1][2]))
        else:
            raise ValueError('Error!')
        
        user_pos_pool = list(user_pos_pool)
        item_pos_pool = list(item_pos_pool)
        user_neg_pool = list(user_neg_pool)
        item_neg_pool = list(item_neg_pool)
        random.shuffle(user_pos_pool)
        random.shuffle(user_neg_pool)
        random.shuffle(item_pos_pool)
        random.shuffle(item_neg_pool)
        
        # sample for user
        if len(user_pos_pool) >= upos:
            user_pos_samples = user_pos_pool[:upos]
        else:
            user_pos_samples = user_pos_pool + [('',-1)] * (upos-len(user_pos_pool))
        
        if len(user_neg_pool) >= uneg:
            user_neg_samples = user_neg_pool[:uneg]
        else:
            user_neg_samples = user_neg_pool + [('',-1)] * (uneg-len(user_neg_pool))
        
        # sample for item
        if len(item_pos_pool) >= ipos:
            item_pos_samples = item_pos_pool[:ipos]
        else:
            item_pos_samples = item_pos_pool + [('',-1)] * (ipos-len(item_pos_pool))
        
        if len(item_neg_pool) >= ineg:
            item_neg_samples = item_neg_pool[:ineg]
        else:
            item_neg_samples = item_neg_pool + [('',-1)] * (ineg-len(item_neg_pool))
        
        # prepare for writing file
        user_pos_text = '\t'.join([up[0] for up in user_pos_samples])
        user_pos_neighbor = '\t'.join([str(item_id2idx[up[1]]) if up[1] != -1 else str(-1) for up in user_pos_samples])
        user_neg_text = '\t'.join([un[0] for un in user_neg_samples])
        user_neg_neighbor = '\t'.join([str(item_id2idx[un[1]]) if un[1] != -1 else str(-1) for un in user_neg_samples])
        
        item_pos_text = '\t'.join([ip[0] for ip in item_pos_samples])
        item_pos_neighbor = '\t'.join([str(user_id2idx[ip[1]]) if ip[1] != -1 else str(-1) for ip in item_pos_samples])
        item_neg_text = '\t'.join([inn[0] for inn in item_neg_samples])
        item_neg_neighbor = '\t'.join([str(user_id2idx[inn[1]]) if inn[1] != -1 else str(-1) for inn in item_neg_samples])
        
        user_line = str(user_id2idx[d[0]]) + '\*\*' + user_pos_text + '\*\*' + user_neg_text + '\*\*' + user_pos_neighbor + '\*\*' + user_neg_neighbor
        item_line = str(item_id2idx[d[1][1]]) + '\*\*' + item_pos_text + '\*\*' + item_neg_text + '\*\*' + item_pos_neighbor + '\*\*' + item_neg_neighbor
        
        fout.write(user_line+'\$\$'+item_line+'\$\$'+str(d[1][2])+'\n')

100%|██████████| 463031/463031 [11:23<00:00, 677.16it/s] 


In [12]:
# generate and save val file (make sure to delete items that are not in train set)

random.seed(0)

valid_dev_edges = 0

with open(f'{output_dir}/{dataset}/val.tsv','w') as fout:
    for d in tqdm(val_tuples):
        # if item not in train item set, continue
        if d[1][1] not in train_item_set:
            continue

        # counting
        valid_dev_edges += 1

        # prepare sample pool for user and item
        user_pos_pool = deepcopy(train_user_pos_neighbor[d[0]])
        user_neg_pool = deepcopy(train_user_neg_neighbor[d[0]])
        item_pos_pool = deepcopy(train_item_pos_neighbor[d[1][1]])
        item_neg_pool = deepcopy(train_item_neg_neighbor[d[1][1]])
        
        random.shuffle(user_pos_pool)
        random.shuffle(user_neg_pool)
        random.shuffle(item_pos_pool)
        random.shuffle(item_neg_pool)
        
        # sample for user
        if len(user_pos_pool) >= upos:
            user_pos_samples = user_pos_pool[:upos]
        else:
            user_pos_samples = user_pos_pool + [('',-1)] * (upos-len(user_pos_pool))
        
        if len(user_neg_pool) >= uneg:
            user_neg_samples = user_neg_pool[:uneg]
        else:
            user_neg_samples = user_neg_pool + [('',-1)] * (uneg-len(user_neg_pool))
        
        # sample for item
        if len(item_pos_pool) >= ipos:
            item_pos_samples = item_pos_pool[:ipos]
        else:
            item_pos_samples = item_pos_pool + [('',-1)] * (ipos-len(item_pos_pool))
        
        if len(item_neg_pool) >= ineg:
            item_neg_samples = item_neg_pool[:ineg]
        else:
            item_neg_samples = item_neg_pool + [('',-1)] * (ineg-len(item_neg_pool))
        
        # prepare for writing file
        user_pos_text = '\t'.join([up[0] for up in user_pos_samples])
        user_pos_neighbor = '\t'.join([str(item_id2idx[up[1]]) if up[1] != -1 else str(-1) for up in user_pos_samples])
        user_neg_text = '\t'.join([un[0] for un in user_neg_samples])
        user_neg_neighbor = '\t'.join([str(item_id2idx[un[1]]) if un[1] != -1 else str(-1) for un in user_neg_samples])
        
        item_pos_text = '\t'.join([ip[0] for ip in item_pos_samples])
        item_pos_neighbor = '\t'.join([str(user_id2idx[ip[1]]) if ip[1] != -1 else str(-1) for ip in item_pos_samples])
        item_neg_text = '\t'.join([inn[0] for inn in item_neg_samples])
        item_neg_neighbor = '\t'.join([str(user_id2idx[inn[1]]) if inn[1] != -1 else str(-1) for inn in item_neg_samples])
        
        user_line = str(user_id2idx[d[0]]) + '\*\*' + user_pos_text + '\*\*' + user_neg_text + '\*\*' + user_pos_neighbor + '\*\*' + user_neg_neighbor
        item_line = str(item_id2idx[d[1][1]]) + '\*\*' + item_pos_text + '\*\*' + item_neg_text + '\*\*' + item_pos_neighbor + '\*\*' + item_neg_neighbor
        
        fout.write(user_line+'\$\$'+item_line+'\$\$'+str(d[1][2])+'\n')

print(f'Number of Valid Dev Edges:{valid_dev_edges} | Total:{len(val_tuples)}')

100%|██████████| 67801/67801 [01:28<00:00, 763.44it/s] 

Number of Valid Dev Edges:60323 | Total:67801





In [13]:
# generate and save test file (make sure to delete items that are not in train set)

random.seed(0)

valid_test_edges = 0

with open(f'{output_dir}/{dataset}/test.tsv','w') as fout:
    for d in tqdm(test_tuples):
        # if item not in train item set, continue
        if d[1][1] not in train_item_set:
            continue

        # counting
        valid_test_edges += 1

        # prepare sample pool for user and item
        user_pos_pool = deepcopy(train_user_pos_neighbor[d[0]])
        user_neg_pool = deepcopy(train_user_neg_neighbor[d[0]])
        item_pos_pool = deepcopy(train_item_pos_neighbor[d[1][1]])
        item_neg_pool = deepcopy(train_item_neg_neighbor[d[1][1]])
        
        random.shuffle(user_pos_pool)
        random.shuffle(user_neg_pool)
        random.shuffle(item_pos_pool)
        random.shuffle(item_neg_pool)
        
        # sample for user
        if len(user_pos_pool) >= upos:
            user_pos_samples = user_pos_pool[:upos]
        else:
            user_pos_samples = user_pos_pool + [('',-1)] * (upos-len(user_pos_pool))
        
        if len(user_neg_pool) >= uneg:
            user_neg_samples = user_neg_pool[:uneg]
        else:
            user_neg_samples = user_neg_pool + [('',-1)] * (uneg-len(user_neg_pool))
        
        # sample for item
        if len(item_pos_pool) >= ipos:
            item_pos_samples = item_pos_pool[:ipos]
        else:
            item_pos_samples = item_pos_pool + [('',-1)] * (ipos-len(item_pos_pool))
        
        if len(item_neg_pool) >= ineg:
            item_neg_samples = item_neg_pool[:ineg]
        else:
            item_neg_samples = item_neg_pool + [('',-1)] * (ineg-len(item_neg_pool))
        
        # prepare for writing file
        user_pos_text = '\t'.join([up[0] for up in user_pos_samples])
        user_pos_neighbor = '\t'.join([str(item_id2idx[up[1]]) if up[1] != -1 else str(-1) for up in user_pos_samples])
        user_neg_text = '\t'.join([un[0] for un in user_neg_samples])
        user_neg_neighbor = '\t'.join([str(item_id2idx[un[1]]) if un[1] != -1 else str(-1) for un in user_neg_samples])
        
        item_pos_text = '\t'.join([ip[0] for ip in item_pos_samples])
        item_pos_neighbor = '\t'.join([str(user_id2idx[ip[1]]) if ip[1] != -1 else str(-1) for ip in item_pos_samples])
        item_neg_text = '\t'.join([inn[0] for inn in item_neg_samples])
        item_neg_neighbor = '\t'.join([str(user_id2idx[inn[1]]) if inn[1] != -1 else str(-1) for inn in item_neg_samples])
        
        user_line = str(user_id2idx[d[0]]) + '\*\*' + user_pos_text + '\*\*' + user_neg_text + '\*\*' + user_pos_neighbor + '\*\*' + user_neg_neighbor
        item_line = str(item_id2idx[d[1][1]]) + '\*\*' + item_pos_text + '\*\*' + item_neg_text + '\*\*' + item_pos_neighbor + '\*\*' + item_neg_neighbor
        
        fout.write(user_line+'\$\$'+item_line+'\$\$'+str(d[1][2])+'\n')

print(f'Number of Valid Test Edges:{valid_test_edges} | Total:{len(test_tuples)}')

100%|██████████| 203808/203808 [04:28<00:00, 759.46it/s] 

Number of Valid Test Edges:182238 | Total:203808





In [14]:
# save side files

pickle.dump([upos,uneg,ipos,ineg],open(f'{output_dir}/{dataset}/neighbor_sampling.pkl','wb'))
pickle.dump(user_id2idx,open(f'{output_dir}/{dataset}/user_id2idx.pkl','wb'))
pickle.dump(item_id2idx,open(f'{output_dir}/{dataset}/item_id2idx.pkl','wb'))
pickle.dump([len(user_id2idx),len(item_id2idx),2],open(f'{output_dir}/{dataset}/node_num.pkl','wb'))

In [15]:
# save neighbor file

pickle.dump(train_user_pos_neighbor,open(f'{output_dir}/{dataset}/neighbor/train_user_pos_neighbor.pkl','wb'))
pickle.dump(train_user_neg_neighbor,open(f'{output_dir}/{dataset}/neighbor/train_user_neg_neighbor.pkl','wb'))
pickle.dump(train_item_pos_neighbor,open(f'{output_dir}/{dataset}/neighbor/train_item_pos_neighbor.pkl','wb'))
pickle.dump(train_item_neg_neighbor,open(f'{output_dir}/{dataset}/neighbor/train_item_neg_neighbor.pkl','wb'))