In [1]:
import os
import random
import json
import pickle
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict

import numpy as np
from transformers import BertTokenizerFast

In [2]:
# Seed Python's `random` module so the shuffles performed later are repeatable.
random.seed(0)

In [3]:
# Dataset selection. The raw file is read from f'{dataset}/{data_name}.json' and
# all generated files are written under f'{output_dir}/{dataset}/'.
dataset = 'movie'  # Electronics, movie, CDs, Apps
data_name = 'reviews_Movies_and_TV_5' # reviews_Electronics_5, reviews_CDs_and_Vinyl_5, reviews_Apps_for_Android_5
output_dir='xxx'  # placeholder value: set this to the real output root before running

# Generate Training Data (Edge Classification)

In [4]:
# read raw data
# The file holds one JSON object per line. It is streamed line by line instead
# of being loaded with readlines(), so the raw text of every line is not kept in
# memory next to the parsed records. JSON text is UTF-8, so the encoding is set
# explicitly rather than taken from the platform default.
data = []
with open(f'{dataset}/{data_name}.json', encoding='utf-8') as f:
    for line in tqdm(f):
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line.strip():
            data.append(json.loads(line))
# In-place shuffle; the resulting order depends on the current state of `random`.
random.shuffle(data)

100%|████████████████████████████████████████████████████████████████████████| 1697533/1697533 [00:28<00:00, 60303.87it/s]


In [5]:
# Number of records parsed from the raw file.
len(data)

1697533

In [6]:
# Display a single parsed record to see which fields are available.
data[7]

{'reviewerID': 'A1SHNOU2ODKIDG',
 'asin': 'B004MOLFR2',
 'reviewerName': 'Gregorio',
 'helpful': [0, 2],
 'reviewText': 'The Asylum has been producing low budget films for years and while their technical quality has now reached YouTube level, the films are getting worse.  ALMIGHTY THOR is no MEGASHARK or even INTERMEDIO or JOLLY ROGER.  What is especially infuriating about this one is that the film could have been better with just a few obvious fixes.  Yes, the script is incomprehensible and entirely awful and has a dazed Richard Grieco muttering lines like: "you don\'t understand the power of the bone."  Even so, there are mistakes in this awful film that are entirely without excuse.  The Asylum has gotten to the point where they are occasionally able to produce visual effects of a barely acceptable quality (a huge leap forward for them), yet they continue to insert effects shots that look to be less than half finished.  Standard photography is also frequently out of focus.  Okay . . 

In [7]:
# text processing function
def text_process(text):
    """Flatten `text` onto a single line and remove '$' and '*' characters.

    Line breaks ('\\r\\n', '\\n\\r', '\\n', '\\r') and tabs are each replaced by
    one space. '$' and '*' are deleted outright; '$' is part of the '\\$\\$' field
    separator used when the samples are written to disk, so it must not survive
    inside a text field.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    p_text = text
    # Handle the two-character line breaks before the bare '\n', so that each of
    # them collapses to ONE space. Every step continues from `p_text`; restarting
    # from `text` would throw the earlier replacements away.
    for line_break in ('\r\n', '\n\r', '\n'):
        p_text = p_text.replace(line_break, ' ')
    p_text = p_text.replace('\t', ' ')
    # NOTE(review): '\rm' is a carriage return followed by the letter 'm', so this
    # step also removes that 'm'. Possibly meant as the raw string r'\rm' -- confirm.
    # Kept as-is to avoid changing the generated data.
    p_text = p_text.replace('\rm', ' ')
    p_text = p_text.replace('\r', ' ')
    # Delete (do not replace with a space) the reserved characters.
    p_text = p_text.replace('$', '')
    p_text = p_text.replace('*', '')

    return p_text

In [8]:
## rate distribution
# A single pass over the records tallies each `overall` rating and assigns every
# `reviewerID` and every `asin` a contiguous integer index in first-seen order.

rate_dict = defaultdict(int)
user_id2idx, item_id2idx = {}, {}

for d in tqdm(data):
    rate_dict[d['overall']] += 1
    # len(...) is evaluated before setdefault runs, so an unseen ID receives the
    # next free index and an already-seen ID keeps the index it has.
    user_id2idx.setdefault(d['reviewerID'], len(user_id2idx))
    item_id2idx.setdefault(d['asin'], len(item_id2idx))

print(rate_dict)

100%|███████████████████████████████████████████████████████████████████████| 1697533/1697533 [00:03<00:00, 549152.44it/s]

defaultdict(<class 'int'>, {4.0: 382994, 5.0: 906608, 3.0: 201302, 1.0: 104219, 2.0: 102410})





In [10]:
## generate data samples
# One tuple per record: (cleaned review text, user index, item index, rating - 1).

samples = [
    (
        text_process(review['reviewText']),
        user_id2idx[review['reviewerID']],
        item_id2idx[review['asin']],
        review['overall'] - 1,
    )
    for review in tqdm(data)
]

100%|██████████| 1697533/1697533 [00:11<00:00, 144173.16it/s]


In [11]:
## split train/val/test as 7:1:2
# (for an 8:1:1 split, use 0.8 and 0.9 as the two fractions below)
# NOTE(review): the previous header comments described user/item review and
# neighbour dictionaries (user_pos_reviews, user_neg_reviews, item_pos_reviews,
# item_neg_reviews, train_user_neighbor, train_item_neighbor). None of them is
# created in this cell.

# Re-seed, then shuffle `samples` in place. Re-running this cell shuffles the
# already-shuffled list again.
random.seed(0)
random.shuffle(samples)

# samples[:train_bound] -> train, samples[train_bound:val_bound] -> val,
# samples[val_bound:] -> test.
sample_num = len(samples)
train_bound, val_bound = int(sample_num * 0.7), int(sample_num * 0.8)
print(train_bound, val_bound - train_bound, sample_num - val_bound)

1188273 169753 339507


In [12]:
# generate and save train file

# open(..., 'w') does not create missing directories, so create the folder first.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)

with open(f'{output_dir}/{dataset}/train.tsv','w') as fout:
    for s in tqdm(samples[:train_bound]):
        # One sample per line: text, user index, item index, integer label,
        # separated by the four characters \$\$ . The backslashes are spelled
        # '\\' because '\$' is not a valid escape sequence; the bytes written
        # are unchanged.
        fout.write('\\$\\$'.join((s[0], str(s[1]), str(s[2]), str(int(s[3])))) + '\n')

100%|██████████| 1188273/1188273 [00:23<00:00, 50297.30it/s] 


In [13]:
# generate and save val file

# open(..., 'w') does not create missing directories, so create the folder first.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)

with open(f'{output_dir}/{dataset}/val.tsv','w') as fout:
    for s in tqdm(samples[train_bound:val_bound]):
        # One sample per line: text, user index, item index, integer label,
        # separated by the four characters \$\$ . The backslashes are spelled
        # '\\' because '\$' is not a valid escape sequence; the bytes written
        # are unchanged.
        fout.write('\\$\\$'.join((s[0], str(s[1]), str(s[2]), str(int(s[3])))) + '\n')

100%|██████████| 169753/169753 [00:08<00:00, 19092.58it/s]


In [14]:
# generate and save test file

# open(..., 'w') does not create missing directories, so create the folder first.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)

with open(f'{output_dir}/{dataset}/test.tsv','w') as fout:
    for s in tqdm(samples[val_bound:]):
        # One sample per line: text, user index, item index, integer label,
        # separated by the four characters \$\$ . The backslashes are spelled
        # '\\' because '\$' is not a valid escape sequence; the bytes written
        # are unchanged.
        fout.write('\\$\\$'.join((s[0], str(s[1]), str(s[2]), str(int(s[3])))) + '\n')

100%|██████████| 339507/339507 [00:05<00:00, 64473.92it/s] 


In [15]:
# save side files

# open(..., 'wb') does not create missing directories, so create the folder first.
os.makedirs(f'{output_dir}/{dataset}', exist_ok=True)

# Each file is written inside a `with` block so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(f'{output_dir}/{dataset}/user_id2idx.pkl','wb') as fout:
    pickle.dump(user_id2idx, fout)

with open(f'{output_dir}/{dataset}/item_id2idx.pkl','wb') as fout:
    pickle.dump(item_id2idx, fout)

# [number of users, number of items, 5]
# NOTE(review): the literal 5 is presumably the number of rating classes -- confirm.
with open(f'{output_dir}/{dataset}/node_num.pkl','wb') as fout:
    pickle.dump([len(user_id2idx), len(item_id2idx), 5], fout)