In [6]:
import csv
import json
import os
import random
from random import sample

import pandas as pd
from tqdm import tqdm

In [7]:
# Directory (relative to the notebook) that the cells below join file names onto.
data_dir = "../Data/"

### Pass 1 - Get a mapping of users and number of reviews

In [None]:
# Module-level accumulator: reviewerID -> number of lines seen for that reviewer.
user_item_dict = {}


def populate_user_item_dict(data_path, total_rows):
    """Count how many records each reviewer has in a JSON-lines file.

    Each line of ``data_path`` is parsed as one JSON object and the counter for
    its ``reviewerID`` value in the module-level ``user_item_dict`` is
    incremented. Records without a ``reviewerID`` key are counted under ``None``.

    Args:
        data_path: path of the JSON-lines file to scan.
        total_rows: value used as the progress-bar total.
    """
    with open(data_path, encoding="utf-8") as reviews, \
            tqdm(total=total_rows, desc="Processing", leave=True, unit_scale=True) as progress:
        for raw_line in reviews:
            reviewer = json.loads(raw_line).get("reviewerID")
            if reviewer in user_item_dict:
                user_item_dict[reviewer] += 1
            else:
                user_item_dict[reviewer] = 1
            progress.update()

In [None]:
# Run pass 1 over the review dump; total_rows only sizes the progress bar.
total_rows = 157_260_921
data_path = os.path.join(data_dir, "All_Amazon_Review_5.json")
populate_user_item_dict(data_path=data_path, total_rows=total_rows)

In [None]:
# Persist the per-user counts so pass 2 can reuse them without rescanning the dump.
# The file must be opened for writing and handed to json.dump as its second argument.
with open(os.path.join(data_dir, "user_reviews.json"), "w", encoding="utf-8") as file:
    json.dump(user_item_dict, file)

### Pass 2 - Sample Data

In [8]:
# Load the per-user review counts from the location pass 1 writes them to
# (data_dir), so a fresh run of the notebook reads the file it just produced.
with open(os.path.join(data_dir, "user_reviews.json"), encoding="utf-8") as file:
    user_reviews_dict = json.load(file)

In [9]:
# Keep only reviewers with more than 10 recorded reviews (dict order is preserved).
frequent_reviewers = [
    reviewer
    for reviewer, n_reviews in user_reviews_dict.items()
    if n_reviews > 10
]
print(len(frequent_reviewers))

4198335


In [10]:
# Draw the two user samples from a dedicated, seeded generator so that the
# sampled CSVs and the splits derived from them can be reproduced on a re-run.
# The two samples are drawn independently; users_100k is not forced to be a
# subset of users_500k.
SAMPLE_SEED = 42
sampling_rng = random.Random(SAMPLE_SEED)
users_100k = set(sampling_rng.sample(frequent_reviewers, 100000))
users_500k = set(sampling_rng.sample(frequent_reviewers, 500000))
print(len(users_100k), len(users_500k))

100000 500000


In [11]:
def sampleDataJSON(totalSize:int, dataPath:str, output_100k:str, output_500k:str, columns:list):
    """Stream a large JSON-lines file and write two sampled CSV datasets.

    Every line of ``dataPath`` is parsed as one JSON object. A record is written
    to the 100k CSV when its ``reviewerID`` is in the notebook-level set
    ``users_100k`` and to the 500k CSV when it is in ``users_500k``; a record
    may be written to both. Only the keys listed in ``columns`` are kept, and a
    key missing from a record is written as an empty string.

    Args:
        totalSize (int): the size of the original dataset (progress-bar total)
        dataPath (str): original dataset path
        output_100k (str): sample_100k dataset output path
        output_500k (str): sample_500k dataset output path
        columns (list): list of columns required in the sampled dataset
    """
    # Write UTF-8 explicitly: the input is read as UTF-8 and the platform default
    # encoding may not be able to represent every character in the review text.
    # The handles get their own names so the path arguments are not shadowed.
    with open(output_100k, "w", newline="", encoding="utf-8") as csv_100k, \
            open(output_500k, "w", newline="", encoding="utf-8") as csv_500k:

        writer_100k = csv.DictWriter(csv_100k, columns)
        writer_100k.writeheader()
        writer_500k = csv.DictWriter(csv_500k, columns)
        writer_500k.writeheader()

        with open(dataPath, encoding="utf-8") as file, \
                tqdm(total=totalSize, desc="Processing", leave=True, unit_scale=True) as pbar:
            for line in file:
                row = json.loads(line)
                uid = row.get("reviewerID")
                in_100k = uid in users_100k
                in_500k = uid in users_500k
                if in_100k or in_500k:
                    # Build the projected record once and reuse it for both files.
                    record = {key: row.get(key, "") for key in columns}
                    if in_100k:
                        writer_100k.writerow(record)
                    if in_500k:
                        writer_500k.writerow(record)
                pbar.update()
    
                    
# Configuration for the sampling pass; every path is built from data_dir.
data_dir = "../Data/"
total_records = 157_260_921  # passed as totalSize, i.e. the progress-bar total
columns = [
    "verified", "asin", "reviewerID", "overall",
    "reviewText", "unixReviewTime", "summary", "vote",
]
input_json_path = os.path.join(data_dir, "All_Amazon_Review_5.json")
output_100k, output_500k = (
    os.path.join(data_dir, file_name)
    for file_name in ("sampled_100k.csv", "sampled_500k.csv")
)
sampleDataJSON(total_records, input_json_path, output_100k, output_500k, columns)

Processing: 100%|█████████▉| 157M/157M [1:01:58<00:00, 42.3kit/s]   


### Train - Validation - Test Split

In [42]:
# Read the 100k-user sample; low_memory=False parses each column in one pass.
df = pd.read_csv(
    os.path.join(data_dir, "sampled_100k.csv"),
    low_memory=False,
)

In [45]:
# Group each reviewer's rows together, newest review first, then renumber the
# rows 0..n-1 so positional and label indices coincide.
df = (
    df
    .sort_values(by=["reviewerID", "unixReviewTime"], ascending=[True, False])
    .reset_index(drop=True)
)

def update(val):
    """Convert a vote value to ``int``.

    String inputs have their thousands separators (commas) stripped first,
    e.g. ``"1,234"`` becomes ``1234``; any other value is passed to ``int``
    unchanged.
    """
    cleaned = val.replace(",", "") if type(val) is str else val
    return int(cleaned)

# Missing votes count as 0; every remaining value is converted to int by update().
votes_filled = df["vote"].fillna(0)
df["vote"] = votes_filled.apply(update)

In [47]:
df.head(30)

Unnamed: 0,verified,asin,reviewerID,overall,reviewText,unixReviewTime,summary,vote
0,True,B00ZJRHSRW,A0003492LQH8LJXPWDMZ,3.0,"So the shipping box was fine, but the actual b...",1524787200,Lose parts,0
1,True,B01FY9XN00,A0003492LQH8LJXPWDMZ,5.0,Product is great! This is my 3rd pair. There p...,1522886400,Product is great! This is my 3rd pair,0
2,True,B00U0I3N4M,A0003492LQH8LJXPWDMZ,5.0,Great quality. Fit as expected. Boyfriend wear...,1508025600,Five Stars,0
3,True,B00DJACZ56,A0003492LQH8LJXPWDMZ,5.0,Cute design and keeps my car a lot cooler,1508025600,Five Stars,0
4,True,B01F88RU6U,A0003492LQH8LJXPWDMZ,2.0,"Super cute top, however only got to wear once ...",1508025600,Fringe bad,0
5,True,B01FY9XN00,A0003492LQH8LJXPWDMZ,3.0,"Decent material, the shorts were a bit bigger ...",1504569600,Overall not bad for the price,0
6,False,B004PBCFG2,A0003492LQH8LJXPWDMZ,1.0,The stand it's self is a bit smaller than I ex...,1504569600,The stand it's self is a bit smaller than I ...,0
7,True,0373799713,A0003492LQH8LJXPWDMZ,5.0,Love the blaze series!! Always a good read!,1504137600,Five Stars,0
8,False,B01BXL5I88,A0003492LQH8LJXPWDMZ,2.0,"Cute gift, boyfriend loved it, but cheaply mad...",1502668800,"boyfriend loved it, but cheaply made",0
9,True,B018GEHIW4,A0003492LQH8LJXPWDMZ,5.0,Very cute shakers. Packaged very securely with...,1494633600,Absolutely love this purchase,0


In [112]:
# Build positional row indices for a per-user split.
#
# df is expected to be sorted by reviewerID (then newest review first) with a
# fresh 0..n-1 index, so each reviewer occupies one contiguous run of rows.
# Within each run the first 20% of rows go to test, the next 20% to validation
# and the remaining rows to train.
#
# Run lengths are taken from df itself rather than from an external count
# table: if such a table ever disagreed with the rows actually present, every
# later reviewer's boundaries would be shifted silently.
# sort=False keeps the groups in order of first appearance, i.e. row order.
reviews_per_user = df.groupby("reviewerID", sort=False).size()

start = 0
count = len(reviews_per_user)

train_indices = []
validation_indices = []
test_indices = []

with tqdm(total=count, desc="Processing", leave=True, unit_scale=True) as pbar:
    for num_records in reviews_per_user.tolist():
        test_end = start + int(num_records * 0.2)
        validation_end = start + int(num_records * 0.4)
        user_end = start + num_records
        test_indices.extend(range(start, test_end))
        validation_indices.extend(range(test_end, validation_end))
        train_indices.extend(range(validation_end, user_end))
        start = user_end
        pbar.update()

# Every row must have been assigned to exactly one of the three splits.
assert start == len(df), "split boundaries do not cover every row of df"
assert len(train_indices) + len(validation_indices) + len(test_indices) == len(df)

Processing: 100%|██████████| 100k/100k [00:21<00:00, 4.63kit/s] 


In [119]:
# Materialise the three splits by position, each with a fresh 0..n-1 index.
train_100k = df.iloc[train_indices].reset_index(drop=True)
validation_100k = df.iloc[validation_indices].reset_index(drop=True)
test_100k = df.iloc[test_indices].reset_index(drop=True)

# Write each split next to the sampled data, without the index column.
train_100k.to_csv(os.path.join(data_dir, "train_100k.csv"), index=False)
validation_100k.to_csv(os.path.join(data_dir, "validation_100k.csv"), index=False)
test_100k.to_csv(os.path.join(data_dir, "test_100k.csv"), index=False)