# Initial Mapping of Amazon Reviews

In [1]:
import pandas as pd
from random import randint
from tqdm import tqdm
import pickle
import multiprocessing
pd.options.mode.chained_assignment = None  # default='warn' # this is needed because setting temporary column value on chunk

16

In [2]:
import os

# Star ratings that get their own count column ("1*" ... "5*").
ratings = [1,2,3,4,5]
# Header of the intermediate CSV: the reviewer key followed by one count column per rating.
columns = ['reviewerID', *[f'{rating}*' for rating in ratings]]
# Initialize an empty csv containing only the header row.
# `columns` is passed directly (not wrapped in another list), otherwise pandas
# builds a one-level MultiIndex instead of plain column labels.
empty_df = pd.DataFrame(columns=columns)
# The header must go into the same file that the chunk loop appends to with
# header=False; make sure its directory exists before writing.
os.makedirs("./Intermediate_Datasets", exist_ok=True)
empty_df.to_csv("./Intermediate_Datasets/reviewer_data_non_reduced.csv", index=False)

In [3]:
# This is essentially a miniature map-reduce:
#   1. map: append the 1* -> 5* indicator columns, using vectorization (which already uses all cores),
#   2. remove the columns that are no longer needed, to reduce memory usage,
#   3. reduce: group by reviewerID, adding up the indicator columns.

def process_chunk(review_chunk, rating_values=(1, 2, 3, 4, 5)):
    """Count, per reviewer, how many reviews in the chunk carry each rating.

    Args:
        review_chunk: DataFrame with at least the columns 'reviewerID' and
            'overall'. It is not modified.
        rating_values: Rating values to count; each one produces a column
            named '<rating>*'. Defaults to the ratings 1 through 5.

    Returns:
        DataFrame indexed by 'reviewerID' with one integer column per rating
        value, holding the number of rows in the chunk whose 'overall' equals
        that rating.
    """
    overall = review_chunk['overall']
    # One 0/1 indicator column per rating; summing them per reviewer gives counts.
    indicator_columns = {
        f'{rating}*': (overall == rating).astype(int) for rating in rating_values
    }
    # Keep only the grouping key and build the indicators with `assign`, which
    # returns a new frame instead of writing into a slice of the input (so no
    # SettingWithCopyWarning and no need to drop 'overall' afterwards).
    per_review = review_chunk[['reviewerID']].assign(**indicator_columns)
    return per_review.groupby('reviewerID').sum()

In [4]:
num_rows = 230 * (10**6) # around 230 million rows (an estimate, only used to size the progress bar)
chunksize = 1000000

# Stream the gzipped JSON-lines file chunk by chunk, aggregate each chunk with
# process_chunk, and append the per-chunk result to the intermediate CSV.
# The progress-bar total is a whole number of chunks, rounded up so that a
# final partial chunk is counted as well.
with tqdm(total=(num_rows + chunksize - 1) // chunksize) as pbar:
    for chunk in pd.read_json(
        "../Amazon_Review_Data/All_Amazon_Review.json.gz",
        lines=True,
        chunksize=chunksize,
        compression="gzip",
    ):
        processed_chunk = process_chunk(chunk)
        # need to specify header=False to prevent column names from being added as a row
        # mode="a" to append to existing csv
        # index=True (default) is kept on purpose: after the groupby in process_chunk the
        # reviewerID is the index, so it has to be written out as the first csv column
        processed_chunk.to_csv("./Intermediate_Datasets/reviewer_data_non_reduced.csv", mode="a", header=False)
        pbar.update(1)

234it [56:54, 14.59s/it]                                                                                                
