## Collect the 1* reviews from active reviewers, and merge in the posters average rating

## end format:

## fullText(summary + reviewText) | reviewer_average_rating

In [None]:
import pandas as pd
from tqdm import tqdm
import swifter
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
reviewer_df = pd.read_csv("../Processed_Data/reviewer_data.csv")
active_reviewer_df = reviewer_df.query("active_reviewer == True")
active_reviewer_ids = set(active_reviewer_df['reviewerID'].unique())
active_reviewer_df.head(3)

Unnamed: 0,reviewerID,1*,2*,3*,4*,5*,activity,avg_rating,std_reviews,active_reviewer
8,A0024320PFCKHZ0HWVFN,0,0,1,0,5,6,4.666667,0.745356,True
15,A0040714X0G8QUCER7Q,0,0,1,2,4,7,4.428571,0.728431,True
27,A0081581LX99MYDYNRIB,1,0,1,0,5,7,4.142857,1.456863,True


In [3]:
reviewer_id_average_rating_map = dict (zip(active_reviewer_df['reviewerID'], active_reviewer_df['avg_rating']))

In [4]:
reviewer_id_average_rating_map['A0081581LX99MYDYNRIB']

4.142857142857143

In [5]:
# initialize empty csv, which will be populated
empty_df = pd.DataFrame(columns=['full_text', 'reviewer_avg_rating'])
empty_df.to_csv("../Training_Data/one_star_reviews.csv", index=False)

In [6]:
def get_avg_rating_by_reviewer_id(reviewer_row):
    reviewerID = reviewer_row['reviewerID']
    avg_rating = reviewer_id_average_rating_map[reviewerID]
    return avg_rating

In [7]:
num_rows = 234 * (10**6) # around 230 million rows
chunksize = 1000000
last_chunk = None

with tqdm(total=num_rows / chunksize) as pbar:
    for chunk in pd.read_json(
        "../Amazon_Review_Data/All_Amazon_Review.json.gz",
        lines=True,
        chunksize=chunksize,
        compression="gzip",
    ):
        # first filter to the 1* reviews
        one_star_reviews = chunk.loc[chunk['overall'] == 1]
        active_reviewer_reviews = one_star_reviews.loc[chunk['reviewerID'].isin(active_reviewer_ids)]
        save_data = active_reviewer_reviews[['reviewerID', 'summary', 'reviewText']]
        save_data['full_text'] = save_data['summary'] + " " + save_data['reviewText']
        save_data['reviewer_avg_rating'] = save_data.apply(get_avg_rating_by_reviewer_id, axis=1)
        save_data = save_data[['full_text', 'reviewer_avg_rating']]
        save_data.to_csv("../Training_Data/one_star_reviews.csv", mode="a", header=False)
        pbar.update(1)

 91%|██████████████████████████████████████████████████████████████████████       | 213/234.0 [1:38:28<07:39, 21.89s/it]IOStream.flush timed out
100%|█████████████████████████████████████████████████████████████████████████████| 234/234.0 [1:46:40<00:00, 27.35s/it]


## Seems like there is weird formatting stuff

In [None]:
training_data = pd.read_csv("../Training_Data/one_star_reviews.csv")

In [None]:
training_data.head(3)

Unnamed: 0,full_text,reviewer_avg_rating
0,Returning to Pretty crappy. Won&rsquo;t connec...,3.333333
1,Can not connect to ECHO Not happy. Can not con...,2.333333
2,Pathetic Who would think you would have to spe...,3.384615


In [15]:
bad_indices = []
for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0]):
    full_text, reviewer_avg_rating = row['full_text'], row['reviewer_avg_rating']
    if type(full_text) != str:
        bad_indices.append(index)

100%|██████████████████████████████████████████████████████████████████████| 4062204/4062204 [02:03<00:00, 33023.69it/s]


In [17]:
training_data = training_data.drop(bad_indices)

In [19]:
training_data.to_csv("../Training_Data/one_star_reviews.csv", index=False)