In [2]:
# To overwrite existing output files, just remove the .mark files

# This is specifically for cleaning reviews for predicting ratings
# going to clean the reviews here.
import ast
import sys
import string
import json
import random
import time
import autopep8
import joblib
import gc
import os
import glob
import numpy as np
import pandas as pd
from IPython.display import display
from pandas.api.types import is_numeric_dtype
from scipy.sparse import hstack
from io import StringIO

# to get rid of annoying warnings
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# ---- run flags ----
# global switch for the progress print statements
verbose = True
# when True the notebook tries smaller chunks and only does 5 chunks
test = False

# ---- sizes / labels ----
# lines per chunk (only used when test is False)
CHUNK_SIZE = 200000
# starting at 0 since ML models like 0 indexed variables
star_values = list(range(5))

# ---- input ----
rwpath = 'reviews/yelpReviews/yelp_academic_dataset_review.json'
# cols we want from the reviews
rwsub = ['user_id', 'business_id', 'text', 'date', 'stars']

# ---- output ----
# common prefix of every per-star chunk CSV
chunk_save_path = f"files/review_chunks/no_label/rating_group/review_chunk"
# a mark file only tells the chunking loop whether a chunk was already written
mark_path = f"files/review_chunks/no_label/rating_group/mark_{'_test' if test else ''}_0"
# where the running star distribution is stored
star_dist_path = 'files/star_dist.npy'


In [19]:
# lowercases the text and removes every ASCII punctuation character
def clean_text(text):
    """Return `text` lowercased with all `string.punctuation` characters removed.

    Whitespace, digits and any character outside `string.punctuation` are
    left as they are.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)

def print_star_dist(star_values):
    """Print the total of `star_values` and, per position, its share and count.

    star_values: sequence of counts; position i is reported as `Star_i`.
    When the total is 0 every share is reported as 0 to avoid dividing by zero.
    """
    total = sum(star_values)
    print(f'Total is {total}')
    for i, star in enumerate(star_values):
        if total == 0:
            percent = 0
        else:
            percent = star / total
        print(f"Star_{i}: {(percent * 100):.2f}%\t count = {star}")

In [20]:
# The review json is read line by line and written out in chunks, since the
# file is almost 5 gigs and should not be loaded in one go.
chunk_size = 200 if test else CHUNK_SIZE

# NOTE(review): the chunk CSV names and star_dist_path carry no test/non-test
# distinction (only mark_path does), so a test run looks like it can overwrite
# the files of a full run -- confirm whether that is intended.


def _empty_star_buckets():
    """Return a fresh mapping of every star value to an empty list of reviews."""
    return {value: [] for value in star_values}


def _save_chunk(chunk_dic, count, star_count):
    """Write one CSV per star value for chunk `count` and update `star_count`.

    chunk_dic:  mapping star value -> list of review dicts; it is emptied here
                so the raw records can be freed as soon as they are written.
    count:      1-based chunk number, used in the CSV file names.
    star_count: per-star running totals, updated in place and then saved to
                `star_dist_path`.
    """
    for star in star_values:
        records = chunk_dic.pop(star)
        if records:
            data_df = pd.DataFrame(records) # convert to df
            data_df['text'] = data_df['text'].apply(clean_text) # clean text
            data_df['stars'] = data_df['stars'] - 1 # change so stars are zero indexed
            data_df = data_df[rwsub] # only keep relevant cols
        else:
            # No review with this star value landed in the chunk. A frame built
            # from an empty list has no 'text' column and would raise KeyError,
            # so write a header-only file instead.
            data_df = pd.DataFrame(columns=rwsub)

        star_count[star] += len(data_df)
        data_df.to_csv(f"{chunk_save_path}_star{star}_0{count}.csv", index = False)

        del data_df, records
        gc.collect()

        if verbose:
            print(f'Chunk {count} for star value {star} saved.')

    # Save the distribution only once every star file of the chunk is written.
    # Saving after each star would leave partial counts on disk if the run died
    # mid-chunk, and the restart would then count that chunk twice.
    np.save(star_dist_path, star_count)


# make sure the output folders exist before anything is written
for out_path in (chunk_save_path, mark_path, star_dist_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

with open(rwpath, 'r', encoding='utf-8') as file:
    # one bucket per star value, these get saved grouped together
    chunk_dic = _empty_star_buckets()
    # chunk numbers that are already on disk (or were just written)
    chunk_count = []
    count = 1
    # to keep track of the distribution of stars
    # so that the stratification can be kept later on
    star_count = [0,0,0,0,0]

    print(f'Starting labeling')
    t0 = time.time()

    # check if the first chunk exists already (for restarts)
    # will also load the star_count distribution.
    check_path = f'{mark_path}{count}.mark'
    if os.path.exists(check_path):
        chunk_count.append(count)
        # will only load here, since if this is skipped we're starting from scratch anyway
        if os.path.exists(star_dist_path):
            star_count = np.load(star_dist_path)
        else:
            print(f'!!!WARNING: Mark file exists but star_dist does not. Delete mark files to get proper star distribution.!!!')
            sys.exit()

    for index, line in enumerate(file):
        # only parse and keep the line if this chunk hasn't been done yet
        if count not in chunk_count:
            data = json.loads(line)
            # the bucket key is star - 1 to make it zero indexed
            star = data['stars'] = int(data['stars'])
            chunk_dic[star - 1].append(data)

        # check if chunk is full
        if (index + 1) % chunk_size == 0:
            if count in chunk_count:
                print(f'Chunks {count} already exits. Skipping')
            else:
                _save_chunk(chunk_dic, count, star_count)
                if verbose:
                    print(f'chunk {count} finished at {time.time() - t0} seconds.\n')

            # mark this as finished by saving the mark
            mark_save_path = f"{mark_path}{count}.mark"
            with open(mark_save_path, 'w') as f:
                pass

            # fresh buckets for the next chunk
            chunk_dic = _empty_star_buckets()

            if count not in chunk_count:
                chunk_count.append(count)
            count += 1

            # look ahead: is the next chunk already on disk?
            mark_save_path = f"{mark_path}{count}.mark"
            if os.path.exists(mark_save_path):
                chunk_count.append(count)

            if test and count > 5:
                break

    # The file rarely ends exactly on a chunk boundary. Whatever is still
    # buffered here is the last, partial chunk and has to be written too,
    # otherwise those reviews are silently dropped. The buckets are empty
    # after a test-mode break or when the partial chunk was skipped.
    if any(chunk_dic.values()):
        _save_chunk(chunk_dic, count, star_count)
        if verbose:
            print(f'chunk {count} finished at {time.time() - t0} seconds.\n')
        with open(f"{mark_path}{count}.mark", 'w') as f:
            pass
        chunk_count.append(count)

np.save(star_dist_path, star_count)
if test:
    print('TEST RUN')
print(f'Finished chunking reviews after {(time.time() - t0) / 60.0} minutes. Files are seperated into chunks of {chunk_size} lines.')
print(f'\nDistribution of stars is as follows')
print_star_dist(star_count)


Starting labeling
Chunks 1 already exits. Skipping
Chunks 2 already exits. Skipping
Chunks 3 already exits. Skipping
Chunks 4 already exits. Skipping
Chunks 5 already exits. Skipping
TEST RUN
Finished chunking reviews after 5.239248275756836e-05 minutes. Files are seperated into chunks of 200 lines.
Distribution of stars is as follows
Total is 1000
Star_0: 11.20%	 count = 112
Star_1: 7.40%	 count = 74
Star_2: 12.60%	 count = 126
Star_3: 22.80%	 count = 228
Star_4: 46.00%	 count = 460


In [None]:
Finished chunking reviews after 0.016843199729919434 minutes. Files are seperated into chunks of 200 lines.
Distribution of stars is as follows
Star_0: 0% count = 112
Star_1: 0% count = 74
Star_2: 0% count = 126
Star_3: 0% count = 228
Star_4: 0% count = 460
