In [31]:
# To regenerate (overwrite) existing chunk files, just remove the .mark files

# This notebook specifically cleans reviews for predicting ratings;
# the review text is cleaned here.
import ast
import sys
import string
import json
import random
import time
import autopep8
import joblib
import gc
import os
import glob
import numpy as np
import pandas as pd
from IPython.display import display
from pandas.api.types import is_numeric_dtype
from scipy.sparse import hstack
from io import StringIO

# do get rid of annoying warnings
pd.set_option('future.no_silent_downcasting', True)

In [35]:
# Global switch for the progress print statements.
verbose = True
# Set to True so that the notebook tries smaller chunks and only does 5 chunks.
test = False
# Lines per chunk (only used when `test` is False).
CHUNK_SIZE = 5000

# Directory that receives both the chunk CSVs and their marker files.
_out_dir = "files/review_chunks/no_label/rating_group"
# Filename prefix of the chunk CSVs.
chunk_save_path = f"{_out_dir}/review_chunk"
# A mark file's only purpose is to tell the loop whether a chunk already exists.
_mark_suffix = '_test' if test else ''
mark_path = f"{_out_dir}/mark_{_mark_suffix}_0"
# Star labels start at 0 since ML models like 0 indexed variables.
star_values = list(range(5))

# Location of the raw review dump.
rwpath = 'reviews/yelpReviews/yelp_academic_dataset_review.json'
# Columns we want to keep from the reviews.
rwsub = ['user_id', 'business_id', 'text', 'date', 'stars']
                

In [36]:
# Translation table that deletes every ASCII punctuation character. Built once
# here because clean_text is applied to every review row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Lowercase `text` and strip ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lowercased text with every character in `string.punctuation`
        removed. Whitespace and non-ASCII characters are left untouched.
    """
    return text.lower().translate(_PUNCT_TABLE)

In [43]:
# The raw review file is almost 5 gigs, so it is streamed line by line and
# flushed to disk every `chunk_size` lines instead of being loaded at once.
chunk_size = 200 if test else CHUNK_SIZE


def _new_buckets():
    """Return a fresh mapping of star value -> empty list of review records."""
    return {value: [] for value in star_values}


def _mark_file(chunk_number):
    """Return the path of the marker file that flags a chunk as finished."""
    return f"{mark_path}{chunk_number}.mark"


def _save_chunk(buckets, chunk_number):
    """Clean one chunk, write one CSV per star value, then create its marker."""
    for star in star_values:
        # columns=rwsub keeps only the relevant columns and still yields a
        # well-formed (header-only) frame when no review in this chunk had
        # this rating, instead of failing on a missing 'text' column.
        star_df = pd.DataFrame(buckets[star], columns=rwsub)
        star_df['text'] = star_df['text'].apply(clean_text)  # clean text
        star_df['stars'] = star_df['stars'] - 1  # make the label zero indexed
        star_df.to_csv(f"{chunk_save_path}_star{star}_0{chunk_number}.csv", index = False)
        del star_df

        if verbose:
            print(f'Chunk {chunk_number} for star value {star} saved.')

    gc.collect()

    # The marker is written last so an interrupted save is redone on restart.
    with open(_mark_file(chunk_number), 'w'):
        pass


def _close_chunk(buckets, chunk_number, already_done):
    """Save the chunk, or report that it is being skipped on a restart."""
    if already_done:
        print(f'Chunks {chunk_number} already exits. Skipping')
        return
    _save_chunk(buckets, chunk_number)
    if verbose:
        print(f'chunk {chunk_number} finished at {time.time() - t0} seconds.\n')


# Make sure the output locations exist before any work is done.
for _target in (chunk_save_path, mark_path):
    _target_dir = os.path.dirname(_target)
    if _target_dir:
        os.makedirs(_target_dir, exist_ok=True)

# One bucket per star value; each chunk is saved grouped by rating.
chunk_dic = _new_buckets()
chunk_count = []  # chunk numbers that have been completed or skipped
count = 1         # number of the chunk currently being filled
pending = 0       # lines consumed since the last chunk boundary

print(f'Starting labeling')
t0 = time.time()

# Check if this chunk exists already (for restarts). Tracking this as a flag
# avoids scanning the list of finished chunks for every input line.
chunk_done = os.path.exists(_mark_file(count))

with open(rwpath, 'r', encoding='utf-8') as file:
    for line in file:
        pending += 1

        # Only parse and keep the line if this chunk hasn't been done yet.
        if not chunk_done:
            data = json.loads(line)
            # Raw stars are 1-5; the bucket key is star - 1 (zero indexed).
            star = data['stars'] = int(data['stars'])
            chunk_dic[star - 1].append(data)

        # Keep reading until the chunk is full.
        if pending < chunk_size:
            continue

        _close_chunk(chunk_dic, count, chunk_done)
        chunk_count.append(count)

        # Reset state for the next chunk.
        chunk_dic = _new_buckets()
        pending = 0
        count += 1
        chunk_done = os.path.exists(_mark_file(count))

        if test and count > 5:
            break

# Lines left over after the last full chunk form a final, shorter chunk;
# without this flush the tail of the file would be silently dropped.
if pending:
    _close_chunk(chunk_dic, count, chunk_done)
    chunk_count.append(count)

if test:
    print('TEST RUN')
print(f'Finished chunking reviews after {(time.time() - t0) / 60.0} minutes. Files are seperated into chunks of {chunk_size} lines.')


Starting labeling
Chunk 1 for star value 0 saved.
Chunk 1 for star value 1 saved.
Chunk 1 for star value 2 saved.
Chunk 1 for star value 3 saved.
Chunk 1 for star value 4 saved.
chunk 1 finished at 0.3356940746307373 seconds.

Chunk 2 for star value 0 saved.
Chunk 2 for star value 1 saved.
Chunk 2 for star value 2 saved.
Chunk 2 for star value 3 saved.
Chunk 2 for star value 4 saved.
chunk 2 finished at 0.6643383502960205 seconds.

Chunk 3 for star value 0 saved.
Chunk 3 for star value 1 saved.
Chunk 3 for star value 2 saved.
Chunk 3 for star value 3 saved.
Chunk 3 for star value 4 saved.
chunk 3 finished at 0.9719483852386475 seconds.

Chunk 4 for star value 0 saved.
Chunk 4 for star value 1 saved.
Chunk 4 for star value 2 saved.
Chunk 4 for star value 3 saved.
Chunk 4 for star value 4 saved.
chunk 4 finished at 1.252509355545044 seconds.

Chunk 5 for star value 0 saved.
Chunk 5 for star value 1 saved.
Chunk 5 for star value 2 saved.
Chunk 5 for star value 3 saved.
Chunk 5 for star va