In [1]:
# going to clean the reviews here.
import ast
import sys
import string
import json
import random
import time
import autopep8
import joblib
import gc
import os
import glob
import numpy as np
import pandas as pd
from IPython.display import display
from pandas.api.types import is_numeric_dtype
from scipy.sparse import hstack
from io import StringIO

# Opt in to pandas' future "no silent downcasting" behaviour, to get rid of
# the annoying warnings about it.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# global, for the print statements
verbose = True
# set to True so that the notebook tries smaller chunks and only does 5 chunks
test = False
# How confident does the model need to be to accept the pseudo-label
threshold = 0.8
# chunk_size (will only use this if test is False)
CHUNK_SIZE = 100000
# for calculating account age
today = pd.Timestamp.today()

chunk_save_path = f"files/review_chunks/user_clean/user_chunk{'_test' if test else ''}_0"
csv_save_path = 'files/user_cleaned.csv'

user_file_path = 'reviews/yelpReviews/yelp_academic_dataset_user.json'
ussub = ['user_id', 'review_count', 'yelping_since']

# going to have to read the json file in chunks, the thing is almost 5 gigs
chunk_size = 200 if test else CHUNK_SIZE


def _chunk_already_saved(path):
    """Report whether the chunk file at ``path`` can be reused from an earlier run.

    In test mode a stale file is deleted instead of reused, so a test run
    always rebuilds its chunks and this returns False.
    """
    if not os.path.exists(path):
        return False
    if test:
        os.remove(path)
        return False
    return True


def _clean_and_save_chunk(records, number, path, started_at):
    """Clean one chunk of raw user records and write it to ``path`` as CSV.

    Keeps only the columns listed in ``ussub`` and replaces ``yelping_since``
    with ``account_age_years``: the whole days between the join date and
    ``today``, divided by 365.25.

    records    -- list of dicts, one per parsed JSON line
    number     -- 1-based chunk number, used only in the progress messages
    path       -- destination CSV path for this chunk
    started_at -- ``time.time()`` value taken when the run started
    """
    print(f'Starting labeling of chunk {number}.')

    # remove the columns we don't care about
    users = pd.DataFrame(records)[ussub]

    joined = pd.to_datetime(users['yelping_since'])
    users = users.assign(account_age_years=(today - joined).dt.days / 365.25)
    users = users.drop(columns=['yelping_since'])

    # write each chunk to its own file, will combine them later
    users.to_csv(path, index=False)
    if verbose:
        print(f'chunk {number} finished at {time.time() - started_at} seconds. Saved at {path}\n')


# make sure the chunk directory exists, otherwise the first to_csv fails
os.makedirs(os.path.dirname(chunk_save_path), exist_ok=True)

with open(user_file_path, 'r', encoding='utf-8') as file:
    chunk = []
    # numbers of the chunks that are done (written now or reused from a previous run)
    chunk_count = []
    count = 1
    print(f'Starting cleaning')
    t0 = time.time()

    # check if this chunk exists already (for restarts)
    chunk_path = f'{chunk_save_path}{count}.csv'
    skip_chunk = _chunk_already_saved(chunk_path)

    for index, line in enumerate(file):
        # lines of an already-saved chunk are read past without being parsed
        if not skip_chunk:
            chunk.append(json.loads(line))

        # keep collecting until the chunk is full
        if (index + 1) % chunk_size != 0:
            continue

        if skip_chunk:
            print(f'Chunk {count} already exits at {chunk_path}. Skipping')
        else:
            _clean_and_save_chunk(chunk, count, chunk_path, t0)

        # drop the finished chunk before starting on the next one
        chunk = []
        gc.collect()

        chunk_count.append(count)
        count += 1
        chunk_path = f'{chunk_save_path}{count}.csv'
        skip_chunk = _chunk_already_saved(chunk_path)

        if test and count > 5:
            break

    # The file length is rarely an exact multiple of chunk_size: whatever is
    # left over is the last (partial) chunk and has to be written as well,
    # otherwise those users never make it into the combined csv.
    if chunk:
        _clean_and_save_chunk(chunk, count, chunk_path, t0)
        chunk_count.append(count)
        chunk = []

if test:
    print('TEST RUN')
print(f'Finished labeling reviews after {(time.time() - t0) / 60.0} minutes. Files are seperated into chunks of {chunk_size} lines.')

print('Combining chunks')
chunk_files = sorted(glob.glob(f'{chunk_save_path}*.csv'))
df = pd.concat([pd.read_csv(file) for file in chunk_files], ignore_index = True)
df.to_csv(csv_save_path, index=False)
print(f'Combined. Cleaned review files lives at {csv_save_path}')

Starting cleaning
Chunk 1 already exits at files/review_chunks/user_clean/user_chunk_01.csv. Skipping
Chunk 2 already exits at files/review_chunks/user_clean/user_chunk_02.csv. Skipping
Chunk 3 already exits at files/review_chunks/user_clean/user_chunk_03.csv. Skipping
Chunk 4 already exits at files/review_chunks/user_clean/user_chunk_04.csv. Skipping
Chunk 5 already exits at files/review_chunks/user_clean/user_chunk_05.csv. Skipping
Chunk 6 already exits at files/review_chunks/user_clean/user_chunk_06.csv. Skipping
Chunk 7 already exits at files/review_chunks/user_clean/user_chunk_07.csv. Skipping
Chunk 8 already exits at files/review_chunks/user_clean/user_chunk_08.csv. Skipping
Chunk 9 already exits at files/review_chunks/user_clean/user_chunk_09.csv. Skipping
Chunk 10 already exits at files/review_chunks/user_clean/user_chunk_010.csv. Skipping
Chunk 11 already exits at files/review_chunks/user_clean/user_chunk_011.csv. Skipping
Chunk 12 already exits at files/review_chunks/user_cle