In [2]:
import json
import string
import time
import gc
from io import StringIO

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

pd.set_option('future.no_silent_downcasting', True)



In [19]:
# Test mode: when True the chunking cell uses 100-line chunks and stops after
# writing 3 of them; when False it runs over the entire dataset.
test = True
# When True, progress and elapsed time are printed after each chunk is saved.
verbose = True

In [25]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to convertLine.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def convertLine(text):
    """Clean one tab-separated review line and return it as a one-row DataFrame.

    The whole line is lower-cased, stripped of ``string.punctuation``
    characters and of its trailing line terminator, then split on the first
    three tabs so that any further tabs stay inside the last field.

    Parameters
    ----------
    text : str
        One raw line read from the review file.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are labelled by position (0, 1, 2, ...).
        Every cell is kept as a ``str``.
    """
    # Drop only the line terminator; other whitespace is part of the data.
    cleaned = text.rstrip('\r\n').lower().translate(_PUNCTUATION_TABLE)
    fields = cleaned.split('\t', 3)

    # Build the frame directly from the fields. Round-tripping through
    # json.dumps / pd.read_json ran dtype inference on every cell, which turned
    # numeric-looking review text (e.g. "10/10" -> "1010") into numbers.
    return pd.DataFrame([fields])


# The input is processed in chunks; test mode uses a much smaller chunk.
if test:
    chunk_size = 100
else:
    chunk_size = 100000


# Data preparation for training the fake-reviews model.
# Input files and the filename prefix of the cleaned per-chunk output.
review_path = 'reviews/YelpZip/reviewContent'
meta_path = 'reviews/YelpZip/metadata'
final_path = 'files/review_chunks/zip_cleaned_data_0'

# Positional column index -> column name for each input file, followed by the
# subset of columns that is kept from each.
review_cols = dict(enumerate(['user_id', 'blank', 'date', 'text']))
meta_cols = dict(enumerate(['user_id', 'blank', 'rating', 'label', 'date']))
review_col_mask = ['text']
meta_col_mask = ['rating', 'label']

def _write_chunk(review_frames, meta_frames, chunk_number):
    """Merge one chunk of parsed lines and write it to ``<final_path><chunk_number>.csv``.

    Parameters
    ----------
    review_frames, meta_frames : list of pandas.DataFrame
        One-row frames collected from the review and metadata files, in the
        same line order. Must be non-empty.
    chunk_number : int
        Suffix appended to ``final_path`` to build the output file name.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    reviews = pd.concat(review_frames, ignore_index=True).rename(columns=review_cols)[review_col_mask]
    meta = pd.concat(meta_frames, ignore_index=True).rename(columns=meta_cols)[meta_col_mask]

    # Both lists were filled in lockstep, so rows line up by position.
    merged = pd.concat([reviews, meta], axis=1)
    merged['rating'] = merged['rating'] / 5.0

    out_path = f'{final_path}{chunk_number}.csv'
    merged.to_csv(out_path, index=False)

    if verbose:
        print(f'Chunk {chunk_number} saved at {out_path}')
        print(f'Time elapsed: {time.time() - t0} seconds')
    return out_path


with open(review_path, 'r', encoding='utf-8') as f_review, \
        open(meta_path, 'r', encoding='utf-8') as f_meta:

    print('Starting zipping')
    t0 = time.time()
    count = 1
    chunk_r = []
    chunk_m = []

    # zip() stops at the end of the shorter file; the two files are assumed to
    # hold one line per review in the same order -- TODO confirm.
    # NOTE(review): building a one-row DataFrame per line is slow on the full
    # dataset; parsing into plain lists and building one frame per chunk would
    # be faster, but would change the dtype inference done by read_json.
    for line_f, line_m in zip(f_review, f_meta):
        # each line becomes a one-row DataFrame, buffered until the chunk is full
        chunk_r.append(convertLine(line_f))

        processed_line = json.dumps(line_m.split('\t'))
        chunk_m.append(pd.read_json(StringIO(processed_line), lines=True))

        if len(chunk_r) < chunk_size:
            continue

        save_path = _write_chunk(chunk_r, chunk_m, count)
        count += 1

        # start fresh buffers and release the frames of the chunk just written
        chunk_r = []
        chunk_m = []
        gc.collect()

        if test and count > 3:
            break

    # Write the rows left over after the last full chunk. Previously these
    # were silently dropped whenever the line count was not a multiple of
    # chunk_size.
    if chunk_r:
        save_path = _write_chunk(chunk_r, chunk_m, count)
        count += 1
        chunk_r = []
        chunk_m = []

print('Finished zipping')
if verbose:
    print(f'Time elapsed: {time.time() - t0} seconds')

Starting zipping
Chunk 1 saved at files/review_chunks/zip_cleaned_data_01.csv
Time elapsed: 0.4526481628417969 seconds
Chunk 2 saved at files/review_chunks/zip_cleaned_data_02.csv
Time elapsed: 0.8368091583251953 seconds
Chunk 3 saved at files/review_chunks/zip_cleaned_data_03.csv
Time elapsed: 1.223862886428833 seconds
Finished zipping
Time elapsed: 1.2240910530090332 seconds
