In [4]:
import json
import string
import time
import gc
from io import StringIO

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

pd.set_option('future.no_silent_downcasting', True)

verbose = True

In [13]:
# converts to lowercase and strip punctuation
def convertLine(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    text = json.dumps(text.split('\t', 3))
    text = pd.read_json(StringIO(text), lines=True)

    return text


# going to read it in chunks
chunk_size = 10


# training the fake reviews model to an acceptable accuracy.
# load the file
review_path = 'reviews/YelpZip/reviewContent'
meta_path = 'reviews/YelpZip/metadata'
final_path = 'files/review_chunks/zip_cleaned_data_0'

# actual col re-names and the masks were going to use
review_cols = {0: 'user_id', 1: 'blank', 2: 'date', 3: 'text'}
meta_cols = {0: 'user_id', 1: 'blank', 2: 'rating', 3: 'label', 4: 'date'}
review_col_mask = ['text']
meta_col_mask = ['rating', 'label']

with open(review_path, 'r', encoding='utf-8') as f_review:
    with open(meta_path, 'r', encoding='utf-8') as f_meta:
        chunk_r = []
        chunk_m = []

        chunk_r_df = pd.DataFrame()
        chunk_m_df = pd.DataFrame()
        
        print(f'Starting zipping')
        t0 = time.time()
        count = 1
        
        for index, (line_f, line_m) in enumerate(zip(f_review, f_meta)):
            # read each line as a dataframe then append to a list
            processed_line = convertLine(line_f)
            chunk_r.append(processed_line)

            processed_line = json.dumps(line_m.split('\t'))
            chunk_m.append(pd.read_json(StringIO(processed_line), lines=True))

            print(line_m)
            
            # save chunk to disk
            if (index + 1) % chunk_size == 0:
                chunk_r_df = pd.concat(chunk_r, ignore_index=True).rename(columns = review_cols)[review_col_mask]
                chunk_m_df = pd.concat(chunk_m, ignore_index=True).rename(columns = meta_cols)[meta_col_mask]
        
                # remove the columns we don't care about and then concat them into the final data frame
                # chunk_r_df = chunk_r_df[rwsub_less]
                # chunk_m_df = chunk_m_df[]
                final_df = pd.concat([chunk_r_df, chunk_m_df], axis=1)
                final_df['rating'] = final_df['rating'] / 5.0
                display(final_df)
                final_df.convert_dtypes()
                display(final_df)
                # write the cleaned and organized dataframe to a file
                save_path = f'{final_path}{count}.csv'
                final_df.to_csv(save_path, index=False)
                
                # clear out garbage
                del chunk_r, chunk_m, chunk_r_df, chunk_m_df, final_df
                gc.collect()
                
                if verbose:
                    print(f'Chunk {count} saved at {save_path}')
                    print(f'Time elapsed: {time.time() - t0} seconds')
                count += 1
                break

print('Finished zipping')
if verbose:
    print(f'Time elapsed: {time.time() - t0} seconds')

Starting zipping
linef
5044	0	2014-11-16	Drinks were bad, the hot chocolate was watered down and the latte had a burnt taste to it. The food was also poor quality, but the service was the worst part, their cashier was very rude.

      0  1         2                                                  3
0  5044  0  20141116  drinks were bad the hot chocolate was watered ...
linem
5044	0	1.0	-1	2014-11-16

["5044", "0", "1.0", "-1", "2014-11-16\n"]
5044	0	1.0	-1	2014-11-16

linef
5045	0	2014-09-08	This was the worst experience I've ever had a casual coffee/light fare place.  The server disappeared for 20 minutes, just talking to his friend by the window as my girlfriend and I sat dumbfounded that this dude had the nerve to do that on the job.  We're trying to make eye contact, but clearly getting paid to talk to his bud was more important to him. My girlfriend went up to the counter once the server disappeared into the back for another 5 minutes (what is this guy doing?) and asked if she s

Unnamed: 0,text,rating,label
0,drinks were bad the hot chocolate was watered ...,0.2,-1
1,this was the worst experience ive ever had a c...,0.2,-1
2,this is located on the site of the old spruce ...,0.6,-1
3,i enjoyed coffee and breakfast twice at toast ...,1.0,-1
4,i love toast the food choices are fantastic i...,1.0,-1
5,the egg on an english muffin their take on egg...,1.0,-1
6,wonderful relaxed vibe and fantastic homemade ...,1.0,-1
7,extremely slow kitchen i went with an hour to ...,0.2,1
8,i really wanted to love toast its quaint and c...,0.4,1
9,first brunch experience here in philly actuall...,0.8,1


Unnamed: 0,text,rating,label
0,drinks were bad the hot chocolate was watered ...,0.2,-1
1,this was the worst experience ive ever had a c...,0.2,-1
2,this is located on the site of the old spruce ...,0.6,-1
3,i enjoyed coffee and breakfast twice at toast ...,1.0,-1
4,i love toast the food choices are fantastic i...,1.0,-1
5,the egg on an english muffin their take on egg...,1.0,-1
6,wonderful relaxed vibe and fantastic homemade ...,1.0,-1
7,extremely slow kitchen i went with an hour to ...,0.2,1
8,i really wanted to love toast its quaint and c...,0.4,1
9,first brunch experience here in philly actuall...,0.8,1


Chunk 1 saved at files/review_chunks/zip_cleaned_data_01.csv
Time elapsed: 0.12780117988586426 seconds
Finished zipping
Time elapsed: 0.12829899787902832 seconds
