In [1]:
# going to clean the reviews here.
import ast
import string
import json
import random
import time
import autopep8
import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from pandas.api.types import is_numeric_dtype
from scipy.sparse import hstack
from io import StringIO

# opt in to pandas' future no-silent-downcasting behaviour to get rid of annoying warnings
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Global switch that turns the progress print statements on or off.
verbose = True

# Load the fitted TF-IDF vectorizer and the three support classifiers.
vizer = joblib.load('models/support_vectorizer.pkl')

support_svm = joblib.load('models/support_svm.pkl')
support_log = joblib.load('models/support_log.pkl')
support_for = joblib.load('models/support_for.pkl')
# Lookup of model key -> loaded model; key order is svm, log, for.
support_models = dict(
    zip(('svm', 'log', 'for'), (support_svm, support_log, support_for))
)

# TODO: this should probably live in a separate cell so the dataframes are
# not constantly reloaded.
fDefPath = 'reviews/yelpReviews/yelp_academic_dataset_'
# Short aliases for the dataset names so they only need changing in one place.
BS, CH, TI, RW, US = 'business', 'checkin', 'tip', 'review', 'user'

# Column subsets of interest for each dataset.
bssub = ['business_id', 'postal_code',
         'review_count', 'attributes', 'categories']
ussub = ['user_id', 'review_count', 'yelping_since']
# Full review subset, intended for the final training run.
rwsub = ['user_id', 'business_id', 'stars', 'text', 'date']
# Reduced review subset: only the columns needed for labeling.
rwsub_less = ['stars', 'text']

# File path of each dataset: shared prefix + dataset name + '.json'.
bspath, chpath, tipath, rwpath, uspath = (
    f'{fDefPath}{dataset}.json' for dataset in (BS, CH, TI, RW, US)
)

# Prefix of the per-chunk output files; the chunk number and '.csv' get appended.
chunk_save_path = 'files/review_chunks/review_chunk_0'

In [3]:
# clean and then predict. returns predictions
def clean_predict(in_chunk_df, threshold=0.8):
    """Clean a chunk of reviews and label it with every support model.

    NOTE: ``in_chunk_df`` is modified in place. Its ``text`` column is
    replaced by the cleaned text and its ``stars`` column is divided by 5.0,
    so the caller's frame holds the cleaned / normalised values afterwards.

    Parameters
    ----------
    in_chunk_df : pandas.DataFrame
        Frame with a ``text`` column (strings) and a numeric ``stars`` column.
    threshold : float, default 0.8
        Score a row must exceed to be labelled 1. For the ``'svm'`` entry it
        is compared against ``decision_function`` output, for every other
        entry against the second column of ``predict_proba``.
        NOTE(review): if the SVM is a scikit-learn model, decision_function
        yields a signed distance rather than a probability -- confirm that
        the same cut-off is intended on that scale.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per key of ``support_models`` (in dictionary order),
        indexed like ``in_chunk_df`` so it can be concatenated column-wise
        with the input without misaligning rows.
    """
    # clean text and then normalize rating (mutates the caller's frame)
    in_chunk_df['text'] = in_chunk_df['text'].apply(clean_text)
    in_chunk_df['stars'] = in_chunk_df['stars'] / 5.0

    # vectorize chunk's text
    x_text = vizer.transform(in_chunk_df['text'])
    # convert stars (rating) to a 2d column so it can be stacked next to x_text
    rate_feature = in_chunk_df['stars'].values.reshape(-1, 1)

    # feature matrix for the support models: tf-idf columns + rating column
    X = hstack([x_text, rate_feature])

    # Collect one label array per model and build the frame once, instead of
    # growing a DataFrame with pd.concat on every iteration.
    labels = {}
    for model_name, model in support_models.items():
        if model_name == 'svm':
            mask = model.decision_function(X) > threshold
        else:
            # column 1 of predict_proba is compared against the threshold
            mask = model.predict_proba(X)[:, 1] > threshold
        labels[model_name] = np.where(mask, 1, 0)

    # Reuse the input's index so a column-wise concat in the caller lines up
    # even when the chunk does not carry a default 0..n-1 index.
    y_pred = pd.DataFrame(labels, index=in_chunk_df.index)

    if verbose:
        print (len(y_pred))
        print (in_chunk_df.shape)

    return y_pred

# Translation table that deletes every character in string.punctuation.
# Built once here rather than inside clean_text, which runs once per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# converts to lowercase and strip punctuation
def clean_text(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed.

    Only the characters in ``string.punctuation`` are stripped; whitespace,
    digits and non-ASCII symbols are left untouched.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [4]:
# The review JSON is almost 5 GB, so it is streamed in chunks of `chunk_size`
# rows instead of being loaded in one go.
chunk_size = 10000
# Number of chunks to label before stopping; set to None to label the whole
# file. Kept at 1 for now because only the cleaning/labeling is being tried out.
max_chunks = 1

# Let pandas parse `chunk_size` JSON lines at a time. This replaces building
# one DataFrame per line and concatenating them, and it also yields the final,
# shorter chunk when the whole file is processed.
with pd.read_json(rwpath, lines=True, chunksize=chunk_size,
                  encoding='utf-8') as reader:
    count = 1
    print(f'Starting labeling')
    t0 = time.time()
    for raw_chunk in reader:
        # keep only the columns we care about; reset the index because chunks
        # after the first one do not start at 0, and the predictions are
        # joined column-wise below
        chunk_df = raw_chunk[rwsub_less].reset_index(drop=True)
        # clean_predict cleans chunk_df in place and returns one 0/1 column
        # per support model
        chunk_df = pd.concat([chunk_df, clean_predict(chunk_df)], axis = 1)

        # write each chunk to its own file, will combine them later
        chunk_path = f'{chunk_save_path}{count}.csv'
        chunk_df.to_csv(chunk_path, index=False)

        if verbose:
            print(f'chunk {count} finished')

        count += 1
        if max_chunks is not None and count > max_chunks:
            break

print(f'Finished labeling reviews after {(time.time() - t0) / 60.0} minutes. Written as {chunk_save_path}number, will need to combined later.')

Starting labeling
10000
(10000, 2)
chunk 1 finished
Finished labeling reviews after 0.563196583588918 minutes. Written as files/review_chunks/review_chunk_0number, will need to combined later.
