In [11]:
import json
import string
import time
import gc
from io import StringIO

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

pd.set_option('future.no_silent_downcasting', True)

verbose = True

In [10]:
# converts to lowercase and strip punctuation
def convertLine(text, is_review):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + '/xa0'))

    if is_review:
        text = json.dumps(text.split('\t', 3))
    else:
        text = json.dumps(text.split('\t'))
    text = pd.read_json(StringIO(text), lines=True)

    return text


# going to read it in chunks
chunk_size = 100000


# training the fake reviews model to an acceptable accuracy.
# load the file
review_path = 'reviews/YelpZip/reviewContent'
meta_path = 'reviews/YelpZip/metadata'
final_path = 'files/review_chunks/zip_cleaned_data_0'

# actual col re-names and the masks were going to use
review_cols = {0: 'user_id', 1: 'blank', 2: 'date', 3: 'text'}
meta_cols = {0: 'user_id', 1: 'blank', 2: 'rating', 3: 'label', 4: 'date'}
review_col_mask = ['text']
meta_col_mask = ['rating', 'label']

with open(review_path, 'r', encoding='utf-8') as f_review:
    with open(meta_path, 'r', encoding='utf-8') as f_meta:
        chunk_r = []
        chunk_m = []

        chunk_r_df = pd.DataFrame(columns = review_cols)
        chunk_m_df = pd.DataFrame(columns = meta_cols)
        
        print(f'Starting zipping')
        t0 = time.time()
        count = 1
        
        for index, (line_f, line_m) in enumerate(zip(f_review, f_meta)):
            # read each line as a dataframe then append to a list
            # review structure [index,
            chunk_r.append(convertLine(line_f, True))
            chunk_m.append(convertLine(line_m, False))

            # save chunk to disk
            if (index + 1) % chunk_size == 0:
                chunk_r_df = pd.concat(chunk_r, ignore_index=True).rename(columns = review_cols)[review_col_mask]
                chunk_m_df = pd.concat(chunk_m, ignore_index=True).rename(columns = meta_cols)[meta_col_mask]
        
                # remove the columns we don't care about and then concat them into the final data frame
                # chunk_r_df = chunk_r_df[rwsub_less]
                # chunk_m_df = chunk_m_df[]
                final_df = pd.concat([chunk_r_df, chunk_m_df], axis=1)
                final_df['rating'] = final_df['rating'] / 5.0
                final_df.convert_dtypes()
        
                # write the cleaned and organized dataframe to a file
                save_path = f'{final_path}{count}.csv'
                final_df.to_csv(save_path, index=False)
                
                if verbose:
                    print(f'Chunk {count} saved at {save_path}')
                    print(f'Time elapsed: {time.time() - t0} seconds')
                count += 1

print('Finished zipping')
if verbose:
    print(f'Time elapsed: {time.time() - t0} seconds')

Starting zipping
Chunk 1 saved at files/review_chunks/zip_cleaned_data_01.csv
Time elapsed: 341.54082322120667 seconds
Chunk 2 saved at files/review_chunks/zip_cleaned_data_02.csv
Time elapsed: 702.17418384552 seconds
Chunk 3 saved at files/review_chunks/zip_cleaned_data_03.csv
Time elapsed: 1085.8003134727478 seconds
Chunk 4 saved at files/review_chunks/zip_cleaned_data_04.csv
Time elapsed: 1472.3731112480164 seconds
Chunk 5 saved at files/review_chunks/zip_cleaned_data_05.csv
Time elapsed: 1884.4284663200378 seconds
Chunk 6 saved at files/review_chunks/zip_cleaned_data_06.csv
Time elapsed: 2302.901432275772 seconds
Finished zipping
Time elapsed: 2333.0447268486023 seconds


In [16]:
# clear out garbage
del chunk_r, chunk_m, chunk_r_df, chunk_m_df, final_df
gc.collect()

1435

In [None]:
# vectorize with tf-idf
vizer = TfidfVectorizer()

x_text = vizer.fit_transform(fakeDf['text_'])

if verbose:
    print(x_text.shape)

In [None]:
# combine the sparse matrix with the dense ratings column

# turn into 2d array
rate_feature = fakeDf['rating'].values.reshape(-1, 1)

# combine vectorized text and ratings
# data
X = hstack([x_text, rate_feature])

# target labels
y = fakeDf['label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

c_params = [1.0, 0.7, 0.5, 0.3]
clf_names = []

# train the thing || trying differnt C score on SVM and Logistic Regression
models = {}

for c in c_params:
    # create a svm models with this c value
    name = 'clf_svm' + str(c)
    clf_names.append(name)
    models[name] = LinearSVC(C=c)

    t0 = time.time()
    if verbose:
        print(f'Training {name}')
    # actual training
    models[name].fit(X_train, y_train)
    if verbose:
        print(f'finished after {time.time() - t0} seconds')

    # create a logistic regression models with this c value
    name = 'clf_log' + str(c)
    clf_names.append(name)
    models[name] = LogisticRegression(C=c, max_iter=1000)

    t0 = time.time()
    if verbose:
        print(f'Training {name}')
    # actual training
    models[name].fit(X_train, y_train)
    if verbose:
        print(f'finished after {time.time() - t0} seconds')

# train a single random forest classifier
name = 'clf_for'
clf_names.append(name)
t0 = time.time()
if verbose:
    print(f'Training {name}')
models[name] = RandomForestClassifier(n_estimators=100, random_state=42)
models[name].fit(X_train, y_train)
if verbose:
    print(f'finished after {time.time() - t0} seconds')

if verbose:
    print('Finished training')

In [None]:
for name in clf_names:
    y_pred = models[name].predict(X_test)
    print(f'Classification report for {name}')
    print(classification_report(y_test, y_pred))

In [None]:
# nothing really beat the SVM with C=1.0
# going to save that one and the vectorized (tf-idf)
# important because the model trained on this
# any new words might give me garbage.
joblib.dump(models['clf_svm1.0'], 'models/support_svm.pkl')
joblib.dump(models['clf_log1.0'], 'models/support_log.pkl')
joblib.dump(models['clf_for'], 'models/support_for.pkl')
joblib.dump(vizer, 'models/support_vectorizer.pkl')