In [1]:
import gc
import glob
import json
import os
import string
import time
from io import StringIO

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

pd.set_option('future.no_silent_downcasting', True)
verbose = True

In [8]:
zip_df_path = 'files/zip_cleaned_data_FULL.csv'
zip_df_chunks_path = 'files/review_chunks/zip_cleaned_data_0*.csv'

# Load the cleaned YelpZip reviews into `zip_df`.
#   1. Prefer the single combined CSV at `zip_df_path`.
#   2. If it does not exist, rebuild it from the chunk files matching
#      `zip_df_chunks_path` (sorted by filename) and cache the result at
#      `zip_df_path` for the next run.
# Failures are still reported with a friendly message, but the exception is
# re-raised afterwards: swallowing it would leave `zip_df` undefined (or stale
# from an earlier run of this kernel) and the cells below would then fail with
# a misleading NameError or silently train on old data.
try:
    print('Trying to load combined and cleaned YelpZip file')
    zip_df = pd.read_csv(zip_df_path)
    print('File found. File loaded into zip_df')
except FileNotFoundError:
    print('File not found. Trying to load cleaned file chunks.')
    try:
        chunk_files = sorted(glob.glob(zip_df_chunks_path))
        if not chunk_files:
            # pd.concat([]) would raise an opaque "No objects to concatenate".
            raise FileNotFoundError(
                f'No chunk files match {zip_df_chunks_path!r}'
            )
        zip_df = pd.concat(
            [pd.read_csv(file) for file in chunk_files],
            ignore_index=True,
        )
        # index=False: otherwise the RangeIndex is written as an unnamed first
        # column and comes back as 'Unnamed: 0' when the cache is read again.
        zip_df.to_csv(zip_df_path, index=False)
        print('Files combined and loaded into zip_df.')
    except Exception as e:
        print(f'Loading chunks failed.\nError: {e}')
        raise
except Exception as e:
    print(r'Something weird happened ¯\_(ツ)_/¯')
    print(f'The weird thing is: {e}')
    raise

# A cache file written by an earlier version of this cell (without
# index=False) carries the stray index column; drop it so both load paths
# produce the same columns.
zip_df = zip_df.drop(columns=['Unnamed: 0'], errors='ignore')

Trying to load combined and cleaned YelpZip file
File not found. Trying to load cleaned file chunks.
Files combined and loaded into zip_df.


In [9]:
# Sanity check: frame dimensions, then how many rows carry each label value.
print(zip_df.shape)
label_counts = zip_df['label'].value_counts()
print(label_counts)

(600000, 3)
label
 1    520528
-1     79472
Name: count, dtype: int64


In [10]:
# zip_df is expected to have the columns [text, rating, label].
# Convert the review text into a sparse TF-IDF matrix using the default
# TfidfVectorizer settings. `vizer` is kept around because it is saved
# alongside the model at the end of the notebook.
# NOTE(review): the vectorizer is fit on every row here, i.e. before the
# train/test split performed in a later cell, so the held-out rows contribute
# to the vocabulary and IDF weights — confirm this is acceptable.
vizer = TfidfVectorizer()

review_text = zip_df['text']
x_text = vizer.fit_transform(review_text)

if verbose:
    print(f'Dataframe shape: {zip_df.shape}')
    print(f'Vectorized text shape: {x_text.shape}')

(600000, 358661)


In [11]:
# Build the model inputs: the TF-IDF text features with the rating appended
# as one extra column on the right.

# hstack needs a 2-D operand, so give the rating column shape (n_rows, 1).
rate_feature = zip_df['rating'].values[:, np.newaxis]

# Target labels.
y = zip_df['label']

# Feature matrix: vectorized text followed by the rating column.
X = hstack((x_text, rate_feature))

In [12]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Regularisation strengths (C) to sweep over.
c_params = [0.05, 0.1, 0.2, 0.5]

# One class-weight-balanced logistic regression per C value, keyed as
# 'clf_log<C>'. An earlier LinearSVC(C=c) variant of this sweep ('clf_svm<C>')
# was tried with the same loop and is no longer run.
models = {
    'clf_log' + str(c_value): LogisticRegression(
        C=c_value, max_iter=1000, class_weight='balanced'
    )
    for c_value in c_params
}
# Model names in training order; the evaluation cell iterates over this list.
clf_names = list(models)

for name, model in models.items():
    started = time.time()
    if verbose:
        print(f'Training {name}')
    # Fit on the training split only.
    model.fit(X_train, y_train)
    if verbose:
        print(f'finished after {time.time() - started} seconds')

if verbose:
    print('Finished training')

Training clf_log0.05
finished after 7.183547258377075 seconds
Training clf_log0.1
finished after 13.426305294036865 seconds
Training clf_log0.2
finished after 20.486284732818604 seconds
Training clf_log0.5
finished after 41.79753518104553 seconds
Finished training


In [13]:
# Score each trained model on the held-out split and print its per-class
# precision / recall / f1 report.
for name in clf_names:
    fitted = models[name]
    y_pred = fitted.predict(X_test)
    print(f'Classification report for {name}')
    report = classification_report(y_test, y_pred)
    print(report)

Classification report for clf_log0.05
              precision    recall  f1-score   support

          -1       0.25      0.69      0.36     15969
           1       0.93      0.67      0.78    104031

    accuracy                           0.68    120000
   macro avg       0.59      0.68      0.57    120000
weighted avg       0.84      0.68      0.73    120000

Classification report for clf_log0.1
              precision    recall  f1-score   support

          -1       0.25      0.69      0.36     15969
           1       0.93      0.68      0.79    104031

    accuracy                           0.68    120000
   macro avg       0.59      0.68      0.58    120000
weighted avg       0.84      0.68      0.73    120000

Classification report for clf_log0.2
              precision    recall  f1-score   support

          -1       0.25      0.68      0.37     15969
           1       0.93      0.69      0.79    104031

    accuracy                           0.69    120000
   macro avg    

In [15]:
# Persist the chosen model together with the fitted TF-IDF vectorizer.
#
# The logistic regression with C=0.1 was judged best at finding the negative
# (-1) reviews, so that is the one saved.
# The vectorizer must be saved too: the model's feature columns are the ones
# this fitted `vizer` produced (plus the rating column appended after them),
# so new text has to go through the same object before prediction; words it
# has never seen may give garbage.
#
# SVMs were also tried; they were bad at recalling the negative reviews and
# got higher accuracy mainly by labelling everything as real.

# joblib.dump does not create missing parent directories; make sure the
# output folder exists so the save cannot fail after all the training work.
os.makedirs('models', exist_ok=True)
joblib.dump(models['clf_log0.1'], 'models/support_log.pkl')
joblib.dump(vizer, 'models/support_vectorizer.pkl')

['models/support_vectorizer.pkl']