In [2]:
import string
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


pd.set_option('future.no_silent_downcasting', True)

verbose = True

In [3]:
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on each call, because clean_text runs once per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Lowercase ``text`` and strip ASCII punctuation.

    Parameters
    ----------
    text : str or missing value
        Raw review text. A missing value (``None``, float ``NaN`` or
        ``pd.NA``, which is how pandas represents an empty cell) is treated
        as an empty review instead of raising ``AttributeError``.

    Returns
    -------
    str
        The lowercased text with every character of ``string.punctuation``
        removed, or ``''`` for a missing value.
    """
    # `text != text` is only true for NaN; the isinstance guard keeps the
    # comparison away from pd.NA, whose comparisons do not return a bool.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''
    return text.lower().translate(_PUNCTUATION_TABLE)


# Load the labelled fake-review dataset used to train the fake-review model.
reviews_raw = pd.read_csv('reviews/fakeReviews/fakeReviews.csv')
if verbose:
    print(f'Initial columns are {reviews_raw.columns}')

# Preprocessing, written as one chain so the raw frame is left untouched:
#   * label    -> binary; fake ('CG') becomes 1 (the target we're looking
#                 for) and 'OR' becomes 0
#   * text_    -> lowercased with punctuation stripped (clean_text)
#   * rating   -> divided by 5.0 to normalize it
#   * category -> dropped (not relevant for the yelp dataset, mismatch)
# convert_dtypes() then picks the best-fitting pandas dtypes for what is left.
fakeDf = (
    reviews_raw
    .assign(
        label=lambda frame: frame['label'].replace({'CG': 1, 'OR': 0}),
        text_=lambda frame: frame['text_'].apply(clean_text),
        rating=lambda frame: frame['rating'] / 5.0,
    )
    .drop(columns='category')
    .convert_dtypes()
)

if verbose:
    print('\nCurrent dataframe')
    display(fakeDf.head(5))

Initial columns are Index(['category', 'rating', 'label', 'text_'], dtype='object')

Current dataframe


Unnamed: 0,rating,label,text_
0,1.0,1,love this well made sturdy and very comfortab...
1,1.0,1,love it a great upgrade from the original ive...
2,1.0,1,this pillow saved my back i love the look and ...
3,0.2,1,missing information on how to use it but it is...
4,1.0,1,very nice set good quality we have had the set...


In [4]:
# Build TF-IDF features from the cleaned review text.
# NOTE(review): the vectorizer is fitted on every row of fakeDf, i.e. before
# the train/test split done later in the notebook, so its vocabulary and IDF
# weights also see the rows that end up in the test set. Confirm this is
# intended when reading the reported scores.
vizer = TfidfVectorizer()
review_texts = fakeDf['text_']
x_text = vizer.fit_transform(review_texts)

if verbose:
    # (number of reviews, number of TF-IDF features)
    print(x_text.shape)

(40432, 51256)


In [5]:
# Combine the sparse TF-IDF matrix with the dense rating column.

# Rating as an explicit float64 column vector of shape (n_rows, 1).
# to_numpy(dtype=...) hands scipy a plain ndarray; .values would hand it the
# pandas extension array produced by convert_dtypes(), leaving the dtype of
# the dense block to pandas' implicit array conversion.
rate_feature = fakeDf['rating'].to_numpy(dtype='float64').reshape(-1, 1)

# Feature matrix: the TF-IDF columns followed by the rating as the last column.
# CSR is requested explicitly (rather than leaving the format to hstack)
# because the matrix is row-sliced by train_test_split afterwards.
X = hstack([x_text, rate_feature], format='csr')

# Target labels: 1 = fake ('CG'), 0 = 'OR'.
y = fakeDf['label']

In [6]:
# One seed for the split and for every estimator that accepts one, so that a
# Restart & Run All reproduces the same models.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# C values tried for both the linear SVM and the logistic regression.
c_params = [1.0, 0.7, 0.5, 0.3]

# clf_names records the training order; models maps each name to its fitted
# estimator. Both are read by later cells.
clf_names = []
models = {}


def _fit_timed(name, estimator, features, labels):
    """Fit ``estimator`` and report the elapsed time when ``verbose`` is set.

    Parameters
    ----------
    name : str
        Label used in the progress messages.
    estimator : object with a ``fit(features, labels)`` method
        Unfitted classifier; it is fitted in place.
    features, labels
        Training data passed straight through to ``estimator.fit``.

    Returns
    -------
    The same ``estimator`` instance, now fitted.
    """
    if verbose:
        print(f'Training {name}')
    # perf_counter is the clock intended for measuring short durations;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    start = time.perf_counter()
    estimator.fit(features, labels)
    if verbose:
        print(f'finished after {time.perf_counter() - start} seconds')
    return estimator


# Candidates in training order: for each C an SVM followed by a logistic
# regression, then a single random forest at the end.
candidates = []
for c in c_params:
    candidates.append((f'clf_svm{c}', LinearSVC(C=c, random_state=SEED)))
    candidates.append((f'clf_log{c}', LogisticRegression(C=c, max_iter=1000)))
candidates.append(('clf_for', RandomForestClassifier(n_estimators=100, random_state=SEED)))

for name, estimator in candidates:
    clf_names.append(name)
    models[name] = _fit_timed(name, estimator, X_train, y_train)

if verbose:
    print('Finished training')


Training clf_svm1.0
finished after 0.4129364490509033 seconds
Training clf_log1.0
finished after 6.531688451766968 seconds
Training clf_svm0.7
finished after 0.3484346866607666 seconds
Training clf_log0.7
finished after 7.3156983852386475 seconds
Training clf_svm0.5
finished after 0.31402063369750977 seconds
Training clf_log0.5
finished after 3.416294574737549 seconds
Training clf_svm0.3
finished after 0.2648434638977051 seconds
Training clf_log0.3
finished after 2.640948534011841 seconds
Training clf_for
finished after 83.78372406959534 seconds
Finished training


In [7]:
# Score every trained classifier on the held-out split, in training order.
for name in clf_names:
    classifier = models[name]
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred)
    # Header line, then the per-class precision/recall/F1 table.
    print(f'Classification report for {name}', report, sep='\n')

Classification report for clf_svm1.0
              precision    recall  f1-score   support

         0.0       0.91      0.91      0.91      4071
         1.0       0.91      0.91      0.91      4016

    accuracy                           0.91      8087
   macro avg       0.91      0.91      0.91      8087
weighted avg       0.91      0.91      0.91      8087

Classification report for clf_log1.0
              precision    recall  f1-score   support

         0.0       0.89      0.92      0.91      4071
         1.0       0.92      0.89      0.90      4016

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087

Classification report for clf_svm0.7
              precision    recall  f1-score   support

         0.0       0.91      0.91      0.91      4071
         1.0       0.91      0.91      0.91      4016

    accuracy                           0.91      8087
   macro avg     

In [9]:
# Nothing really beat the SVM with C=1.0, so that is the primary model. The
# C=1.0 logistic regression and the random forest are saved alongside it.
# The fitted TF-IDF vectorizer is saved too: the models were trained on its
# vocabulary, so new text must go through this exact vectorizer (any new
# words might give garbage).
MODEL_DIR = Path('models')
# joblib.dump does not create missing directories, so make sure it exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# File name -> fitted object to persist.
artifacts = {
    'support_svm.pkl': models['clf_svm1.0'],
    'support_log.pkl': models['clf_log1.0'],
    'support_for.pkl': models['clf_for'],
    'support_vectorizer.pkl': vizer,
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, MODEL_DIR / filename)

['models/support_vectorizer.pkl']