

In [2]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from scipy.sparse import hstack

pd.set_option('future.no_silent_downcasting', True)

verbose = True

In [3]:
# Translation table that deletes every character in string.punctuation.
# Built once here so clean_text does not rebuild it on every call
# (the function is applied to each review individually).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Lowercase ``text`` and strip ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        ``text`` converted to lowercase with every character listed in
        ``string.punctuation`` removed. Whitespace, digits and any
        non-ASCII punctuation are left untouched.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

# Fake-reviews dataset: load and preprocess.
# This frame is the training data for the fake-review model.
fakeDf = pd.read_csv('reviews/fakeReviews/fakeReviews.csv')
if verbose:
    print(f'Initial columns are {fakeDf.columns}')

# Preprocessing, written as a single chain that yields a new frame:
#   label    -> binary target: 'CG' becomes 1 (fake, the class we are looking
#               for) and 'OR' becomes 0
#   text_    -> lowercased with punctuation stripped (see clean_text)
#   rating   -> normalized by dividing by 5.0
#   category -> dropped (not relevant for the yelp dataset, mismatch)
# convert_dtypes() then lets pandas pick the best-fitting dtype per column.
fakeDf = (
    fakeDf
    .assign(
        label=fakeDf['label'].replace({'CG': 1, 'OR': 0}),
        text_=fakeDf['text_'].apply(clean_text),
        rating=fakeDf['rating'] / 5.0,
    )
    .drop(columns='category')
    .convert_dtypes()
)

if verbose:
    print('\nCurrent dataframe')
    display(fakeDf.head(5))

Initial columns are Index(['category', 'rating', 'label', 'text_'], dtype='object')

Current dataframe


Unnamed: 0,rating,label,text_
0,1.0,1,love this well made sturdy and very comfortab...
1,1.0,1,love it a great upgrade from the original ive...
2,1.0,1,this pillow saved my back i love the look and ...
3,0.2,1,missing information on how to use it but it is...
4,1.0,1,very nice set good quality we have had the set...


In [5]:
# vectorize with tf-idf
# `vizer` is created with default TfidfVectorizer settings and kept as a
# module-level name so the fitted vocabulary can be reused after this cell.
vizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights from the cleaned review text and transform
# that same text into a sparse (n_reviews, n_terms) TF-IDF matrix in one step.
# NOTE(review): the vectorizer is fitted on the full column before any
# train/test split visible in this notebook; if a split happens later, the
# learned vocabulary and IDF weights will include held-out documents — confirm
# whether that is intended.
xText = vizer.fit_transform(fakeDf['text_'])

# Report (n_reviews, n_terms) as a quick sanity check.
if verbose:
    print(xText.shape)

(40432, 51256)


In [7]:
# Combine the sparse TF-IDF matrix with the dense ratings column.

# Ratings as an explicit float64 NumPy column vector of shape (n_reviews, 1).
# `fakeDf` went through convert_dtypes(), so `.values` may hand back a pandas
# extension array rather than an ndarray; to_numpy(dtype=float) gives SciPy a
# plain NumPy array to stack.
rateFeature = fakeDf['rating'].to_numpy(dtype=float).reshape(-1, 1)

# Feature matrix: the text features followed by the rating as the last column.
# The output format is requested explicitly: without `format`, hstack picks one
# itself (typically COO, which does not support row indexing), whereas CSR can
# be sliced by row for train/test splitting.
X = hstack([xText, rateFeature], format='csr')

# Target labels (1 = fake / 'CG', 0 = original / 'OR').
y = fakeDf['label']

In [None]:
Xtrain