# Fake Review Detection using CNNs - Keras + NLTK

## Sample set: 20000 reviews
## Accuracy obtained: 63.74%


In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import numpy as np

Using TensorFlow backend.


In [24]:
data = pd.read_csv("E:\\Yelp\\Unfiltered Data\\YelpZip\\Customs\\SAMPLE", header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,5051,2014-12-05,Extremely slow kitchen went hour eat shift tol...,0,1.0,1
1,1,1,5071,2014-05-20,AWFUL half hour kill bypassed Capogiro go read...,0,1.0,1
2,2,2,5071,2014-05-20,Lovely spot Especially eat outside Food excell...,0,1.0,1
3,3,3,5076,2014-03-12,Sunday ordered soy caramel latte caramel vanil...,0,1.0,1
4,4,4,5077,2014-02-20,Toast youre killing Ive tried many times like ...,0,1.0,1


In [13]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [25]:
data[4] = data[4].progress_apply(lambda txt: str(txt))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [53]:
def binarize(n):
    if n==1:
        return 1
    else:
        return 0

data[7] = data[7].progress_apply(lambda n: binarize(n))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

In [30]:
from nltk.stem.porter import PorterStemmer

#Stemming
stemmer = PorterStemmer()

def stem(txt):
    words = []
    for x in txt.split():
        words.append(stemmer.stem(x))
    return words

data[4] = data[4].progress_apply(lambda txt: stem(txt))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [35]:
#POS tagging
data[4] = data[4].progress_apply(lambda tokens: nltk.pos_tag(tokens))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

In [37]:
#Join back the list objects and their POS tags for CountVectorizer
def join_back(txt):
    new = ''
    for x in txt:
        new += '_'.join(x)+" "
    new = new[:-1]
    return new

data[4] = data[4].progress_apply(lambda tokens: join_back(tokens))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

In [49]:
tokenizer = Tokenizer(num_words= 20000)
tokenizer.fit_on_texts(data[4])

sequences = tokenizer.texts_to_sequences(data[4])
data1 = pad_sequences(sequences, maxlen=60)

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data1, data[7], test_size=0.25)

In [169]:
model = Sequential()
model.add(Embedding(20000, 32, input_length=60))
model.add(Flatten())
model.add(Dense(1024, activation='relu', input_shape=(20000,)))
model.add(Dropout(0.2))
model.add(Dense(1024, activation='relu', input_shape=(20000,)))
model.add(Dropout(0.2))
model.add(Dense(1024, activation='relu', input_shape=(20000,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu', input_shape=(20000,)))
model.add(Dense(512, activation='relu', input_shape=(20000,)))
model.add(Dense(256, activation='sigmoid'))
model.add(Dense(256, activation='sigmoid'))

# model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])    

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 60, 32)            640000    
_________________________________________________________________
flatten_21 (Flatten)         (None, 1920)              0         
_________________________________________________________________
dense_144 (Dense)            (None, 1024)              1967104   
_________________________________________________________________
dropout_63 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_145 (Dense)            (None, 1024)              1049600   
_________________________________________________________________
dropout_64 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_146 (Dense)            (None, 1024)              1049600   
__________

In [170]:
# X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
# X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

model.fit(X_train, y_train, validation_split=0.20, batch_size = 500, epochs=3)

Train on 12000 samples, validate on 3000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2b3f9db9668>

In [171]:
scores = model.evaluate(X_test, y_test)



In [172]:
print("Accuracy: ", scores[1]*100, "%")

Accuracy:  63.739999999999995 %
