# Fake Review Detection Model

### Import dependencies

In [3]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import gensim
from gensim.utils import simple_preprocess

import nltk
import nltk.tag
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

import data_prep
import ngram
import linguistic_feature

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [4]:
df = data_prep.import_chi_review()
display(df[:5])

Unnamed: 0,Label,Rating,Ori_Review
0,1,1,Affinia Chicago is one of the worst hotels I h...
1,1,1,I recently stayed at the Affina Chicago hotel ...
2,1,1,I stayed at the Affina Chicago for my annivers...
3,1,1,If you are looking for a high end hotel on the...
4,1,1,I just returned from a long weekend in Chicago...


### Train-Test Split

In [5]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

### Preprocess & Feature Extraction

In [6]:
df_train = data_prep.preprocess_ngram(df_train)
df_test = data_prep.preprocess_ngram(df_test)

In [7]:
unigrams, bigrams, trigrams, unigrams_dict, bigrams_dict, trigrams_dict = ngram.find_universal_ngrams(df_train['PP_Review'])

In [8]:
df_train['Ngram'] = df_train['Word_List'].apply(lambda x: ngram.find_ngrams(x, unigrams_dict, bigrams_dict, trigrams_dict))
df_test['Ngram'] = df_test['Word_List'].apply(lambda x: ngram.find_ngrams(x, unigrams_dict, bigrams_dict, trigrams_dict))
display(df_train[:3])
display(df_test[:3])

Unnamed: 0,Label,Rating,Ori_Review,Clean_Review,PP_Review,Word_List,Ngram
0,-1,1,"great bed, but when I first came in, I request...",great bed but when i first come in i request a...,great bed first come request lake view request...,"[great, bed, first, come, request, lake, view,...","[great, bed, first, come, request, lake, view,..."
1,1,1,Don't let the website fool you. I stayed at th...,do not let the website fool you i stay at the ...,let website fool amalfi last business trip pro...,"[let, website, fool, amalfi, last, business, t...","[let, website, amalfi, last, business, trip, p..."
2,-1,1,Just returned from a one night stay at the Kni...,just return from a one night stay at the knick...,return one night knickerbocker return come now...,"[return, one, night, knickerbocker, return, co...","[return, one, night, knickerbocker, return, co..."


Unnamed: 0,Label,Rating,Ori_Review,Clean_Review,PP_Review,Word_List,Ngram
0,1,5,My stay at the Hotel Monaco Chicago was amazin...,my stay at the hotel monaco chicago be amaze t...,monaco amaze staff polite well poise eager giv...,"[monaco, amaze, staff, polite, well, poise, ea...","[monaco, amaze, staff, polite, well, give, hel..."
1,1,1,"For the price, you would think this would be a...",for the price you would think this would be a ...,price think top quality nowhere close service ...,"[price, think, top, quality, nowhere, close, s...","[price, think, top, quality, close, service, c..."
2,-1,5,We just got back from 3 nights at the Sofitel....,we just get back from night at the sofitel we ...,get back night sofitel really nothing bad nega...,"[get, back, night, sofitel, really, nothing, b...","[get, back, night, sofitel, really, nothing, b..."


In [9]:
bigram_words = [' '.join(bigram) for bigram in bigrams]
trigram_words = [' '.join(trigram) for trigram in trigrams]
new_words = unigrams + bigram_words + trigram_words

In [10]:
df_train = data_prep.preprocess_ling_feature(df_train)
df_train = linguistic_feature.ling_feature(df_train)
df_test = data_prep.preprocess_ling_feature(df_test)
df_test = linguistic_feature.ling_feature(df_test)

In [11]:
word2ind = dict(zip(new_words, range(len(new_words))))

In [12]:
X_train, y_train = data_prep.df2matrix(df_train, word2ind)
X_test, y_test = data_prep.df2matrix(df_test, word2ind)

In [13]:
X_train_pos = X_train[X_train['Rating'] == 5]
y_train_pos = y_train[X_train['Rating'] == 5]
X_train_neg = X_train[X_train['Rating'] == 1]
y_train_neg = y_train[X_train['Rating'] == 1]

X_test_pos = X_test[X_test['Rating'] == 5]
y_test_pos = y_test[X_test['Rating'] == 5]
X_test_neg = X_test[X_test['Rating'] == 1]
y_test_neg = y_test[X_test['Rating'] == 1]

In [19]:
X_train_std = StandardScaler().fit_transform(X_train)
X_test_std = StandardScaler().fit_transform(X_test)

In [23]:
X_train_pos_std = X_train_std[X_train['Rating'] == 5]
X_train_neg_std = X_train_std[X_train['Rating'] == 1]

X_test_pos_std = X_test_std[X_test['Rating'] == 5]
X_test_neg_std = X_test_std[X_test['Rating'] == 1]

### Train Model

#### 1) Random Forest

In [15]:
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [16]:
rf_model = RandomForestClassifier(criterion='gini', random_state=0)
parameters = {'n_estimators': [100, 500],
              'max_features': [5, 10, 20],
              'max_depth': [20, 50],
              'max_samples': [100, 500, None]}
grid_search = GridSearchCV(rf_model, param_grid=parameters, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)
print('Best Parameters: ', grid_search.best_params_)
print('Score', grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=500; total time=   0.6s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END max_depth=20, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END

[CV] END max_depth=20, max_features=20, max_samples=None, n_estimators=500; total time=   1.3s
[CV] END max_depth=20, max_features=20, max_samples=None, n_estimators=500; total time=   1.3s
[CV] END max_depth=20, max_features=20, max_samples=None, n_estimators=500; total time=   1.3s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=100; total time=   0.0s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estimators=500; total time=   0.5s
[CV] END max_depth=50, max_features=5, max_samples=100, n_estima

[CV] END max_depth=50, max_features=20, max_samples=None, n_estimators=500; total time=   1.9s
[CV] END max_depth=50, max_features=20, max_samples=None, n_estimators=500; total time=   1.7s
[CV] END max_depth=50, max_features=20, max_samples=None, n_estimators=500; total time=   1.5s
[CV] END max_depth=50, max_features=20, max_samples=None, n_estimators=500; total time=   1.6s
[CV] END max_depth=50, max_features=20, max_samples=None, n_estimators=500; total time=   1.7s
Best Parameters:  {'max_depth': 20, 'max_features': 5, 'max_samples': None, 'n_estimators': 500}
Score 0.865625


In [18]:
rf_model = RandomForestClassifier(random_state=0, 
                                  criterion='gini', 
                                  max_depth=20, 
                                  max_features=5, 
                                  max_samples=None, 
                                  n_estimators=500)
rf_model.fit(X_train, y_train)
print('Random Forest Performance')
print('- Train Overall Accuracy: ', "{:.1%}".format(rf_model.score(X_train, y_train)))
print('- Test Overall Accuracy: ', "{:.1%}".format(rf_model.score(X_test, y_test)))
print('- Test Positive Accuracy: ', "{:.1%}".format(rf_model.score(X_test_pos, y_test_pos)))
print('- Test Negative Accuracy: ', "{:.1%}".format(rf_model.score(X_test_neg, y_test_neg)))

Random Forest Performance
- Train Overall Accuracy:  99.8%
- Test Overall Accuracy:  90.3%
- Test Positive Accuracy:  90.4%
- Test Negative Accuracy:  90.2%


#### 2) PCA + SVM

In [21]:
pca = PCA()
svm_model = SVC()
pipeline = Pipeline(steps=[('pca', pca), ('svm', svm_model)])
parameters = {'svm__C': [3, 4], 'pca__n_components': list(range(1, 1001, 50))}
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=5, scoring='accuracy')
grid_search.fit(X_train_std, y_train)
print('Best Parameters: ', grid_search.best_params_)
print('Score', grid_search.best_score_)

Best Parameters:  {'pca__n_components': 601, 'svm__C': 3}
Score 0.8546875


In [32]:
pca = PCA(n_components=601)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_test_pos_pca = pca.transform(X_test_pos_std)
X_test_neg_pca = pca.transform(X_test_neg_std)

svm_model = SVC(C=3)
svm_model.fit(X_train_pca, y_train)
print('SVM Performance')
print('- Train Overall Accuracy: ', "{:.1%}".format(svm_model.score(X_train_pca, y_train)))
print('- Test Overall Accuracy: ', "{:.1%}".format(svm_model.score(X_test_pca, y_test)))
print('- Test Positive Accuracy: ', "{:.1%}".format(svm_model.score(X_test_pos_pca, y_test_pos)))
print('- Test Negative Accuracy: ', "{:.1%}".format(svm_model.score(X_test_neg_pca, y_test_neg)))

SVM Performance
- Train Overall Accuracy:  99.8%
- Test Overall Accuracy:  86.6%
- Test Positive Accuracy:  84.7%
- Test Negative Accuracy:  88.3%
