Importing Libraries

In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import warnings 
warnings.filterwarnings('ignore')

Inserting Dataset

In [140]:
data = pd.read_csv("deceptive-opinion.csv")
data = data.rename(columns={"deceptive":"label"})

In [141]:
data.head(5)

Unnamed: 0,label,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


Shuffling Dataset

In [142]:
data = data.sample(frac = 1)

In [143]:
data.head(10)

Unnamed: 0,label,hotel,polarity,source,text
457,deceptive,hyatt,positive,MTurk,I recently stayed at the Hyatt Regency in Chic...
1538,deceptive,intercontinental,negative,MTurk,The Intercontinental Chicago was not as great ...
878,truthful,hyatt,negative,Web,We had an issue with housekeeping and the hote...
59,truthful,omni,positive,TripAdvisor,"This a great property, excellent location and ..."
1398,deceptive,affinia,negative,MTurk,I recently stayed at the Affina Chicago hotel ...
1430,deceptive,affinia,negative,MTurk,I truly am not the sort of person to write a n...
1339,deceptive,sheraton,negative,MTurk,I stayed at the Sheraton Chicago Hotel and Tow...
1078,truthful,james,negative,Web,"Me, My sister and my best friend all went to s..."
134,truthful,knickerbocker,positive,TripAdvisor,Stayed there three nights from 4/17/09 through...
1238,deceptive,omni,negative,MTurk,I had never been to Chicago and was looking fo...




Removing Last 10 rows for testing

In [144]:
data.shape

(1600, 5)

In [145]:
data_for_manualTesting = data.tail(10)
for i in range(1599,1589,-1):
    data.drop([i], axis = 0, inplace = True)

In [146]:
data_for_manualTesting.head(10)

Unnamed: 0,label,hotel,polarity,source,text
408,deceptive,conrad,positive,MTurk,If you are looking for a luxurious downtown Ch...
1014,truthful,affinia,negative,Web,"I'd been searching for a cool, non-chain hotel..."
422,deceptive,fairmont,positive,MTurk,The atmosphere at this hotel is truly remarkab...
864,truthful,conrad,negative,Web,Let me first start by saying I have always bee...
1413,deceptive,talbott,negative,MTurk,I selected The Talbott for my recent family va...
384,truthful,palmer,positive,TripAdvisor,We just got back from a great weekend in Chica...
14,truthful,hyatt,positive,TripAdvisor,I got a Sunday night stay for only $50 off of ...
88,truthful,homewood,positive,TripAdvisor,"Hi,Stayed here for three nights recently,From ..."
35,truthful,fairmont,positive,TripAdvisor,I stayed at the Fairmont during a professional...
635,deceptive,affinia,positive,MTurk,My husband and I arrived for a 3 night stay fo...


Dropping attributes that are not needed

In [147]:
data = data.drop(["hotel", "source"], axis = 1)
data.head(10)

Unnamed: 0,label,polarity,text
457,deceptive,positive,I recently stayed at the Hyatt Regency in Chic...
1538,deceptive,negative,The Intercontinental Chicago was not as great ...
878,truthful,negative,We had an issue with housekeeping and the hote...
59,truthful,positive,"This a great property, excellent location and ..."
1398,deceptive,negative,I recently stayed at the Affina Chicago hotel ...
1430,deceptive,negative,I truly am not the sort of person to write a n...
1339,deceptive,negative,I stayed at the Sheraton Chicago Hotel and Tow...
1078,truthful,negative,"Me, My sister and my best friend all went to s..."
134,truthful,positive,Stayed there three nights from 4/17/09 through...
1238,deceptive,negative,I had never been to Chicago and was looking fo...


Resetting Index

In [148]:
data.reset_index(inplace = True)
data.drop(["index"], axis = 1, inplace = True)
data.head(5)

Unnamed: 0,label,polarity,text
0,deceptive,positive,I recently stayed at the Hyatt Regency in Chic...
1,deceptive,negative,The Intercontinental Chicago was not as great ...
2,truthful,negative,We had an issue with housekeeping and the hote...
3,truthful,positive,"This a great property, excellent location and ..."
4,deceptive,negative,I recently stayed at the Affina Chicago hotel ...


Checking for NULL values

In [149]:
data.isnull().sum()

label       0
polarity    0
text        0
dtype: int64


Grouping the data based on polarity to identify skew in data 

In [150]:
groupedReviews = data.groupby("polarity").label.value_counts()
groupedReviews

polarity  label    
negative  truthful     400
          deceptive    390
positive  deceptive    400
          truthful     400
Name: label, dtype: int64

Note - There is minimal skew, Therefore we can proceed without any adjustments

#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [151]:
def textProcessing(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [152]:
data["text"]  = data["text"].apply(textProcessing)

In [153]:
data.head(5)

Unnamed: 0,label,polarity,text
0,deceptive,positive,i recently stayed at the hyatt regency in chic...
1,deceptive,negative,the intercontinental chicago was not as great ...
2,truthful,negative,we had an issue with housekeeping and the hote...
3,truthful,positive,this a great property excellent location and ...
4,deceptive,negative,i recently stayed at the affina chicago hotel ...


#### Eliminating polarity as there is no skew

In [154]:
data.drop(["polarity"], axis = 1, inplace = True)

In [155]:
data.head(5)

Unnamed: 0,label,text
0,deceptive,i recently stayed at the hyatt regency in chic...
1,deceptive,the intercontinental chicago was not as great ...
2,truthful,we had an issue with housekeeping and the hote...
3,truthful,this a great property excellent location and ...
4,deceptive,i recently stayed at the affina chicago hotel ...


#### Renaming label as 0 and 1

In [156]:
dummy = pd.get_dummies(data['label'])
dummy.head(5)

Unnamed: 0,deceptive,truthful
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0


In [157]:
data = pd.concat((data, dummy), axis = 1)
data.head(5)

Unnamed: 0,label,text,deceptive,truthful
0,deceptive,i recently stayed at the hyatt regency in chic...,1,0
1,deceptive,the intercontinental chicago was not as great ...,1,0
2,truthful,we had an issue with housekeeping and the hote...,0,1
3,truthful,this a great property excellent location and ...,0,1
4,deceptive,i recently stayed at the affina chicago hotel ...,1,0


In [158]:
data.drop(["label"], axis = 1, inplace = True)
data.drop(["deceptive"], axis = 1, inplace = True)
data = data.rename(columns = {"truthful":"label"})
data.head(5)

Unnamed: 0,text,label
0,i recently stayed at the hyatt regency in chic...,0
1,the intercontinental chicago was not as great ...,0
2,we had an issue with housekeeping and the hote...,1
3,this a great property excellent location and ...,1
4,i recently stayed at the affina chicago hotel ...,0


#### Defining the Independent and Dependant Variables

In [159]:
x = data["text"]
y = data["label"]

### Splitting the data into Testing and Training data

In [160]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

### Converting text to Vectors

In [161]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [162]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Classical Models

## 1. Logistic Regression

In [163]:
from sklearn.linear_model import LogisticRegression

In [164]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [165]:
pred_lr=LR.predict(xv_test)

In [166]:
LR.score(xv_test, y_test)

0.8693467336683417

In [167]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       192
           1       0.87      0.87      0.87       206

    accuracy                           0.87       398
   macro avg       0.87      0.87      0.87       398
weighted avg       0.87      0.87      0.87       398



## 2. Decision Tree Classification

In [168]:
from sklearn.tree import DecisionTreeClassifier

In [169]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [170]:
pred_dt = DT.predict(xv_test)

In [171]:
DT.score(xv_test, y_test)

0.6859296482412061

In [172]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.67      0.70      0.68       192
           1       0.71      0.67      0.69       206

    accuracy                           0.69       398
   macro avg       0.69      0.69      0.69       398
weighted avg       0.69      0.69      0.69       398



## 3. Gradient Boosting Classifier

In [173]:
from sklearn.ensemble import GradientBoostingClassifier

In [174]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [175]:
pred_gbc = GBC.predict(xv_test)

In [176]:
GBC.score(xv_test, y_test)

0.8492462311557789

In [177]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       192
           1       0.86      0.84      0.85       206

    accuracy                           0.85       398
   macro avg       0.85      0.85      0.85       398
weighted avg       0.85      0.85      0.85       398



## 4. Random Forest Classifier

In [178]:
from sklearn.ensemble import RandomForestClassifier

In [179]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [180]:
pred_rfc = RFC.predict(xv_test)

In [181]:
RFC.score(xv_test, y_test)

0.8668341708542714

In [182]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       192
           1       0.90      0.83      0.87       206

    accuracy                           0.87       398
   macro avg       0.87      0.87      0.87       398
weighted avg       0.87      0.87      0.87       398

