#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import warnings 
warnings.filterwarnings('ignore')

#### Inserting Dataset

In [3]:
data = pd.read_csv("deceptive-opinion.csv")
data = data.rename(columns={"deceptive":"label"})

In [4]:
data.head(5)

Unnamed: 0,label,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


#### Shuffling Dataset

In [5]:
data = data.sample(frac = 1)

In [6]:
data.head(10)

Unnamed: 0,label,hotel,polarity,source,text
587,deceptive,hardrock,positive,MTurk,After searching for the perfect hotel for my C...
1036,truthful,hardrock,negative,Web,Move beyond the rock theme and it's an average...
1320,deceptive,homewood,negative,MTurk,I went to the Homewood Suites in Chicago which...
883,truthful,homewood,negative,Web,While the parking service was great - the fron...
768,deceptive,palmer,positive,MTurk,An excellent vacation destination. Clean rooms...
1356,deceptive,homewood,negative,MTurk,Not worth the price! I had the unfortunate exp...
1526,deceptive,intercontinental,negative,MTurk,The International Chicago Magnificent Mile is ...
699,deceptive,sofitel,positive,MTurk,Best hotel in an excellent location I stayed a...
843,truthful,conrad,negative,Web,We've stayed here before but this time around ...
165,truthful,ambassador,positive,TripAdvisor,In a very nice location close to shopping and ...


#### Dropping attributes that are not needed

In [7]:
data = data.drop(["hotel", "source"], axis = 1)
data.head(10)

Unnamed: 0,label,polarity,text
587,deceptive,positive,After searching for the perfect hotel for my C...
1036,truthful,negative,Move beyond the rock theme and it's an average...
1320,deceptive,negative,I went to the Homewood Suites in Chicago which...
883,truthful,negative,While the parking service was great - the fron...
768,deceptive,positive,An excellent vacation destination. Clean rooms...
1356,deceptive,negative,Not worth the price! I had the unfortunate exp...
1526,deceptive,negative,The International Chicago Magnificent Mile is ...
699,deceptive,positive,Best hotel in an excellent location I stayed a...
843,truthful,negative,We've stayed here before but this time around ...
165,truthful,positive,In a very nice location close to shopping and ...


#### Resetting Index

In [8]:
data.reset_index(inplace = True)
data.drop(["index"], axis = 1, inplace = True)
data.head(5)

Unnamed: 0,label,polarity,text
0,deceptive,positive,After searching for the perfect hotel for my C...
1,truthful,negative,Move beyond the rock theme and it's an average...
2,deceptive,negative,I went to the Homewood Suites in Chicago which...
3,truthful,negative,While the parking service was great - the fron...
4,deceptive,positive,An excellent vacation destination. Clean rooms...


#### Checking for NULL values

In [9]:
data.isnull().sum()

label       0
polarity    0
text        0
dtype: int64


#### Grouping the data based on polarity to identify skew in data 

In [10]:
groupedReviews = data.groupby("polarity").label.value_counts()
groupedReviews

polarity  label    
negative  deceptive    400
          truthful     400
positive  deceptive    400
          truthful     400
Name: label, dtype: int64

Note - There is minimal skew, Therefore we can proceed without any adjustments

#### Eliminating polarity as there is no skew

In [11]:
data.drop(["polarity"], axis = 1, inplace = True)

In [12]:
data.head(5)

Unnamed: 0,label,text
0,deceptive,After searching for the perfect hotel for my C...
1,truthful,Move beyond the rock theme and it's an average...
2,deceptive,I went to the Homewood Suites in Chicago which...
3,truthful,While the parking service was great - the fron...
4,deceptive,An excellent vacation destination. Clean rooms...


#### Renaming label as 0 and 1

In [13]:
dummy = pd.get_dummies(data['label'])
dummy.head(5)

Unnamed: 0,deceptive,truthful
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0


In [14]:
data = pd.concat((data, dummy), axis = 1)
data.head(5)

Unnamed: 0,label,text,deceptive,truthful
0,deceptive,After searching for the perfect hotel for my C...,1,0
1,truthful,Move beyond the rock theme and it's an average...,0,1
2,deceptive,I went to the Homewood Suites in Chicago which...,1,0
3,truthful,While the parking service was great - the fron...,0,1
4,deceptive,An excellent vacation destination. Clean rooms...,1,0


In [15]:
data.drop(["label"], axis = 1, inplace = True)
data.drop(["deceptive"], axis = 1, inplace = True)
data = data.rename(columns = {"truthful":"label"})
data.head(5)

Unnamed: 0,text,label
0,After searching for the perfect hotel for my C...,0
1,Move beyond the rock theme and it's an average...,1
2,I went to the Homewood Suites in Chicago which...,0
3,While the parking service was great - the fron...,1
4,An excellent vacation destination. Clean rooms...,0




#### Removing Last 10 rows for testing

In [16]:
data.shape

(1600, 2)

In [17]:
data_for_manualTesting = data.tail(10)
for i in range(1599,1589,-1):
    data.drop([i], axis = 0, inplace = True)

In [18]:
data_for_manualTesting.reset_index(inplace = True)
data_for_manualTesting.drop(["index"], axis = 1, inplace = True)
data_for_manualTesting.head(10)

Unnamed: 0,text,label
0,The Sheraton Chicago Hotel and Towers is a mag...,0
1,Thank god I got this hotel through priceline. ...,1
2,The Hilton Chicago is one of the best Hotels I...,0
3,so my sweetie lures me out under the guise of ...,1
4,As a frequent traveler for both business and p...,0
5,It has been a couple of years since I stayed h...,1
6,First of all when you check in the check in pr...,0
7,They allowed parties to go on all night and bo...,1
8,My stay at the Homewood Suites in downtown Chi...,0
9,"The Talbott Hotel was very classy, professiona...",0


## Defining Independent and Dependent Variables 

In [19]:
x = data["text"] #independent
y = data["label"] #dependent

x_manual = data_for_manualTesting["text"]
y_manual = data_for_manualTesting["label"]

In [20]:
x

0       After searching for the perfect hotel for my C...
1       Move beyond the rock theme and it's an average...
2       I went to the Homewood Suites in Chicago which...
3       While the parking service was great - the fron...
4       An excellent vacation destination. Clean rooms...
                              ...                        
1585    Triple A rate with upgrade to view room was le...
1586    It was my first time to visit Chicago and I wo...
1587    I recently was in Chicago on business and chos...
1588    I had a wonderful time at the James Hotel whil...
1589    Overpriced is the best word to describe the Co...
Name: text, Length: 1590, dtype: object

In [21]:
x_manual

0    The Sheraton Chicago Hotel and Towers is a mag...
1    Thank god I got this hotel through priceline. ...
2    The Hilton Chicago is one of the best Hotels I...
3    so my sweetie lures me out under the guise of ...
4    As a frequent traveler for both business and p...
5    It has been a couple of years since I stayed h...
6    First of all when you check in the check in pr...
7    They allowed parties to go on all night and bo...
8    My stay at the Homewood Suites in downtown Chi...
9    The Talbott Hotel was very classy, professiona...
Name: text, dtype: object

# Bidirectional LSTM

In [22]:
import tensorflow as tf

In [23]:
tf.__version__

'2.8.0'

In [24]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [25]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [26]:
messages= x.copy()
test_messages = x_manual.copy()

In [27]:
messages[1]

"Move beyond the rock theme and it's an average hotel with a good location. Room was unbearably hot and the controls on the wall did nothing. Thank goodness the window opened or we'd have been cooked alive. That of course let in all the noise from Michigan Ave. This combined with our neighbors door slamming against their thrown door lock every 30 seconds made for a lively setting. Room was stuffy too. Burn marks on carpet lead me to believe nasty smokers had occupied room on many occasions. Wife happy w/ Aveda in bath. Base bar was good with great service and well made drinks tho pricey.\n"

In [28]:
import nltk
import re
from nltk.corpus import stopwords

In [29]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/rishi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages[i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    
ps_manual = PorterStemmer()
corpus_manual = []
for i in range(0, len(test_messages)):
    review_manual = re.sub('[^a-zA-Z]', ' ', test_messages[i])
    review_manual = review_manual.lower()
    review_manual = review_manual.split()
    
    review_manual = [ps_manual.stem(word) for word in review_manual if not word in stopwords.words('english')]
    review_manual = ' '.join(review_manual)
    corpus_manual.append(review_manual)

In [31]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_test = [one_hot(words,voc_size)for words in corpus_manual] 
onehot_repr


[[3909,
  887,
  4287,
  316,
  3369,
  2148,
  3364,
  1105,
  4589,
  398,
  4287,
  4287,
  4357,
  958,
  2845,
  4287,
  2602,
  796,
  2986,
  58,
  2606,
  4654,
  4287,
  2131,
  700,
  202,
  308,
  3759,
  2251,
  1434,
  4565,
  1645,
  2314,
  4673,
  3767,
  1787,
  3569,
  3619,
  1320,
  1919,
  1488,
  952,
  2143,
  3142,
  565,
  4373,
  2597,
  3767,
  1101,
  1401,
  1282,
  1457,
  2159,
  316,
  3593,
  3116,
  986,
  3926,
  4104,
  2716,
  2613,
  4700,
  4021,
  3995,
  4673,
  2716,
  4166,
  403,
  4641,
  239,
  2111,
  3657,
  4897,
  4878,
  4295,
  633,
  272,
  4839,
  2695,
  1867,
  3941,
  4027,
  19,
  4589,
  398,
  4804,
  4078,
  3600,
  2378,
  4145,
  197,
  4261,
  3302,
  2504,
  3688,
  4331,
  958,
  1616,
  1780],
 [4102,
  3196,
  398,
  2984,
  549,
  4287,
  1930,
  2538,
  3767,
  414,
  4637,
  3328,
  766,
  1484,
  686,
  1930,
  2008,
  1101,
  2811,
  3336,
  2019,
  239,
  688,
  607,
  1542,
  3333,
  3982,
  1401,
  4017,
  4800

### Embedding Representation

In [32]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
embedded_test = pad_sequences(onehot_test,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[1867 3941 4027 ...  958 1616 1780]
 [ 412 3955 3767 ...  453 4629  303]
 [4865 1804   36 ... 1356 4287 2602]
 ...
 [1402 2420  156 ... 2956 2692 3134]
 [3226  316 3767 ...  854 1077 3041]
 [1024 4039 1258 ...  709   19 4287]]


In [33]:
embedded_docs[2]

array([4865, 1804,   36, 1042, 4192, 3294,  554, 1686, 4287, 4436, 2772,
        590, 2371, 4287,  554,   19, 4118, 1356, 4287, 2602], dtype=int32)

In [34]:
## LSTM: Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['BinaryAccuracy'])
print(model.summary())

2022-04-27 10:56:30.081593: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 20, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
## Bidirectional LSTM: Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Dropout(0.3))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['BinaryAccuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            200000    
                                                                 
 dropout_2 (Dropout)         (None, 20, 40)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dropout_3 (Dropout)         (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 201       
                                                                 
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
________________________________________________

In [36]:
len(embedded_docs),y.shape

(1590, (1590,))

In [37]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

x_manual = np.array(embedded_test)
y_manual = np.array(y_manual)

In [38]:
X_final

array([[1867, 3941, 4027, ...,  958, 1616, 1780],
       [ 412, 3955, 3767, ...,  453, 4629,  303],
       [4865, 1804,   36, ..., 1356, 4287, 2602],
       ...,
       [1402, 2420,  156, ..., 2956, 2692, 3134],
       [3226,  316, 3767, ...,  854, 1077, 3041],
       [1024, 4039, 1258, ...,  709,   19, 4287]], dtype=int32)

In [39]:
x_manual

array([[1804, 4804, 2497, 2371, 4022,  778, 3255, 4857, 3038, 4804, 4536,
        4606,  202, 2269,  707, 3671, 1488, 2378, 2504, 4804],
       [2846, 2695,  633, 3523, 4567,  892,  892, 4700, 4519, 4287,  202,
        3412, 1581,  554, 4804,  779, 3569, 4490, 1442, 1540],
       [3438, 4742, 3767, 3437, 1402, 4201, 2381,   84, 3657, 4762, 1160,
        3523, 4299, 3623,  197, 4287, 3785, 2992, 4145, 4804],
       [4282, 3437,  481, 2159,  677, 4994, 1603, 1401,  998, 4082, 4994,
        3767, 3995, 4282, 3004, 2312, 3891, 1668, 2160, 1730],
       [1402, 4201, 2381,  986, 3290, 1885, 3892, 3741, 4145,  796, 3767,
        4536, 3941,  316, 4145, 2834, 4804, 3440, 1550, 3892],
       [4005, 4282, 3767, 3671, 4068, 4347, 4287,  394, 4989,  590, 2538,
        4417, 1598,  333, 1101, 4282,  565, 3569,  201,  424],
       [3060, 3645,   92, 4005, 2226,  554, 1836,  707, 3995, 3196,  958,
        2845,   19, 4804, 3440, 1550,  883,  316, 4012, 4118],
       [4166, 3466, 3720, 4896, 4793, 269

In [40]:
X_final.shape,y_final.shape

((1590, 20), (1590,))

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=52)

In [58]:
X_train[1]

array([2503, 2976, 4202, 1730,  633, 4839, 2476, 3902, 4804, 4287,  239,
       1563,  883, 3600, 1692, 3984, 2769,  288,  883,  316], dtype=int32)

### Model Training 

In [59]:
### LSTM Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=25,batch_size=32, verbose = 2)

Epoch 1/25
34/34 - 1s - loss: 0.3166 - binary_accuracy: 0.8883 - val_loss: 0.3582 - val_binary_accuracy: 0.8857 - 625ms/epoch - 18ms/step
Epoch 2/25
34/34 - 1s - loss: 0.1637 - binary_accuracy: 0.9502 - val_loss: 0.3566 - val_binary_accuracy: 0.8743 - 606ms/epoch - 18ms/step
Epoch 3/25
34/34 - 1s - loss: 0.0819 - binary_accuracy: 0.9793 - val_loss: 0.4577 - val_binary_accuracy: 0.8667 - 589ms/epoch - 17ms/step
Epoch 4/25
34/34 - 1s - loss: 0.0338 - binary_accuracy: 0.9906 - val_loss: 0.5123 - val_binary_accuracy: 0.8724 - 577ms/epoch - 17ms/step
Epoch 5/25
34/34 - 1s - loss: 0.0154 - binary_accuracy: 0.9972 - val_loss: 0.7507 - val_binary_accuracy: 0.8514 - 512ms/epoch - 15ms/step
Epoch 6/25
34/34 - 1s - loss: 0.0168 - binary_accuracy: 0.9953 - val_loss: 0.6964 - val_binary_accuracy: 0.8552 - 525ms/epoch - 15ms/step
Epoch 7/25
34/34 - 1s - loss: 0.0096 - binary_accuracy: 0.9981 - val_loss: 0.7239 - val_binary_accuracy: 0.8476 - 510ms/epoch - 15ms/step
Epoch 8/25
34/34 - 1s - loss: 0.00

<keras.callbacks.History at 0x7feaa854bbb0>

In [60]:
### Bidirectional LSTM Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=25,batch_size=32, verbose = 2)

Epoch 1/25
34/34 - 1s - loss: 0.3432 - binary_accuracy: 0.8920 - val_loss: 0.3318 - val_binary_accuracy: 0.8743 - 770ms/epoch - 23ms/step
Epoch 2/25
34/34 - 1s - loss: 0.1135 - binary_accuracy: 0.9643 - val_loss: 0.3903 - val_binary_accuracy: 0.8800 - 682ms/epoch - 20ms/step
Epoch 3/25
34/34 - 1s - loss: 0.0388 - binary_accuracy: 0.9878 - val_loss: 0.4938 - val_binary_accuracy: 0.8781 - 689ms/epoch - 20ms/step
Epoch 4/25
34/34 - 1s - loss: 0.0124 - binary_accuracy: 0.9972 - val_loss: 0.5986 - val_binary_accuracy: 0.8762 - 694ms/epoch - 20ms/step
Epoch 5/25
34/34 - 1s - loss: 0.0122 - binary_accuracy: 0.9962 - val_loss: 0.5870 - val_binary_accuracy: 0.8667 - 687ms/epoch - 20ms/step
Epoch 6/25
34/34 - 1s - loss: 0.0090 - binary_accuracy: 0.9972 - val_loss: 0.6501 - val_binary_accuracy: 0.8705 - 695ms/epoch - 20ms/step
Epoch 7/25
34/34 - 1s - loss: 0.0050 - binary_accuracy: 0.9991 - val_loss: 0.7245 - val_binary_accuracy: 0.8686 - 690ms/epoch - 20ms/step
Epoch 8/25
34/34 - 1s - loss: 0.00

<keras.callbacks.History at 0x7feaaf96cd00>

### Performance Metrics and Accuracy

### LSTM

In [61]:
predictions = (model.predict(x_manual) > 0.5).astype("int32")

In [62]:
from sklearn.metrics import confusion_matrix

In [63]:
confusion_matrix(y_manual, predictions)

array([[5, 1],
       [0, 4]])

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(y_manual,predictions)

0.9

In [65]:
from sklearn.metrics import classification_report
print(classification_report(y_manual, predictions))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           1       0.80      1.00      0.89         4

    accuracy                           0.90        10
   macro avg       0.90      0.92      0.90        10
weighted avg       0.92      0.90      0.90        10



### Bidirectional LSTM

In [70]:
predictions = (model.predict(x_manual) > 0.5).astype("int32")

In [71]:
from sklearn.metrics import confusion_matrix

In [73]:
confusion_matrix(y_manual, predictions)

array([[5, 1],
       [0, 4]])

In [74]:
from sklearn.metrics import accuracy_score
accuracy_score(y_manual,predictions)

0.9

In [75]:
from sklearn.metrics import classification_report
print(classification_report(y_manual, predictions))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           1       0.80      1.00      0.89         4

    accuracy                           0.90        10
   macro avg       0.90      0.92      0.90        10
weighted avg       0.92      0.90      0.90        10



### Manual Testing 

In [55]:
from nltk.stem.porter import PorterStemmer

def review_anaylsis(prediction):
    if prediction == 1:
        return "Honest Review"
    else:
        return "Fake Review"
    
def prediction(text):
    ps = PorterStemmer()
    corpus = []
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    onehot_repr = [one_hot(words,voc_size)for words in corpus]
    sent_length=20
    embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
    text_final = np.array(embedded_docs)
    lstm_prediction = (model.predict(text_final) > 0.5).astype("int32")
    bidr_prediction = (model1.predict(text_final) > 0.5).astype("int32")
    print(f"\n\nAccording to LSTM This review is a {review_anaylsis(lstm_prediction[0])}")
    print(f"\nAccording to Bidirectional LSTM This review is a {review_anaylsis(bidr_prediction[0])}")

In [56]:
text = str(input())
prediction(text)

"A very nice hotel with small but adequate rooms. The decor was lovely. My daughter and I spent 5-nights here, saw Wicked just down the street and used the public transportation to get all around town. Was a bit disappointed with the bar/restaurant which opened and closed it's kitchen at will without regard to posted hours. The room description boasted of safe's large enough to accomodate a laptop, so I brought mine. I was informed after check-in that the safes are not in every room (and not in mine). As a side note, I was surprised that nearly all the shops many restaurants closed at 6pm in that area... even the 7-11.  "


According to LSTM This review is a Honest Review

According to Bidirectional LSTM This review is a Honest Review
