**Hotel Review Sentiment Analysis using NLP**

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [4]:
#Load the data
hotel_data = pd.read_csv('greek_hotel_reviews.csv')

In [5]:
#Number of missing values in the dataset
hotel_data.isna().sum()

Reviews    1
Rating     0
dtype: int64

In [6]:
# Drop empty Review values
hotel_data = hotel_data[hotel_data["Reviews"].notna()]    

In [7]:
# Number of missing values in the dataset
hotel_data.isna().sum()

Reviews    0
Rating     0
dtype: int64

In [8]:
hotel_data.Rating.value_counts()

5    5627
4    1245
3     455
1     178
2     136
Name: Rating, dtype: int64

In [9]:
#Classifying reviews into “positive” and “negative” so we can use this as training data for our sentiment classification model.
#Positive reviews will be classified as +1, and negative reviews will be classified as 0.

hotel_data = hotel_data[hotel_data['Rating'] != 3]
hotel_data['Rating'] = hotel_data['Rating'].apply(lambda rating : +1 if rating > 3 else 0)

In [10]:
hotel_data = hotel_data.reset_index(drop=True)

In [11]:
hotel_data.head()

Unnamed: 0,Reviews,Rating
0,Wonderful hotel,1
1,Apanemo,1
2,Best secret Getaway,1
3,Paradise,1
4,Lovely place to stay!,1


# Vader knowledge based unsupervised model

In [129]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Calculate polarity score using Vader
def get_vader_score(sentence): 
    compound = sid.polarity_scores(sentence)['compound']
    if compound > 0.05: 
        return 1
    elif (compound >= -0.05) and (compound <=0.05): 
        return None
    else: 
        return 0
    
hotel_data['vader'] = hotel_data.apply(lambda x: get_vader_score(x['Reviews']), axis=1)

# Evaluate results
print(f'Accuracy: {accuracy_score(hotel_data.dropna()["Rating"].values, hotel_data.dropna()["vader"].values)}')

Accuracy: 0.9674029212682579


# Apply TFIDF vectorization to reviews dataset and apply ML algorithms to the vectorized dataset

## Cleaning the text data in Reviews

In [12]:
data = hotel_data[['Rating', 'Reviews']]

In [13]:
# Removing the puntuation marks
data['Reviews'] = data['Reviews'].str.replace('[^\w\s]','')
data.head()

Unnamed: 0,Rating,Reviews
0,1,Wonderful hotel
1,1,Apanemo
2,1,Best secret Getaway
3,1,Paradise
4,1,Lovely place to stay


In [14]:
#Remove stopwords

from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split()
                      if word not in english_stop_words])
        )
    return removed_stop_words

data['Reviews'] = remove_stop_words(data['Reviews'])

data.head()

Unnamed: 0,Rating,Reviews
0,1,Wonderful hotel
1,1,Apanemo
2,1,Best secret Getaway
3,1,Paradise
4,1,Lovely place stay


In [15]:
#Normalization to its true root

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
porter=PorterStemmer()
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet
    
    
from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()

def stemSentence(sentence):
    stem_sentence=[]
    
    for word, tag in tagger.tag(word_tokenize(sentence)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else 'n'
        word=lemmatizer.lemmatize(word,wntag)
        stem_sentence.append(porter.stem(word))
        
    return " ".join(stem_sentence)


for index,row in data.iterrows():
    review = stemSentence(row['Reviews'])
    data.loc[index,'Reviews'] = review
    
data.head()

Unnamed: 0,Rating,Reviews
0,1,wonder hotel
1,1,apanemo
2,1,best secret getaway
3,1,paradis
4,1,love place stay


## Split the dataframe

In [16]:
# Splitting train/test data
x = data['Reviews']
Y = data['Rating']

In [17]:
from sklearn.model_selection import train_test_split

xtrain, xtest, Y_train, Y_test = train_test_split(x, Y, train_size = 0.7, random_state = 500, stratify = Y)

## Vectorization tf-idf 3-grams

TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency. This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

Count Vectorizer give number of frequency with respect to index of vocabulary where as tf-idf consider overall documents of weight of words.

![](img1.png)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,3))

vectorizer.fit(xtrain)
vectorizer.fit(xtest)
X_train = vectorizer.transform(xtrain)
X_test = vectorizer.transform(xtest)

Resampling involves creating a new transformed version of the training dataset in which the selected examples have a different class distribution. Random oversampling involves randomly selecting examples from the minority class, with replacement, and adding them to the training dataset. Random undersampling involves randomly selecting examples from the majority class and deleting them from the training dataset.

In [19]:
# Check imbalanced data - summarize the class distribution
from collections import Counter

counter = Counter(Y_train)
print(counter)

Counter({1: 4810, 0: 220})


In [20]:
from imblearn.over_sampling import RandomOverSampler

rus = RandomOverSampler(random_state=777)

X_train, Y_train = rus.fit_resample(X_train, Y_train)

In [21]:
# summarize the new class distribution
counter = Counter(Y_train)
print(counter)

Counter({1: 4810, 0: 4810})


## Model Fitting

### Logistic Regression
Logistic regression is a process of modeling the probability of a discrete outcome given an input variable. The most common logistic regression models a binary outcome; something that can take two values such as true/false, yes/no, and so on. Logistic regression is a useful analysis method for classification problems, where you are trying to determine if a new sample fits best into a category. 

The best way to think about logistic regression is that it is a linear regression but for classification problems. Logistic regression essentially uses a logistic function defined below to model a binary output variable (Tolles & Meurer, 2016). The primary difference between linear regression and logistic regression is that logistic regression's range is bounded between 0 and 1. In addition, as opposed to linear regression, logistic regression does not require a linear relationship between inputs and output variables.

In [140]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(X_train,Y_train)
y_pred = lr.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.61      0.54        94
           1       0.98      0.97      0.98      2062

    accuracy                           0.95      2156
   macro avg       0.73      0.79      0.76      2156
weighted avg       0.96      0.95      0.96      2156



### K-Nearest Neighbor

K-Nearest Neighbor is an exemplar-based or instance based approach which is effectively applied for sense tagged words. This classification approach based on 
instances where instances are used as points in the vector and test instance compares the new instance with all previously stored instances in the memory.
It is based on supervised learning algorithm which is provided with the training set and during classification it compares the test instance with training set.

In KNN the classification of new examples are represented in the vector form of ‘n’ features .The exemplar- based methods do not ignore any exceptions so that 
the context should be disambiguated properly.

The K-Nearest Neighbor classifier selects the correct answer by comparing target word with sense inventory dictionary.
This classifier finds the ‘k’ nearest sample to the target word and the closest sense is selected as the correct sense of the word.

In [141]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.38      0.47        94
           1       0.97      0.99      0.98      2062

    accuracy                           0.96      2156
   macro avg       0.80      0.69      0.73      2156
weighted avg       0.96      0.96      0.96      2156



### Random Forest Classifier

The random forest algorithm is made up of a collection of decision trees, and each tree in the ensemble is comprised of a data sample drawn from a training set with replacement, called the bootstrap sample.Depending on the type of problem, the determination of the prediction will vary. For a regression task, the individual decision trees will be averaged, and for a classification task, a majority vote,this step is known as aggregation.

In [142]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train.toarray(),Y_train)
y_pred = rf.predict(X_test.toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.36      0.41        94
           1       0.97      0.98      0.98      2062

    accuracy                           0.96      2156
   macro avg       0.73      0.67      0.70      2156
weighted avg       0.95      0.96      0.95      2156



### Naive Bayes

Naive Bayes is a family of probabilistic algorithms that take advantage of probability theory and Bayes’ Theorem to predict the target variable. They are probabilistic, which means that they calculate the probability of each tag for a given text, and then output the tag with the highest one. The way they get these probabilities is by using Bayes’ Theorem, which describes the probability of a feature, based on prior knowledge of conditions that might be related to that feature.

In [143]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train.toarray(),Y_train)
y_pred = nb.predict(X_test.toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.21      0.61      0.31        94
           1       0.98      0.90      0.94      2062

    accuracy                           0.88      2156
   macro avg       0.60      0.75      0.63      2156
weighted avg       0.95      0.88      0.91      2156



### Support Vector Machine

The objective of the support vector machine algorithm is to find a hyperplane in an N-dimensional space(N — the number of features) that distinctly classifies the data points.

To separate the two classes of data points, there are many possible hyperplanes that could be chosen. Hyperplanes are decision boundaries that help classify the data points. Our objective is to find a plane that has the maximum margin or the maximum distance between data points of both classes.

In [144]:
from sklearn import svm

sv = svm.SVC()
sv.fit(X_train.toarray(),Y_train)
y_pred = sv.predict(X_test.toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.37      0.50        94
           1       0.97      0.99      0.98      2062

    accuracy                           0.97      2156
   macro avg       0.87      0.68      0.74      2156
weighted avg       0.96      0.97      0.96      2156



### Xgboost

XGBoost the Algorithm operates on decision trees, models that construct a graph that examines the input under various "if" statements (vertices in the graph). Whether the "if" condition is satisfied influences the next "if" condition and eventual prediction. XGBoost the Algorithm progressively adds more and more "if" conditions to the decision tree to build a stronger model.

In [147]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb = XGBClassifier()
xgb.fit(X_train,Y_train)
y_pred = xgb.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.49      0.53        94
           1       0.98      0.98      0.98      2062

    accuracy                           0.96      2156
   macro avg       0.78      0.74      0.75      2156
weighted avg       0.96      0.96      0.96      2156



### Long Short-Term Memory (LSTM) – Keras Implementation

The  Long  Short-Term  Memory  or  LSTM  network  is  a  recurrent  neural  net-work that is trained using Backpropagation Through Time and overcomes thevanishing gradient problem.  LSTMs differentiated by common RNNs by usingmemory blocks that are connected into layers.  A block has components thatmake it smarter than a classical neuron and a memory for recent sequences.Thiscapability of LSTMs has been used to great effect in complex natural languageprocessing problems such as neural machine translation.

**Padding**

The sequences have different lengths and Keras prefers inputs to be vectorized and all inputs to have the same length. We set the maximum size of each list to 1000. You can try a different size. The lists with size greater than 1000 will be truncated to 1000. For the lists that have length less than 1000, we will add 0 at the end of the list until it reaches the max length. This process is called padding.

**Embedding**

Embedding layer is one of the available layers in Keras. This is mainly used in Natural Language Processing related applications such as language modeling, but it can also be used with other tasks that involve neural networks. While dealing with NLP problems, we can use pre-trained word embeddings such as GloVe. Alternatively we can also train our own embeddings using Keras embedding layer. **Word embeddings can be thought of as an alternate to one-hot encoding along with dimensionality reduction.**

Embedding layer enables us to convert each word into a fixed length vector of defined size. The resultant vector is a dense one with having real values instead of just 0’s and 1’s. The fixed length of word vectors helps us to represent words in a better way along with reduced dimensions.

This way embedding layer works like a lookup table. The words are the keys in this table, while the dense word vectors are the values. To understand it better, let’s look at the implementation of Keras Embedding layer.

In [148]:
tf_idf_dict = list(vectorizer.vocabulary_.items())

max_val = 0
for i in range(0,len(tf_idf_dict)-1):
    current_val = tf_idf_dict[i][1]
    if current_val > max_val:
        max_val = current_val

vocab_size = max_val
print(vocab_size)

6675


In [149]:
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train.toarray(), padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test.toarray(), padding='post', maxlen=maxlen)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [150]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open(r'glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [151]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in vectorizer.vocabulary_.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [152]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length = X_train.shape[1], weights=[embedding_matrix], trainable=False))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          667500    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 784,877
Trainable params: 117,377
Non-trainable params: 667,500
_________________________________________________________________
None


In [153]:
import tensorflow as tf

y_nn_train = tf.one_hot(Y_train, depth=1)
y_nn_test = tf.one_hot(Y_test, depth=1)

In [154]:
batch_size = 64
history = model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, validation_data=(X_test, y_nn_test))

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmptmlxaw85.py, line 13)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmptmlxaw85.py, line 13)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpbn300an4.py, line 48)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpbn300an4.py, line 48)
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
y_pred = model.predict_classes(X_test)

In [None]:
print(classification_report(Y_test, y_pred))

### AutoSklearnClassifier

In [24]:
from autosklearn.classification import AutoSklearnClassifier

X_train=X_train
y_train=Y_train
asc = AutoSklearnClassifier(time_left_for_this_task=10*60, n_jobs=8)
asc.fit(X_train,y_train)
y_pred=asc.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.48      0.49        94
           1       0.98      0.98      0.98      2062

    accuracy                           0.96      2156
   macro avg       0.74      0.73      0.73      2156
weighted avg       0.96      0.96      0.96      2156



# LSTM with Bidirectional Encoder Representations from Transformer BERT

In [29]:
from sklearn.model_selection import train_test_split

train, test= train_test_split(data, test_size = 0.3, random_state = 500)

In [30]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'Reviews', 
                                                                           'Rating')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'Reviews'
LABEL_COLUMN = 'Rating'

In [32]:
import tensorflow as tf

train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [33]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, steps_per_epoch=5)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1d1c9cc6c70>

In [34]:
from sklearn.metrics import classification_report

tf_batch = tokenizer(test['Reviews'].tolist(), max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)

tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = [0,1]
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
print(classification_report(test['Rating'], label))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        94
           1       0.96      1.00      0.98      2062

    accuracy                           0.96      2156
   macro avg       0.48      0.50      0.49      2156
weighted avg       0.91      0.96      0.94      2156

