#                      SENTIMENT ANALYSIS OF TRIP ADVISOR: HOTEL REVIEWS

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Reading The Dataset

In [2]:
# Read the CSV file into a DataFrame
df=pd.read_csv("tripadvisor_hotel_reviews.csv",encoding="ISO-8859-1")

# Summary Of The Dataset

In [3]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Checking For The Null Values

In [4]:
df.isnull().sum()

Review    0
Rating    0
dtype: int64

# Removing The Null Values

In [5]:
# Drop rows with any missing values
df.dropna(how='any',inplace=True)
# Now, df contains only rows without missing values


In [6]:
# Get the count of unique values in the 'Rating' column of the DataFrame
counts = df['Rating'].value_counts()
counts

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [7]:
pip install wordcloud


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Importing necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer


# Removing HTML Strips

In [9]:
# Importing necessary libraries
from bs4 import BeautifulSoup  # Import BeautifulSoup for HTML tag stripping
import re  # Import regular expressions module for pattern matching

# Function to strip HTML tags using BeautifulSoup
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Function to remove text between square brackets using regular expression
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

# Function to denoise text by applying both HTML tag stripping and removing text between square brackets
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

# Applying the denoise_text function on the 'Review' column of the DataFrame
df['Review'] = df['Review'].apply(denoise_text)


  soup = BeautifulSoup(text, "html.parser")


In [10]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Removing Special Characters

In [11]:
# Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    # Define the pattern for removing non-alphanumeric characters and spaces
    pattern = r'[^a-zA-z0-9\s]'
    
    # Use regular expression to substitute the pattern with an empty string
    text = re.sub(pattern, '', text)
    
    return text

# Apply the remove_special_characters function on the 'Review' column of the DataFrame
df['Review'] = df['Review'].apply(remove_special_characters)


In [12]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4 experience hotel monaco seatt...,3
3,unique great stay wonderful time hotel monaco ...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time staying charm not 5s...,5
20487,great location price view hotel great quick pl...,4
20488,ok just looks nice modern outside desk staff n...,2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Stemming The Text

In [13]:
# Import the necessary library
import nltk

# Define a function for simple stemming
def simple_stemmer(text):
    # Initialize the Porter stemmer from NLTK
    ps = nltk.porter.PorterStemmer()
    
    # Apply stemming to each word in the text
    # Split the text into words, apply stemming to each word, and join them back into a space-separated string
    text = ' '.join([ps.stem(word) for word in text.split()])
    
    # Return the processed text
    return text

# Apply the simple_stemmer function to the 'Review' column in the DataFrame
df['Review'] = df['Review'].apply(simple_stemmer)


In [14]:
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,4
1,ok noth special charg diamond member hilton de...,2
2,nice room not 4 experi hotel monaco seattl goo...,3
3,uniqu great stay wonder time hotel monaco loca...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time stay charm not 5star...,5
20487,great locat price view hotel great quick place...,4
20488,ok just look nice modern outsid desk staff nt ...,2
20489,hotel theft ruin vacat hotel open sept 17 2007...,1


In [15]:
# Import the NLTK library
import nltk

# Download the NLTK stopwords dataset
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Normalization

In [16]:
#Initialize the ToktokTokenizer for text tokenization
tokenizer=ToktokTokenizer()
#Setting English stopwords using NLTK
stopword_list=nltk.corpus.stopwords.words('english')
# The 'tokenizer' object can be used to tokenize text, breaking it down into individual units like words or sentences.
# Tokenization is a crucial step in natural language processing (NLP) tasks.

# 'stopword_list' contains common English stopwords. 
# Stopwords are words that are frequently used in a language but often do not contribute much to the meaning of a text.
# They are commonly removed during text processing to focus on more meaningful words.

# Now you can use the 'tokenizer' object and 'stopword_list' for further text processing and analysis.


In [17]:
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,4
1,ok noth special charg diamond member hilton de...,2
2,nice room not 4 experi hotel monaco seattl goo...,3
3,uniqu great stay wonder time hotel monaco loca...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time stay charm not 5star...,5
20487,great locat price view hotel great quick place...,4
20488,ok just look nice modern outsid desk staff nt ...,2
20489,hotel theft ruin vacat hotel open sept 17 2007...,1


# Removing StopWords

In [18]:
# Set English stopwords using NLTK
stop = set(stopwords.words('english'))

# Print the set of stopwords
print(stop)

# Define a function to remove stopwords from text
def remove_stopwords(text, is_lower_case=False):
    # Tokenize the text using the previously defined 'tokenizer'
    tokens = tokenizer.tokenize(text)
    
    # Remove leading and trailing whitespaces from each token
    tokens = [token.strip() for token in tokens]
    
    # Check if the text is in lowercase or not
    if is_lower_case:
        # Remove stopwords from tokens (case-sensitive)
        filtered_tokens = [token for token in tokens if token not in stop]
    else:
        # Remove stopwords from tokens (case-insensitive)
        filtered_tokens = [token for token in tokens if token.lower() not in stop]
    
    # Join the filtered tokens back into a text string
    filtered_text = ' '.join(filtered_tokens)
    
    # Return the text with stopwords removed
    return filtered_text

# Apply the remove_stopwords function to the 'Review' column in the DataFrame
df['Review'] = df['Review'].apply(remove_stopwords)


{'after', 'it', 'what', 'as', 'above', "mightn't", 'in', 'by', "you'd", 'but', 'to', 'from', 'herself', 'between', 'further', "needn't", "weren't", 'm', "don't", 'your', 'nor', 'a', 're', 'them', 'were', 'couldn', 'then', "aren't", 'we', 'against', 'didn', 'off', 'here', 'out', 'her', 'wouldn', 've', 'before', 'through', 'ain', 'has', "you've", 'hers', 'about', 'she', 'his', 'are', 'you', 'under', 'be', 'when', 'same', "shouldn't", 'our', 'yourself', 'an', 'needn', 's', "you'll", 'can', 'with', 'been', 'those', "hasn't", 'where', 'such', 'own', 'mustn', 'don', 'they', 'and', 'than', "should've", 'most', 'which', 'should', 'myself', 'haven', 'was', 'why', 'doesn', 'not', 'aren', 'their', 'for', 'again', 'mightn', 'any', 'now', 'ourselves', 'doing', 'until', 'ma', 'over', 'i', 'because', 'o', "haven't", 'both', 'at', 'is', "won't", 'this', 'yours', 'my', 'each', 'isn', 'had', "she's", 'whom', 'just', 'll', 'only', 'these', 'hadn', 'very', 't', "mustn't", 'few', 'him', 'if', 'y', 'he', 'o

In [19]:
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,4
1,ok noth special charg diamond member hilton de...,2
2,nice room 4 experi hotel monaco seattl good ho...,3
3,uniqu great stay wonder time hotel monaco loca...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time stay charm 5star ca ...,5
20487,great locat price view hotel great quick place...,4
20488,ok look nice modern outsid desk staff nt parti...,2
20489,hotel theft ruin vacat hotel open sept 17 2007...,1


# Splitting Dataset Into Training And Testing 

In [20]:
# Assuming the total number of rows in your DataFrame is 20490

# Selecting 80% of the data for training (16392 out of 20490)
norm_train_reviews = df.Review[:16392]

# Accessing the first review in the training set
first_review_in_training_set = norm_train_reviews[0]# Assuming the total number of rows in your DataFrame is 20490

# Selecting 20% of the data for testing (4098 out of 20490)
norm_test_reviews = df.Review[4098:]

# This line sets aside a portion of the data for testing purposes.
# It assumes that the first 80% of the data is used for training (as mentioned in previous code).

# The test set typically starts from a specific index (4098 in this case) and goes to the end of the DataFrame.
# The size of the test set is determined by the remaining 20% of the data.

# It is a common practice to split the data into training and testing sets to evaluate the model's performance on unseen data.


# Print the first review in the training set
print(first_review_in_training_set)


nice hotel expens park got good deal stay hotel anniversari arriv late even took advic previou review valet park check quick easi littl disappoint nonexist view room room clean nice size bed comfort woke stiff neck high pillow soundproof like heard music room night morn loud bang door open close hear peopl talk hallway mayb noisi neighbor aveda bath product nice goldfish stay nice touch taken advantag stay longer locat great walk distanc shop overal nice experi pay 40 park night


In [21]:
# Assuming the total number of rows in your DataFrame is 20490

# Selecting 20% of the data for testing (4098 out of 20490)
norm_test_reviews = df.Review[4098:]

# This line sets aside a portion of the data for testing purposes.
# It assumes that the first 80% of the data is used for training (as mentioned in previous code).

# The test set typically starts from a specific index (4098 in this case) and goes to the end of the DataFrame.
# The size of the test set is determined by the remaining 20% of the data.

# It is a common practice to split the data into training and testing sets to evaluate the model's performance on unseen data.


In [22]:
#TERM FREQUENCY INVERSE DOCUMENT FREQUENCY MODEL(TFIDF)
#IT IS USED TO CONVERT TEXT DOCUMENT TO MATRIX OF TFIDF FEATURE

In [23]:
pip install scikit-learn





[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
# Importing CountVectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# Initializing CountVectorizer with specified parameters
cv = CountVectorizer(min_df=0.0, max_df=1, binary=False, ngram_range=(1, 3))

# Transforming training reviews using CountVectorizer
cv_train_reviews = cv.fit_transform(norm_train_reviews)

# Transforming test reviews using the same CountVectorizer
cv_test_reviews = cv.transform(norm_test_reviews)

# Printing the shape of the transformed training and test reviews
print('BOW_cv_train:', cv_train_reviews.shape)
print('BOW_cv_test:', cv_test_reviews.shape)

# The CountVectorizer is used to convert a collection of text documents into a matrix of token counts.
# min_df is the minimum document frequency, max_df is the maximum document frequency,
# binary is set to False to get the count of each word, and ngram_range is set to (1, 3) for unigrams, bigrams, and trigrams.

# The transformed reviews are represented as sparse matrices where each row corresponds to a document,
# and each column corresponds to a unique word in the vocabulary.

# The vocabulary of the CountVectorizer can be obtained using:
# vocab = cv.get_feature_names()
# This will return a list of feature names (words) used 


BOW_cv_train: (16392, 1838470)
BOW_cv_test: (16393, 1838470)


In [25]:
# Importing TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing TfidfVectorizer with specified parameters
tv = TfidfVectorizer(min_df=0.0, max_df=1, use_idf=True, ngram_range=(1, 3))

# Transforming training reviews using TfidfVectorizer
tv_train_reviews = tv.fit_transform(norm_train_reviews)

# Transforming test reviews using the same TfidfVectorizer
tv_test_reviews = tv.transform(norm_test_reviews)

# Printing the shape of the transformed training and test reviews
print('Tfidf_train:', tv_train_reviews.shape)
print('Tfidf_test:', tv_test_reviews.shape)

# TfidfVectorizer is used to convert a collection of text documents into a matrix of term frequency-inverse document frequency (TF-IDF) features.
# min_df is the minimum document frequency, max_df is the maximum document frequency,
# use_idf is set to True to enable inverse-document-frequency reweighting, and ngram_range is set to (1, 3) for unigrams, bigrams, and trigrams.

# The transformed reviews are represented as sparse matrices where each row corresponds to a document,
# and each column corresponds to a unique n-gram in the vocabulary.

# The shape of the resulting matrices indicates the number of documents and the size of the vocabulary.


Tfidf_train: (16392, 1838470)
Tfidf_test: (16393, 1838470)


In [26]:
# Importing necessary libraries
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Assuming df is your DataFrame
df['Rating'] = df['Rating'].astype(str)

# Labeling the sentiment data
lb = LabelBinarizer()
sentiment_data = lb.fit_transform(df['Rating'])

# Printing the shape and classes of the sentiment data
print(sentiment_data.shape)
print(lb.classes_)

# Extracting the column representing the sentiment labels
sentiment_labels = lb.classes_

# Splitting sentiment data into training and testing sets
train_sentiments = sentiment_data[:16392, 0]  # Use the first column
test_sentiments = sentiment_data[4098:, 0]   # Use the first column

# Initializing Logistic Regression model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

# Fitting the model for Bag of Words features
lr_bow = lr.fit(cv_train_reviews, train_sentiments)
print(lr_bow)

# Fitting the model for TF-IDF features
lr_tfidf = lr.fit(tv_train_reviews, train_sentiments)
print(lr_tfidf)


(20491, 5)
['1' '2' '3' '4' '5']
LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [27]:
df

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,4
1,ok noth special charg diamond member hilton de...,2
2,nice room 4 experi hotel monaco seattl good ho...,3
3,uniqu great stay wonder time hotel monaco loca...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret 3rd time stay charm 5star ca ...,5
20487,great locat price view hotel great quick place...,4
20488,ok look nice modern outsid desk staff nt parti...,2
20489,hotel theft ruin vacat hotel open sept 17 2007...,1


# LOGISTIC REGRESSION MODEL PERFORMANCE ON TEST DATASET

In [28]:
# Predicting the sentiment using the trained logistic regression model on Bag of Words features
lr_bow_predict = lr.predict(cv_test_reviews)
print("Predictions for Bag of Words features:", lr_bow_predict)

# Predicting the sentiment using the trained logistic regression model on TF-IDF features
lr_tfidf_predict = lr.predict(tv_test_reviews)
print("Predictions for TF-IDF features:", lr_tfidf_predict)


Predictions for Bag of Words features: [0 0 0 ... 0 0 0]
Predictions for TF-IDF features: [0 0 0 ... 0 0 0]


# ACCURACY OF THE MODEL


In [29]:
# Importing necessary metrics from scikit-learn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predicting the sentiment using the trained logistic regression model on Bag of Words features
lr_bow_predict = lr.predict(cv_test_reviews)

# Displaying classification report, confusion matrix, and accuracy score for Bag of Words predictions
print("Classification Report for Bag of Words predictions:")
print(classification_report(test_sentiments, lr_bow_predict))
print("Confusion Matrix for Bag of Words predictions:")
print(confusion_matrix(test_sentiments, lr_bow_predict))
print("Accuracy Score for Bag of Words predictions:", accuracy_score(test_sentiments, lr_bow_predict))

# Predicting the sentiment using the trained logistic regression model on TF-IDF features
lr_tfidf_predict = lr.predict(tv_test_reviews)

# Displaying classification report, confusion matrix, and accuracy score for TF-IDF predictions
print("\nClassification Report for TF-IDF predictions:")
print(classification_report(test_sentiments, lr_tfidf_predict))
print("Confusion Matrix for TF-IDF predictions:")
print(confusion_matrix(test_sentiments, lr_tfidf_predict))
print("Accuracy Score for TF-IDF predictions:", accuracy_score(test_sentiments, lr_tfidf_predict))


Classification Report for Bag of Words predictions:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     15342
           1       1.00      0.79      0.88      1051

    accuracy                           0.99     16393
   macro avg       0.99      0.90      0.94     16393
weighted avg       0.99      0.99      0.99     16393

Confusion Matrix for Bag of Words predictions:
[[15342     0]
 [  220   831]]
Accuracy Score for Bag of Words predictions: 0.9865796376502165

Classification Report for TF-IDF predictions:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     15342
           1       0.00      0.00      0.00      1051

    accuracy                           0.94     16393
   macro avg       0.47      0.50      0.48     16393
weighted avg       0.88      0.94      0.90     16393

Confusion Matrix for TF-IDF predictions:
[[15342     0]
 [ 1051     0]]
Accuracy Score for TF-IDF predict

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# USING NAIVE BAYES

In [30]:
# Importing Multinomial Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import MultinomialNB


In [31]:
# Creating an instance of the Multinomial Naive Bayes classifier
mnb = MultinomialNB()

# Fitting the Multinomial Naive Bayes model for Bag of Words features
mnb_bow = mnb.fit(cv_train_reviews, train_sentiments)
print("Multinomial Naive Bayes Model for Bag of Words features:", mnb_bow)

# Fitting the Multinomial Naive Bayes model for TF-IDF features
mnb_tfidf = mnb.fit(tv_train_reviews, train_sentiments)
print("Multinomial Naive Bayes Model for TF-IDF features:", mnb_tfidf)


Multinomial Naive Bayes Model for Bag of Words features: MultinomialNB()
Multinomial Naive Bayes Model for TF-IDF features: MultinomialNB()


# PREDICTING THE MODEL

In [32]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[1 0 0 ... 0 1 0]
[0 0 0 ... 0 0 0]


# ACCURACY USING MULTONOMIAL NAIVE BAYES

In [33]:
# Accuracy score for Bag of Words predictions
mnb_bow_score = accuracy_score(test_sentiments, mnb_bow_predict)
print("Accuracy Score for Bag of Words predictions:", mnb_bow_score)

# Accuracy score for TF-IDF predictions
mnb_tfidf_score = accuracy_score(test_sentiments, mnb_tfidf_predict)
print("Accuracy Score for TF-IDF predictions:", mnb_tfidf_score)


Accuracy Score for Bag of Words predictions: 0.945769535777466
Accuracy Score for TF-IDF predictions: 0.9367412920148844
