In [203]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px
# Fetch the VADER lexicon resource for the SentimentIntensityAnalyzer imported
# above; nltk reports "already up-to-date" when it is already present locally.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [204]:
# Read the scraped review dump and normalise every header to lower case so
# later cells can refer to columns without worrying about capitalisation.
data = pd.read_csv('Finalised_Total_Reviews.csv')
data.columns = list(map(str.lower, data.columns))
data.head()

Unnamed: 0,name,review_count,average_rating,overall_experience,rating,title,comment_content,date,hotel_class,good_to_know
0,M Social Singapore,"2,042 reviews",4.0,"[4.2, 4.5, 4.2, 4.0]",5.0,"""Exceptional Experience at M Social Hotel”",I had a similarly extraordinary experience dur...,Date of stay: September 2023,4.0 of 5 stars,"['', 'Boutique', 'Trendy', 'English, Chinese, ..."
1,M Social Singapore,"2,042 reviews",4.0,"[4.2, 4.5, 4.2, 4.0]",5.0,Armando at the front desk,Armando of the front desk was kind enough to l...,Date of stay: September 2023,4.0 of 5 stars,"['', 'Boutique', 'Trendy', 'English, Chinese, ..."
2,M Social Singapore,"2,042 reviews",4.0,"[4.2, 4.5, 4.2, 4.0]",5.0,The best stay experience in Singapore,We spend two wonderful nights at M Social Sing...,Date of stay: September 2023,4.0 of 5 stars,"['', 'Boutique', 'Trendy', 'English, Chinese, ..."
3,M Social Singapore,"2,042 reviews",4.0,"[4.2, 4.5, 4.2, 4.0]",5.0,A much needed STAYCATION,I have been frequenting M Social every time I ...,Date of stay: September 2023,4.0 of 5 stars,"['', 'Boutique', 'Trendy', 'English, Chinese, ..."
4,M Social Singapore,"2,042 reviews",4.0,"[4.2, 4.5, 4.2, 4.0]",5.0,Super location super staff .,Excellent location great hotel... especially A...,Date of stay: September 2023,4.0 of 5 stars,"['', 'Boutique', 'Trendy', 'English, Chinese, ..."


Here, we keep only the columns that are needed for sentiment analysis.

In [205]:
# Keep just the fields used by the sentiment analysis below.
# DataFrame.filter returns a new frame, so `data` itself is left untouched.
df = data.filter(
    items=['name', 'average_rating', 'rating', 'comment_content', 'date'],
    axis='columns',
)
df.head()

Unnamed: 0,name,average_rating,rating,comment_content,date
0,M Social Singapore,4.0,5.0,I had a similarly extraordinary experience dur...,Date of stay: September 2023
1,M Social Singapore,4.0,5.0,Armando of the front desk was kind enough to l...,Date of stay: September 2023
2,M Social Singapore,4.0,5.0,We spend two wonderful nights at M Social Sing...,Date of stay: September 2023
3,M Social Singapore,4.0,5.0,I have been frequenting M Social every time I ...,Date of stay: September 2023
4,M Social Singapore,4.0,5.0,Excellent location great hotel... especially A...,Date of stay: September 2023


In [206]:
# Non-null entries per column; a column with a lower count has missing values.
df.count()

name               51709
average_rating     51709
rating             51709
comment_content    51709
date               51699
dtype: int64

In [207]:
# Drop every review that has a missing field (in the counts above only `date`
# has gaps). Rebinding the name instead of using inplace=True keeps the cell a
# plain "input -> output" step and avoids mutating a frame that an earlier
# cell already displayed.
df = df.dropna()
df.count()

name               51699
average_rating     51699
rating             51699
comment_content    51699
date               51699
dtype: int64

In [208]:
# Hotel whose reviews are analysed in the rest of the notebook. The value is
# compared for exact equality against the `name` column, so it must match the
# spelling used in the dataset.
hotel_name = 'AM Hotel'

In [209]:
# Keep only the selected hotel's reviews.
# The mask is a 1-D boolean Series aligned on the index (the previous version
# compared a 2-D (n, 1) array taken from `.values`, which only works as long as
# pandas tolerates a 2-D mask). `.copy()` makes the result an independent frame
# so the columns added in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df['name'] == hotel_name].copy()

In [210]:
# Distribution of star ratings among the selected hotel's reviews.
df['rating'].value_counts()

5.0    139
4.0     67
2.0      7
3.0      4
1.0      3
Name: rating, dtype: int64

In [211]:
# Instantiate NLTK's VADER analyzer; its polarity_scores() method is applied
# to every review text in the next cell.
analyzer = SentimentIntensityAnalyzer()

In [212]:
# Run VADER over each review: `scores` holds the full polarity dict
# (neg / neu / pos / compound) and `compound_score` its overall value.
df['scores'] = df['comment_content'].apply(analyzer.polarity_scores)
df['compound_score'] = df['scores'].apply(lambda polarity: polarity['compound'])

In [213]:
def sentiment(score, threshold=0.25):
    """Convert a compound polarity score into a discrete sentiment label.

    Parameters
    ----------
    score : float
        Compound score produced by the analyzer for one review.
    threshold : float, default 0.25
        Half-width of the neutral band around zero. Scores strictly above
        ``threshold`` are positive, scores strictly below ``-threshold`` are
        negative. The default reproduces the cut-offs originally hard-coded
        in this notebook.

    Returns
    -------
    int
        ``1`` for positive, ``-1`` for negative and ``0`` for neutral. A score
        lying exactly on either boundary, or a NaN score, is neutral because
        both strict comparisons are false.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, which would make the bands overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0

# Discretise every compound score into a 1 / 0 / -1 label with sentiment().
df['sentiment'] = df['compound_score'].apply(sentiment)

In [214]:
# Preview the frame with the new scores, compound_score and sentiment columns.
df.head()

Unnamed: 0,name,average_rating,rating,comment_content,date,scores,compound_score,sentiment
9660,AM Hotel,4.5,4.0,I enjoyed my stay at the AM Hotel. It’s a budg...,Date of stay: August 2023,"{'neg': 0.027, 'neu': 0.819, 'pos': 0.155, 'co...",0.9863,1
9661,AM Hotel,4.5,4.0,Hotel amenities; Simple and Basic. Ideal for ...,Date of stay: March 2023,"{'neg': 0.0, 'neu': 0.914, 'pos': 0.086, 'comp...",0.5267,1
9662,AM Hotel,4.5,5.0,AM hotel Singapore nice and cozy place Jun 202...,Date of stay: June 2023,"{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou...",0.9647,1
9663,AM Hotel,4.5,5.0,Am hotel nice and clean also super market near...,Date of stay: February 2023,"{'neg': 0.101, 'neu': 0.654, 'pos': 0.245, 'co...",0.9342,1
9664,AM Hotel,4.5,2.0,"When I booked this hotel, I asked to check in ...",Date of stay: December 2022,"{'neg': 0.027, 'neu': 0.912, 'pos': 0.061, 'co...",0.9737,1


We check the reviews that the program classifies as 'Neutral'. This lets us compare the predicted sentiment against the user's rating. Sometimes a user writes a neutrally worded review but still rates the hotel 4.0 or above, implying a "positive" rating.

In [215]:
# Show the rows (reviews) whose predicted sentiment is neutral.
is_neutral = df['sentiment'] == 0
df[is_neutral]

Unnamed: 0,name,average_rating,rating,comment_content,date,scores,compound_score,sentiment
9717,AM Hotel,4.5,1.0,Read TA reviews then booked and prepaid throug...,Date of stay: November 2018,"{'neg': 0.1, 'neu': 0.801, 'pos': 0.099, 'comp...",0.2118,0
9795,AM Hotel,4.5,2.0,This could be a quaint boutique budget hotel w...,Date of stay: November 2017,"{'neg': 0.078, 'neu': 0.843, 'pos': 0.079, 'co...",0.0093,0


In [216]:
# Number of reviews in each predicted sentiment class (1, 0, -1).
df['sentiment'].value_counts()

 1    211
-1      7
 0      2
Name: sentiment, dtype: int64

In [217]:
# Distribution of the predicted sentiment labels for the selected hotel.
# A title and a readable axis label let the figure stand on its own when the
# notebook is skimmed.
fig = px.histogram(
    df,
    x="sentiment",
    title=f"VADER sentiment of reviews for {hotel_name}",
    labels={"sentiment": "Predicted sentiment (-1 negative, 0 neutral, 1 positive)"},
)
fig.show()

# Benchmarking

In [218]:
# compare across dataframe
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [219]:
def senti(x):
    """Turn a star rating into a sentiment label.

    A rating of exactly 3.0 is neutral (0), anything above 3.0 is
    positive (1) and every other value is negative (-1).
    """
    if x == 3.0:
        return 0
    return 1 if x > 3.0 else -1

# Build the reference labels from the untouched star ratings in `data` rather
# than from df['rating'] itself. This cell overwrites df['rating'], so feeding
# it its own output on a re-run would map every label (1, 0 and -1 are all
# below 3.0) to -1. The earlier filtering steps keep the original index
# labels, so df.index selects exactly the matching raw rows.
df['rating'] = data.loc[df.index, 'rating'].apply(senti)

In [220]:
# Class balance of the rating-derived labels after the senti() mapping.
df['rating'].value_counts()

 1    206
-1     10
 0      4
Name: rating, dtype: int64

In [221]:
# There are three classes, so uniform random guessing is right only about a
# third of the time, not half. The labels are also heavily skewed towards
# positive (206 of the 220 AM Hotel reviews above), so the meaningful baseline
# is "always predict positive" (~0.94); accuracy alone says little here.
accuracy_score(y_true=df['rating'], y_pred=df['sentiment'])

0.9454545454545454

In [222]:
# Per-class precision / recall: rating-derived labels are the reference,
# VADER-derived labels are the prediction.
print(classification_report(y_true=df['rating'], y_pred=df['sentiment']))

              precision    recall  f1-score   support

          -1       0.57      0.40      0.47        10
           0       0.00      0.00      0.00         4
           1       0.97      0.99      0.98       206

    accuracy                           0.95       220
   macro avg       0.51      0.46      0.48       220
weighted avg       0.93      0.95      0.94       220

