In [64]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px
# fetch the VADER lexicon used by SentimentIntensityAnalyzer (skipped by nltk when already up to date)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [65]:
# Read the hotel reviews CSV and normalise every column name to lower case.
data = pd.read_csv(
    '/content/Hotel_Review-Singapore_Marriott_Tang_Plaza_Hotel-Singapore.html_reviews.csv'
).rename(columns=str.lower)
data.head()

Unnamed: 0,comment_content,name,rating,date
0,Crossroad Restaurant : no service,Singapore Marriott Tang Plaza Hotel,10,Date of stay: September 2023
1,Big thanks to Sarina! You made our stay special,Singapore Marriott Tang Plaza Hotel,50,Date of stay: September 2023
2,"Stay for the location, not much else",Singapore Marriott Tang Plaza Hotel,30,Date of stay: September 2023
3,Cross road,Singapore Marriott Tang Plaza Hotel,50,Date of stay: August 2023
4,Traveler Beware,Singapore Marriott Tang Plaza Hotel,10,Date of stay: September 2023


Here, we keep only the columns that are needed for sentiment analysis.

In [66]:
# Keep only the columns relevant to sentiment analysis. DataFrame.filter
# silently skips requested labels the frame does not have (here
# 'average_rating'), so only the existing columns survive.
df = data.filter(
    items=['name', 'average_rating', 'rating', 'comment_content'],
    axis='columns',
)
df.head()

Unnamed: 0,name,rating,comment_content
0,Singapore Marriott Tang Plaza Hotel,10,Crossroad Restaurant : no service
1,Singapore Marriott Tang Plaza Hotel,50,Big thanks to Sarina! You made our stay special
2,Singapore Marriott Tang Plaza Hotel,30,"Stay for the location, not much else"
3,Singapore Marriott Tang Plaza Hotel,50,Cross road
4,Singapore Marriott Tang Plaza Hotel,10,Traveler Beware


In [67]:
# Ratings are stored multiplied by ten (10-50); rescale them to 1-5 stars.
# Anything below 10 is masked to NaN instead of being filtered out: masking
# keeps the result the same length as the frame and aligned row by row, and
# the dropna() that follows removes those rows.
df['rating'] = df['rating'].where(df['rating'] >= 10) / 10

In [68]:
# number of non-null entries in each column
df.count()

name               100
rating             100
comment_content    100
dtype: int64

In [69]:
# drop every row that has a missing value in any column (modifies df in place), then preview
df.dropna(inplace=True)
df.head()

Unnamed: 0,name,rating,comment_content
0,Singapore Marriott Tang Plaza Hotel,1.0,Crossroad Restaurant : no service
1,Singapore Marriott Tang Plaza Hotel,5.0,Big thanks to Sarina! You made our stay special
2,Singapore Marriott Tang Plaza Hotel,3.0,"Stay for the location, not much else"
3,Singapore Marriott Tang Plaza Hotel,5.0,Cross road
4,Singapore Marriott Tang Plaza Hotel,1.0,Traveler Beware


In [70]:
# how many reviews fall on each star rating
df['rating'].value_counts()

5.0    59
1.0    16
4.0    15
2.0     6
3.0     4
Name: rating, dtype: int64

In [71]:
# instantiate the VADER analyzer once; it is reused to score every review
analyzer = SentimentIntensityAnalyzer()

In [72]:
# Score every review comment with VADER: keep the full polarity dict in
# 'scores' and copy its 'compound' entry into a dedicated numeric column.
df['scores'] = df['comment_content'].map(analyzer.polarity_scores)
df['compound_score'] = df['scores'].map(lambda polarity: polarity['compound'])

In [73]:
def sentiment(score, threshold=0.25):
    """Convert a compound sentiment score into a class label.

    Parameters
    ----------
    score : float
        Compound polarity score to classify.
    threshold : float, default 0.25
        Half-width of the neutral band. Scores strictly above ``threshold``
        are positive; scores strictly below ``-threshold`` are negative.

    Returns
    -------
    int
        ``1`` for positive, ``-1`` for negative, ``0`` for neutral. A score
        exactly equal to ``threshold`` or ``-threshold`` is neutral, and so
        is NaN because both comparisons are false for it.
    """
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0

# label every review from its compound score
df['sentiment'] = df['compound_score'].map(sentiment)

In [74]:
# preview the reviews together with their scores and sentiment labels
df.head()

Unnamed: 0,name,rating,comment_content,scores,compound_score,sentiment
0,Singapore Marriott Tang Plaza Hotel,1.0,Crossroad Restaurant : no service,"{'neg': 0.423, 'neu': 0.577, 'pos': 0.0, 'comp...",-0.296,-1
1,Singapore Marriott Tang Plaza Hotel,5.0,Big thanks to Sarina! You made our stay special,"{'neg': 0.0, 'neu': 0.543, 'pos': 0.457, 'comp...",0.7088,1
2,Singapore Marriott Tang Plaza Hotel,3.0,"Stay for the location, not much else","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
3,Singapore Marriott Tang Plaza Hotel,5.0,Cross road,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
4,Singapore Marriott Tang Plaza Hotel,1.0,Traveler Beware,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0


We inspect the reviews that the program classifies as 'Neutral' and compare the predicted sentiment against the star rating. Sometimes a user writes a neutral-sounding review but still rates the stay 4.0 or above, which implies a "positive" experience.

In [75]:
# show the rows (reviews) whose predicted sentiment is neutral
df[df['sentiment'].eq(0)]

Unnamed: 0,name,rating,comment_content,scores,compound_score,sentiment
2,Singapore Marriott Tang Plaza Hotel,3.0,"Stay for the location, not much else","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
3,Singapore Marriott Tang Plaza Hotel,5.0,Cross road,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
4,Singapore Marriott Tang Plaza Hotel,1.0,Traveler Beware,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
7,Singapore Marriott Tang Plaza Hotel,5.0,Familystaycay,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
10,Singapore Marriott Tang Plaza Hotel,4.0,"Ok hotel, not excited to return","{'neg': 0.247, 'neu': 0.486, 'pos': 0.267, 'co...",0.0423,0
16,Singapore Marriott Tang Plaza Hotel,1.0,Unclear staff and discount menu discrepancy at...,"{'neg': 0.18, 'neu': 0.721, 'pos': 0.099, 'com...",-0.2263,0
20,Singapore Marriott Tang Plaza Hotel,3.0,Great Service; Terrible hardware,"{'neg': 0.337, 'neu': 0.217, 'pos': 0.446, 'co...",0.25,0
21,Singapore Marriott Tang Plaza Hotel,5.0,Unbeatable property,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
31,Singapore Marriott Tang Plaza Hotel,4.0,Experience staying at Singapore Marriott Tang ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0
33,Singapore Marriott Tang Plaza Hotel,1.0,Some issues.,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0


In [76]:
# how many reviews received each predicted sentiment label
df['sentiment'].value_counts()

 1    52
 0    32
-1    16
Name: sentiment, dtype: int64

In [77]:
# histogram of the predicted sentiment labels (-1, 0, 1)
fig = px.histogram(df, x="sentiment")
fig.show()

# Benchmarking

In [78]:
# compare across dataframe
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [79]:
def senti(x):
    """Map a star rating to a sentiment class.

    Ratings above 3.0 give 1 (positive), ratings from 2.0 to 3.0 inclusive
    give 0 (neutral), and everything else gives -1 (negative).
    """
    if x > 3.0:
        return 1
    return 0 if 2.0 <= x <= 3.0 else -1

# Replace the 1-5 star rating with its sentiment class so it can be compared
# with the VADER label. NOTE: this overwrites the column, so running the cell
# a second time re-maps the labels themselves (1, 0 and -1 are all below 2.0)
# and every row ends up as -1.
df['rating'] = df['rating'].map(senti)

In [80]:
# class balance of the rating-derived labels
df['rating'].value_counts()

 1    74
-1    16
 0    10
Name: rating, dtype: int64

In [81]:
# There are three classes, so uniform random guessing would score about 0.33,
# not 0.5. The more telling baseline is always predicting the majority class:
# 74 of the 100 rating labels counted above are positive, which alone gives
# 0.74 accuracy, so judge the score below against that rather than against 0.5.
accuracy_score(df['rating'], df['sentiment'])

0.63

In [82]:
# per-class precision / recall / F1, treating the rating-derived labels as ground truth
print(classification_report(df['rating'], df['sentiment']))

              precision    recall  f1-score   support

          -1       0.56      0.56      0.56        16
           0       0.12      0.40      0.19        10
           1       0.96      0.68      0.79        74

    accuracy                           0.63       100
   macro avg       0.55      0.55      0.52       100
weighted avg       0.81      0.63      0.70       100

