In [1]:
from nltk.corpus import wordnet
import string
import numpy as np
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


  from numpy.core.umath_tests import inner1d


In [2]:
#Read the Hotel Reviews csv file
import pandas as pd
Hotel_Reviews = pd.read_csv("Hotel_Reviews.csv")

In [3]:
Hotel_Reviews.head(5)

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


In [4]:
#Get all the positive and negative reviews into one column
Hotel_Reviews["reviews"] = Hotel_Reviews["Negative_Review"] + Hotel_Reviews["Positive_Review"]

#Create labels for good and bad reviews
Hotel_Reviews["is_bad_review"] = Hotel_Reviews["Reviewer_Score"].apply(lambda x: 1 if x < 5 else 0)
Hotel_Reviews["is_good_review"] = Hotel_Reviews["Reviewer_Score"].apply(lambda x: 1 if x > 5 else 0)

#Create a new dataframe with the relevant features
Hotel_Reviews = Hotel_Reviews[["reviews", "is_bad_review","is_good_review"]]
Hotel_Reviews.head()

Unnamed: 0,reviews,is_bad_review,is_good_review
0,I am so angry that i made this post available...,1,0
1,No Negative No real complaints the hotel was g...,0,1
2,Rooms are nice but for elderly a bit difficul...,0,1
3,My room was dirty and I was afraid to walk ba...,1,0
4,You When I booked with your company on line y...,0,1


In [5]:
##Remove the "No negative" and "No positive" words from the reviews
##If no comment was left by user then "No negative" or "No positive" is written instead
Hotel_Reviews["reviews"] = Hotel_Reviews["reviews"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))

In [6]:
#Take a sample size from the dataset 

hot_reviews= Hotel_Reviews.sample(frac = 0.1, replace = False, random_state=42)


In [7]:
##Inspired from https://nlpforhackers.io/sentiment-analysis-intro/
#Wordnet is used for POS tagging 
def reviews_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
##Inspired from https://krakensystems.co/blog/2019/sentiment-analysis-rule-based
def reviews_clean(reviews):
    # lower text
    reviews = reviews.lower()
    # tokenize text and remove puncutation
    reviews = [word.strip(string.punctuation) for word in reviews.split(" ")]
    # remove words that contain numbers
    reviews = [word for word in reviews if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    reviews = [x for x in reviews if x not in stop]
    # remove empty tokens
    reviews = [t for t in reviews if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(reviews)
    # lemmatize text
    reviews = [WordNetLemmatizer().lemmatize(t[0], reviews_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    reviews = [t for t in reviews if len(t) > 1]
    # join all
    reviews = " ".join(reviews)
    return(reviews)

# clean reviews data
hot_reviews["clean_reviews"] = hot_reviews["reviews"].apply(lambda x: reviews_clean(x))

In [9]:
## add sentiment anaylsis columns
#Vader is used because it uses a lexicon of words to find which ones are positive or negative
#It returns the following score
 #Negative score
 #Positive score
 #Neutral score
 #overall score

from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyser = SentimentIntensityAnalyzer()
hot_reviews["sentiments_reviews"] = hot_reviews["reviews"].apply(lambda x: senti_analyser.polarity_scores(x))
hot_reviews = pd.concat([hot_reviews.drop(['sentiments_reviews'], axis=1), hot_reviews['sentiments_reviews'].apply(pd.Series)], axis=1)



In [10]:
#Simple metrics to count the number of words and characters in each of the reviews
# add number of characters column
hot_reviews["number_of_chars"] = hot_reviews["reviews"].apply(lambda x: len(x))

# add number of words column
hot_reviews["number_of_words"] = hot_reviews["reviews"].apply(lambda x: len(x.split(" ")))

In [11]:
#TF-IDF is used for:
#TF calculate the number of times a word appears in a text
#IDF calculates the importance of this word
# add tf-idfs columns

review_vector = TfidfVectorizer(min_df = 10)
result = review_vector.fit_transform(hot_reviews["clean_reviews"]).toarray()
rev_tfidf = pd.DataFrame(result, columns = review_vector.get_feature_names())
rev_tfidf.columns = ["word_" + str(x) for x in rev_tfidf.columns]
rev_tfidf.index = hot_reviews.index
hot_reviews = pd.concat([hot_reviews, rev_tfidf], axis=1)

In [12]:
#View the dataset
hot_reviews.head(10)

Unnamed: 0,reviews,is_bad_review,is_good_review,clean_reviews,neg,neu,pos,compound,number_of_chars,number_of_words,...,word_yet,word_yoghurt,word_yogurt,word_young,word_yr,word_yummy,word_zero,word_ziggo,word_zone,word_zuid
488440,Would have appreciated a shop in the hotel th...,0,1,would appreciate shop hotel sell drinking wate...,0.049,0.617,0.334,0.9924,599,113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274649,No tissue paper box was present at the room,0,1,tissue paper box present room,0.216,0.784,0.0,-0.296,44,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
374688,Pillows Nice welcoming and service,0,1,pillow nice welcome service,0.0,0.345,0.655,0.6908,36,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404352,Everything including the nice upgrade The Hot...,0,1,everything include nice upgrade hotel revamp s...,0.0,0.621,0.379,0.9153,155,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
451596,Lovely hotel v welcoming staff,0,1,lovely hotel welcome staff,0.0,0.23,0.77,0.7717,32,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302161,They don t have free wifi The location is per...,0,1,free wifi location perfect lot time want look ...,0.0,0.735,0.265,0.8074,130,32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
317079,Room generally a bit shabby with some lack of...,0,1,room generally bit shabby lack maintenance cru...,0.04,0.854,0.106,0.5859,318,57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13963,Executive rooms 9th Floor don t have a bath T...,0,1,executive room floor bath website make look li...,0.047,0.823,0.13,0.8316,483,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159785,Pity about the two days of rain Its centralit...,0,1,pity two day rain centrality proximity destina...,0.155,0.845,0.0,-0.296,76,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195089,Didn t like it at all construction was in pro...,1,0,like construction progress stuff lie vacancy l...,0.108,0.66,0.231,0.6369,186,42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Highest sentiment positive reviews
hot_reviews[hot_reviews["number_of_words"] >= 3].sort_values("pos", ascending = False)[["reviews", "pos"]].head(10)

Unnamed: 0,reviews,pos
43101,A perfect location comfortable great value,0.931
211742,Clean comfortable lovely staff,0.907
175551,Friendly welcome Comfortable room,0.905
365085,Good location great value,0.904
109564,Clean friendly and comfortable,0.902
145743,Good value amazing location,0.901
407590,breakfast excellent Clean comfort,0.899
407546,Great place I enjoyed,0.881
218571,Beautiful Quirky Comfortable,0.878
436901,Lovely comfortable rooms,0.877


In [16]:
##Lowest negative reviews
hot_reviews[hot_reviews["number_of_words"] >= 3].sort_values("neg", ascending = False)[["reviews", "neg"]].head(10)

Unnamed: 0,reviews,neg
474353,Poor,1.0
398554,Bad A C,1.0
150066,Everything poor Poor,0.861
67905,Nothing Excellent value,0.834
193086,No dislikes LOCATION,0.831
322590,No complaints Everything,0.831
317503,Nothing Friendly clean,0.83
356368,Nothing Great helpful wonderful staff,0.812
182887,Weird uncomfortable bathtub,0.811
318516,A disaster Nothing,0.804


In [33]:
# feature selection
label = ["is_good_review"]
columns = [label,"reviews","review_clean"]
features = [c for c in hot_reviews.columns if c not in columns]


In [None]:
#Split the data into train and test set
from sklearn.model_selection import train_test_split

train,test = train_test_split(reviews[features], test_size=0.8, random_state=42)
test,test1 = train_test_split(test, test_size=0.67, random_state=42)
test1,test2 = train_test_split(test1, test_size=0.5, random_state=42)

print(train.shape);print(test.shape);print(test1.shape);print(test2.shape)

In [None]:
#Apply the SVM model to the first set of data
##Inspired from https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1
cl = svm.SVC(kernel='linear')
cl.fit(train[features], train['is_good_review'])
pred = cl.predict(test)
print("Results for SVC(kernel=linear)")
report = classification_report(test['is_good_review'], pred, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

In [None]:
#Apply the SVM model to the second set of data
cl = svm.SVC(kernel='linear')
cl.fit(train[features], train['is_good_review'])
pred = cl.predict(test1)
print("Results for SVC(kernel=linear)")
report = classification_report(test1['is_good_review'], pred, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

In [None]:
#Apply the SVM model to the third set of data
cl = svm.SVC(kernel='linear')
cl.fit(train[features], train['is_good_review'])
pred = cl.predict(test2)
print("Results for SVC(kernel=linear)")
report = classification_report(test2['is_good_review'], pred, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

In [None]:
#For second experiment
#Cleaning the data
def reviews_to_lowercase(reviews):
    review = []
    for rev in reviews:
        text = rev.lower()
        review.append(text)
    return review

def reviews_numbers(reviews):
    review = [word for word in reviews if not any(c.isdigit() for c in word)]
    return review

def reviews_punctuatation(reviews):
    review = []
    for rev in reviews:
        text = re.sub(r'[^\w\s]', '', rev)
        if text != '':
            review.append(text)
    return review

def review_stopwords(reviews):
    review = []
    for rev in reviews:
        if rev not in stopwords.words('english'):
            review.append(rev)
    return review

def review_empty_tokens(reviews):
    review = [t for t in reviews if len(t) > 0]
    return review

def lemmatize_review(reviews):
    # function to get the pos tag reviews
    def reviews_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ
        elif pos_tag.startswith('V'):
            return wordnet.VERB
        elif pos_tag.startswith('N'):
            return wordnet.NOUN
        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    # pos tag reviews 
    pos_tags = pos_tag(reviews)
    
    #lemmatize the reviews using WordLemmtizer
    review = [WordNetLemmatizer().lemmatize(t[0], reviews_pos(t[1])) for t in pos_tags]
    return review

def review_word(reviews):
    #after lemmatization remove words that have only one letter
    review = [t for t in reviews if len(t) > 1]
    return review

def clean_reviews(reviews):
    reviews = reviews_to_lowercase(reviews)
    reviews = reviews_numbers(reviews)
    reviews = reviews_punctuatation(reviews)
    reviews = review_stopwords(reviews)
    reviews = review_empty_tokens(reviews)
    reviews = lemmatize_review(reviews)
    reviews = review_word(reviews)
    return reviews 