In [None]:
#import data from a csv file and store in a dataframe
import pandas as pd
df = pd.read_csv('../data/Hotel_Reviews.csv')




In [140]:
df.columns


Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng', 'Reviews', 'Neg_clean', 'Neg_short',
       'Negative_clean', 'Positive_clean', 'Reviews_clean'],
      dtype='object')

In [36]:
# Function to preprocess sentences
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned


0    angry made post available via possible site us...
1                                             negative
2    room nice elderly bit difficult room two story...
3    room dirty afraid walk barefoot floor looked c...
4    booked company line showed picture room though...
Name: Neg_clean, dtype: object

In [95]:
# Clean the reviews from the dataset
#removing invalid ones, e.g. empty reviews, everything is good/bad and reviews with less than 10 characters
# output the df with two new columns: 'Negative_clean' and 'Positive_clean'

# process the Negative_Review column
# strip() to remove leading and trailing whitespaces for one column of df
df['Negative_clean'] = df['Negative_Review'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.len() < 10, 'Negative_clean'] = ' '

# if the text has 'nothing' or 'everything' or 'anything' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.contains('nothing|everything|anything|No Negative', case=False), 'Negative_clean'] = ' '




In [96]:
# do the same process for Positive_Review column
df['Positive_clean'] = df['Positive_Review'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.len() < 10, 'Positive_clean'] = ' '

df['Positive_clean'].value_counts().head(30)

No Positive                   35946
                              22761
Everything                     2897
Great location                 1419
The location                   1341
Good location                  1203
Friendly staff                  603
Excellent location              547
everything                      371
Location and staff              371
Location was great              294
Location was good               277
Everything was perfect          276
Good breakfast                  248
Location is great               236
Perfect location                227
good location                   225
Location location location      218
Location staff                  215
Location is good                214
Breakfast was good              198
the location                    195
Very friendly staff             194
Every thing                     187
Location was excellent          187
Excellent breakfast             182
Great breakfast                 178
Comfortable bed             

In [97]:
# if the text has 'nothing' or 'everything' or 'No Positive' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.contains('nothing|everything|No Positive', case=False), 'Positive_clean'] = ' '
df['Positive_clean'].value_counts().head(30)


                              89417
Great location                 1419
The location                   1341
Good location                  1203
Friendly staff                  603
Excellent location              547
Location and staff              371
Location was great              294
Location was good               277
Good breakfast                  248
Location is great               236
Perfect location                227
good location                   225
Location location location      218
Location staff                  215
Location is good                214
Breakfast was good              198
the location                    195
Very friendly staff             194
Every thing                     187
Location was excellent          187
Excellent breakfast             182
Great breakfast                 178
Comfortable bed                 173
Cleanliness                     164
great location                  160
Location is perfect             154
Location is excellent       

In [98]:
# merger two 'Negative_clean', 'Positive_clean' columns and save to a new column 'Reviews', and save to a new column 'Reviews_clean'
df['Reviews_clean'] = df['Negative_clean'] + df['Positive_clean']
df['Reviews_clean']

0         I am so angry that i made this post available ...
1          No real complaints the hotel was great great ...
2         Rooms are nice but for elderly a bit difficult...
3         My room was dirty and I was afraid to walk bar...
4         You When I booked with your company on line yo...
                                ...                        
515733    no trolly or staff to help you take the luggag...
515734    The hotel looks like 3 but surely not 4Breakfa...
515735    The ac was useless It was a hot week in vienna...
515736     The rooms are enormous and really comfortable...
515737    I was in 3rd floor It didn t work Free Wifesta...
Name: Reviews_clean, Length: 515738, dtype: object

In [141]:
df['Reviews_clean'].value_counts()



                                                                                                                            27827
 Great location                                                                                                               492
 The location                                                                                                                 245
 Good location                                                                                                                217
 Excellent location                                                                                                           174
                                                                                                                            ...  
no changes neededcentral location nice staff clean rooms breakfast good selection complimentary water muffins                   1
The space the iron ruined partners shirt value of moneyThe bed was comfortable and the loc

In [114]:
df['Reviews_clean'][0]

'I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to begin yo

In [139]:
#merge string from the same column of a df serial doc

doc = df['Reviews_clean'][0:30].str.cat(sep=' ')
doc

'I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to begin yo

In [143]:
df[['Negative_clean', 'Positive_clean', 'Review_Date', 'Hotel_Name']].shape

(515738, 4)

In [None]:
df.rename(columns={negative_review: 'Negative_Review', positive_review: 'Positive_Review', Review_Date: 'Review_Date', Hotel_Name: 'Hotel_Name'}, inplace=True)

In [53]:
from keybert import KeyBERT

# doc = """
#          Supervised learning is the machine learning task of learning a function that
#          maps an input to an output based on example input-output pairs. It infers a
#          function from labeled training data consisting of a set of training examples.
#          In supervised learning, each example is a pair consisting of an input object
#          (typically a vector) and a desired output value (also called the supervisory signal).
#          A supervised learning algorithm analyzes the training data and produces an inferred function,
#          which can be used for mapping new examples. An optimal scenario will allow for the
#          algorithm to correctly determine the class labels for unseen instances. This requires
#          the learning algorithm to generalize from the training data to unseen situations in a
#          'reasonable' way (see inductive bias).
#       """
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

In [54]:
keywords

[('rooms', 0.5254),
 ('hotel', 0.503),
 ('booking', 0.4872),
 ('bedroom', 0.4834),
 ('room', 0.4767)]

In [55]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)


[('rooms', 0.5254),
 ('hotel', 0.503),
 ('booking', 0.4872),
 ('bedroom', 0.4834),
 ('room', 0.4767)]

In [56]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), top_n=100, stop_words=None)


[('rooms', 0.5254),
 ('hotel', 0.503),
 ('booking', 0.4872),
 ('bedroom', 0.4834),
 ('room', 0.4767),
 ('hotels', 0.4469),
 ('bedrooms', 0.4215),
 ('ceiling', 0.3684),
 ('booked', 0.3675),
 ('guests', 0.3643),
 ('accommodated', 0.3571),
 ('beds', 0.3371),
 ('hallway', 0.3316),
 ('ceilings', 0.3237),
 ('checkout', 0.3188),
 ('bed', 0.317),
 ('furniture', 0.3019),
 ('refebishment', 0.3006),
 ('holiday', 0.3004),
 ('hall', 0.2999),
 ('arriving', 0.2963),
 ('spacious', 0.2925),
 ('floor', 0.2887),
 ('facilities', 0.2857),
 ('building', 0.2841),
 ('guest', 0.2813),
 ('rent', 0.2806),
 ('mattress', 0.277),
 ('remodeling', 0.2757),
 ('evening', 0.2755),
 ('duplex', 0.2731),
 ('transaction', 0.273),
 ('delivery', 0.2728),
 ('doors', 0.2728),
 ('refurbishment', 0.2722),
 ('planing', 0.2712),
 ('door', 0.2706),
 ('floors', 0.2705),
 ('spend', 0.2703),
 ('walls', 0.2696),
 ('storeyed', 0.2691),
 ('troubles', 0.268),
 ('planning', 0.2664),
 ('access', 0.2654),
 ('couch', 0.2641),
 ('buildings', 0.

In [57]:
keywords = kw_model.extract_keywords(doc, highlight=True)