### 1. read data and basic data clean

In [258]:
#import data from a csv file and store in a dataframe
import pandas as pd
df = pd.read_csv('../data/Hotel_Reviews.csv')



df['Negative_Review'].isnull().sum(), df['Positive_Review'].isnull().sum()
# Clean the reviews from the dataset
#removing invalid ones, e.g. empty reviews, everything is good/bad and reviews with less than 10 characters
# output the df with two new columns: 'Negative_clean' and 'Positive_clean'

# process the Negative_Review column
# strip() to remove leading and trailing whitespaces for one column of df
import numpy as np

#change data type to string for the column NegativeReview
df['NegativeReview'] = df['Negative_Review'].astype(str)

#show data type for the column NegativeReview
print(type(df['NegativeReview'][0]))

df['Negative_clean'] = df['NegativeReview'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.len() < 10, 'Negative_clean'] = ' '

# if the text has 'nothing' or 'everything' or 'anything' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.contains('nothing|everything|anything|No Negative', case=False), 'Negative_clean'] = ' '

# do the same process for Positive_Review column
df['Positive_clean'] = df['Positive_Review'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.len() < 10, 'Positive_clean'] = ' '

df['Positive_clean'].value_counts().head(30)

# if the text has 'nothing' or 'everything' or 'No Positive' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.contains('nothing|everything|No Positive', case=False), 'Positive_clean'] = ' '
df['Positive_clean'].value_counts().head(30)


<class 'str'>


                              89417
Great location                 1419
The location                   1341
Good location                  1203
Friendly staff                  603
Excellent location              547
Location and staff              371
Location was great              294
Location was good               277
Good breakfast                  248
Location is great               236
Perfect location                227
good location                   225
Location location location      218
Location staff                  215
Location is good                214
Breakfast was good              198
the location                    195
Very friendly staff             194
Every thing                     187
Location was excellent          187
Excellent breakfast             182
Great breakfast                 178
Comfortable bed                 173
Cleanliness                     164
great location                  160
Location is perfect             154
Location is excellent       

In [259]:
# check ' ' value for the new columns
neg_invalid_count = df['Negative_clean'][df['Negative_clean']==" "].shape[0]
pos_invalid_count = df['Positive_clean'][df['Positive_clean']==" "].shape[0]
pos_invalid_count/len(df), neg_invalid_count/len(df)

(0.1733767920921088, 0.3747658694918738)

In [260]:
# merger two 'Negative_clean', 'Positive_clean' columns and save to a new column 'Reviews', and save to a new column 'Reviews_clean'
df['Reviews_clean'] = df['Negative_clean'] + df['Positive_clean']
df['Reviews_clean']

0         I am so angry that i made this post available ...
1          No real complaints the hotel was great great ...
2         Rooms are nice but for elderly a bit difficult...
3         My room was dirty and I was afraid to walk bar...
4         You When I booked with your company on line yo...
                                ...                        
515733    no trolly or staff to help you take the luggag...
515734    The hotel looks like 3 but surely not 4Breakfa...
515735    The ac was useless It was a hot week in vienna...
515736     The rooms are enormous and really comfortable...
515737    I was in 3rd floor It didn t work Free Wifesta...
Name: Reviews_clean, Length: 515738, dtype: object

In [261]:
df['Reviews_clean'].value_counts().head(10)



                       27827
 Great location          492
 The location            245
 Good location           217
 Excellent location      174
 Friendly staff          162
 Every thing             149
Small room               137
 Location and staff      122
 Perfect location         98
Name: Reviews_clean, dtype: int64

In [262]:
# if the value is ' ', drop the row
df_new = df[df['Reviews_clean'] != '  '][['Reviews_clean']]
df_new.shape

(487911, 1)

In [263]:
df_new.value_counts().head(30)

Reviews_clean                   
 Great location                     492
 The location                       245
 Good location                      217
 Excellent location                 174
 Friendly staff                     162
 Every thing                        149
Small room                          137
 Location and staff                 122
 Perfect location                    98
 Very friendly staff                 97
 Excellent breakfast                 86
Small rooms                          81
 Location location location          76
 Location staff                      75
 Good breakfast                      65
 Great breakfast                     65
 Friendly helpful staff              62
 Great staff                         60
 good location                       60
 Very friendly and helpful staff     59
 Excellent staff                     59
 great location                      57
Very small room                      57
 Very helpful staff                  56
 Excell

In [264]:
type(df_new)
# the data is cleaned and ready for further analysis

pandas.core.frame.DataFrame

In [265]:
# get the length of each string in column 'Reviews_clean'
df_new['length'] = df_new['Reviews_clean'].str.len()

df_new

Unnamed: 0,Reviews_clean,length
0,I am so angry that i made this post available ...,1909
1,No real complaints the hotel was great great ...,610
2,Rooms are nice but for elderly a bit difficult...,297
3,My room was dirty and I was afraid to walk bar...,1217
4,You When I booked with your company on line yo...,770
...,...,...
515733,no trolly or staff to help you take the luggag...,60
515734,The hotel looks like 3 but surely not 4Breakfa...,83
515735,The ac was useless It was a hot week in vienna...,77
515736,The rooms are enormous and really comfortable...,124


In [276]:
#get the mean of the length, but should check the word count instead, can do it 
#by using nltk library
df_new['length'].mean(), df_new['length'].median(), df_new['length'].max(), df_new['length'].min()

# with max num of char is 3800, rough 3800/5, 600, you need to check again after the preprocessing. 


(172.50744910444732, 117.0, 3801, 11)

### 2. Preprocessing

In [None]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned

# Apply to all texts
data['clean_text'] = data.text.apply(clean)

data.head()

### 3. Keyword Extraction using KeyBERT

In [267]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# function to get the keywords from the text using KeyBERT
def keywords_extract(text):
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    kw_model = KeyBERT(model=sentence_model)
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), top_n=100)
    return keywords

doc = df_new['Reviews_clean'][1]


keywords = keywords_extract(doc)
keywords

[('amsterdam', 0.434),
 ('hotel', 0.4064),
 ('restaurant', 0.3946),
 ('checkout', 0.3222),
 ('complaints', 0.321),
 ('amenities', 0.288),
 ('rooms', 0.2667),
 ('quality', 0.2666),
 ('deposit', 0.2181),
 ('surroundings', 0.1729),
 ('location', 0.1708),
 ('payment', 0.1664),
 ('payments', 0.1579),
 ('fantastic', 0.1531),
 ('refund', 0.1504),
 ('food', 0.1462),
 ('service', 0.1397),
 ('recommendations', 0.1387),
 ('spot', 0.1308),
 ('great', 0.1274),
 ('excellent', 0.1268),
 ('regarding', 0.1215),
 ('toasted', 0.1183),
 ('site', 0.1168),
 ('vegetarian', 0.1122),
 ('check', 0.1104),
 ('return', 0.1061),
 ('vegan', 0.1057),
 ('lacking', 0.1031),
 ('real', 0.1007),
 ('bit', 0.1006),
 ('original', 0.0839),
 ('aside', 0.0826),
 ('sandwich', 0.0823),
 ('secondly', 0.0806),
 ('things', 0.0761),
 ('staff', 0.0701),
 ('confusing', 0.0661),
 ('offer', 0.0573),
 ('option', 0.0461),
 ('firstly', 0.0461),
 ('thought', 0.0436),
 ('new', 0.0351),
 ('minor', 0.0076),
 ('make', 0.0036),
 ('wrap', -0.0291)

In [268]:
test_df = df_new.head(2) 

In [269]:
# get the list of keywords for each the review in the dataframe, append to a big list

test_df['keywords'] = test_df.apply(lambda x: keywords_extract(x))


In [270]:
test_df

Unnamed: 0,Reviews_clean,length
0,I am so angry that i made this post available ...,1909
1,No real complaints the hotel was great great ...,610


In [271]:
# flatten column 'keywords' to get a list of keywords
list_list = test_df['keywords'].tolist()

In [272]:
# flatten the list of list
one_list = [item for sublist in list_list for item in sublist]
one_list


[('hotel', 0.6927),
 ('facilities', 0.5691),
 ('location', 0.4048),
 ('large', 0.1548),
 ('good', 0.0743),
 ('wifi', 0.442),
 ('pool', 0.3917),
 ('access', 0.2954),
 ('location', 0.2578),
 ('friendly', 0.194),
 ('free', 0.1774),
 ('staff', 0.1652),
 ('includedgreat', 0.1578),
 ('age', 0.1315),
 ('nice', 0.0984),
 ('day', 0.0936),
 ('smelt', 0.7067),
 ('room', 0.4993),
 ('musty', 0.4962),
 ('deposit', 0.537),
 ('booking', 0.4564),
 ('unexpected', 0.2866),
 ('confirmation', 0.2711),
 ('room', 0.2677),
 ('enter', 0.2337),
 ('morning', 0.2073),
 ('staff', 0.1846),
 ('attenpting', 0.1775),
 ('early', 0.1702),
 ('asked', 0.1363),
 ('wasnt', 0.1352),
 ('occasions', 0.1288),
 ('airport', 0.5491),
 ('airportvery', 0.5038),
 ('staff', 0.3864),
 ('pay', 0.3822),
 ('paid', 0.3412),
 ('received', 0.2052),
 ('close', 0.0838),
 ('having', 0.0809),
 ('room', 0.3911),
 ('key', 0.3538),
 ('customer', 0.2404),
 ('upgraded', 0.2375),
 ('dodgy', 0.2275),
 ('service', 0.1622),
 ('bit', 0.0979),
 ('immediate

In [273]:
# convert the list to a DataFrame
keyword_df = pd.DataFrame(one_list, columns=['keyword', 'score'])
keyword_df

Unnamed: 0,keyword,score
0,hotel,0.6927
1,facilities,0.5691
2,location,0.4048
3,large,0.1548
4,good,0.0743
...,...,...
9800,free,0.2717
9801,didn,0.1560
9802,kind,0.1462
9803,work,0.1458


In [274]:
# get the top 10 keywords by groupby keyword and get the sum of the score
keyword_df.groupby('keyword').sum().sort_values(by='score', ascending=False).head(10)




Unnamed: 0_level_0,score
keyword,Unnamed: 1_level_1
room,137.7615
hotel,96.8256
staff,95.506
location,76.2193
rooms,64.4714
breakfast,57.0175
small,38.1044
bed,37.4988
wifi,30.7716
friendly,27.3309
