### 1. read data and basic data clean

In [5]:
#import data from a csv file and store in a dataframe
import pandas as pd
df = pd.read_csv('../data/Hotel_Reviews.csv')



df['Negative_Review'].isnull().sum(), df['Positive_Review'].isnull().sum()
# Clean the reviews from the dataset
#removing invalid ones, e.g. empty reviews, everything is good/bad and reviews with less than 10 characters
# output the df with two new columns: 'Negative_clean' and 'Positive_clean'

# process the Negative_Review column
# strip() to remove leading and trailing whitespaces for one column of df
import numpy as np

#change data type to string for the column NegativeReview
df['NegativeReview'] = df['Negative_Review'].astype(str)

#show data type for the column NegativeReview
print(type(df['NegativeReview'][0]))

df['Negative_clean'] = df['NegativeReview'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.len() < 10, 'Negative_clean'] = ' '

# if the text has 'nothing' or 'everything' or 'anything' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Negative_clean'].str.contains('nothing|everything|anything|No Negative', case=False), 'Negative_clean'] = ' '

# do the same process for Positive_Review column
df['Positive_clean'] = df['Positive_Review'].apply(lambda x: x.strip())

# if the text has less than 10 characters, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.len() < 10, 'Positive_clean'] = ' '

df['Positive_clean'].value_counts().head(30)

# if the text has 'nothing' or 'everything' or 'No Positive' str in it regardless lower case, replace the text with 'nothing' for the above column
df.loc[df['Positive_clean'].str.contains('nothing|everything|No Positive', case=False), 'Positive_clean'] = ' '
df['Positive_clean'].value_counts().head(30)


<class 'str'>


                              89417
Great location                 1419
The location                   1341
Good location                  1203
Friendly staff                  603
Excellent location              547
Location and staff              371
Location was great              294
Location was good               277
Good breakfast                  248
Location is great               236
Perfect location                227
good location                   225
Location location location      218
Location staff                  215
Location is good                214
Breakfast was good              198
the location                    195
Very friendly staff             194
Every thing                     187
Location was excellent          187
Excellent breakfast             182
Great breakfast                 178
Comfortable bed                 173
Cleanliness                     164
great location                  160
Location is perfect             154
Location is excellent       

In [6]:
# check ' ' value for the new columns
neg_invalid_count = df['Negative_clean'][df['Negative_clean']==" "].shape[0]
pos_invalid_count = df['Positive_clean'][df['Positive_clean']==" "].shape[0]
pos_invalid_count/len(df), neg_invalid_count/len(df)

(0.1733767920921088, 0.3747658694918738)

In [7]:
# merger two 'Negative_clean', 'Positive_clean' columns and save to a new column 'Reviews', and save to a new column 'Reviews_clean'
df['Reviews_clean'] = df['Negative_clean'] + df['Positive_clean']
df['Reviews_clean']

0         I am so angry that i made this post available ...
1          No real complaints the hotel was great great ...
2         Rooms are nice but for elderly a bit difficult...
3         My room was dirty and I was afraid to walk bar...
4         You When I booked with your company on line yo...
                                ...                        
515733    no trolly or staff to help you take the luggag...
515734    The hotel looks like 3 but surely not 4Breakfa...
515735    The ac was useless It was a hot week in vienna...
515736     The rooms are enormous and really comfortable...
515737    I was in 3rd floor It didn t work Free Wifesta...
Name: Reviews_clean, Length: 515738, dtype: object

In [8]:
df['Reviews_clean'].value_counts().head(10)



                       27827
 Great location          492
 The location            245
 Good location           217
 Excellent location      174
 Friendly staff          162
 Every thing             149
Small room               137
 Location and staff      122
 Perfect location         98
Name: Reviews_clean, dtype: int64

In [9]:
# if the value is ' ', drop the row
df_new = df[df['Reviews_clean'] != '  '][['Reviews_clean']]
df_new.shape

(487911, 1)

In [10]:
df_new.value_counts().head(30)

Reviews_clean                   
 Great location                     492
 The location                       245
 Good location                      217
 Excellent location                 174
 Friendly staff                     162
 Every thing                        149
Small room                          137
 Location and staff                 122
 Perfect location                    98
 Very friendly staff                 97
 Excellent breakfast                 86
Small rooms                          81
 Location location location          76
 Location staff                      75
 Good breakfast                      65
 Great breakfast                     65
 Friendly helpful staff              62
 Great staff                         60
 good location                       60
 Very friendly and helpful staff     59
 Excellent staff                     59
 great location                      57
Very small room                      57
 Very helpful staff                  56
 Excell

In [11]:
type(df_new)
# the data is cleaned and ready for further analysis

pandas.core.frame.DataFrame

In [12]:
# get the length of each string in column 'Reviews_clean'
df_new['length'] = df_new['Reviews_clean'].str.len()

df_new

Unnamed: 0,Reviews_clean,length
0,I am so angry that i made this post available ...,1909
1,No real complaints the hotel was great great ...,610
2,Rooms are nice but for elderly a bit difficult...,297
3,My room was dirty and I was afraid to walk bar...,1217
4,You When I booked with your company on line yo...,770
...,...,...
515733,no trolly or staff to help you take the luggag...,60
515734,The hotel looks like 3 but surely not 4Breakfa...,83
515735,The ac was useless It was a hot week in vienna...,77
515736,The rooms are enormous and really comfortable...,124


In [13]:
#get the mean of the length, but should check the word count instead, can do it 
#by using nltk library
df_new['length'].mean(), df_new['length'].median(), df_new['length'].max(), df_new['length'].min()

# with max num of char is 3800, rough 3800/5, 600, you need to check again after the preprocessing. 


(172.50744910444732, 117.0, 3801, 11)

In [44]:
# using a sample of 100 rows to test the model
test_df=df_new.head(1000)


### 2. Preprocessing

In [38]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag


def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
  
    # tag each word with its part of speech
    tagged_words = pos_tag(tokenized)
    # remove adj and adv
    filtered_words = [word for word, tag in tagged_words if tag not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']]

    words_only = [word for word in filtered_words if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword lists 
    #add more stop words
    stop_words.update(['hotel', 'booking', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'])
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    # print(f'{lowercased=}\n', f'{tokenized=}\n', f'{filtered_words=}\n', f'{without_stopwords=}\n', f'{words_only=}\n', f'{lemmatized=}\n', f'{cleaned=}\n')
    # print()
    return cleaned

clean(test_df['Reviews_clean'][0])

'made post via site use planing trip one make mistake booking place made via booking com stayed night hotel july upon arrival placed room floor hotel turned room booked reserved level room would window ceiling room mind broken window closed rain mini fridge contained sort bio weapon guessed smell asked change room explaining time booked btw cost got way volume ceiling offered room day check day clock order get room waned way begin holiday wait till order check room waist time room got wanted peaceful garden view window tired waiting room placed belonging rushed city evening turned noise room guess made vibrating tube something annoying hell stop making fall asleep wife audio recording attach want send via mail day came determine cause disturbing sound offered change room hotel booked room left one seems park outside hotel'

In [45]:

# Apply to all texts test set:
test_df['clean_text'] = test_df.Reviews_clean.apply(clean)
df_now = test_df.copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['clean_text'] = test_df.Reviews_clean.apply(clean)


In [None]:
# Apply to all texts full set:
df_new['clean_text'] = df_new.Reviews_clean.apply(clean)
df_new

### 3. Keyword Extraction using KeyBERT

In [46]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# function to get the keywords from the text using KeyBERT
def keywords_extract(text):
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    kw_model = KeyBERT(model=sentence_model)
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), top_n=100)
    return keywords

#convert df series to list

doc = df_now['clean_text'].tolist()
doc

keywords = keywords_extract(doc)
keywords

[[('booking', 0.4746),
  ('hotel', 0.4706),
  ('room', 0.417),
  ('booked', 0.3706),
  ('noise', 0.2893),
  ('audio', 0.2814),
  ('sound', 0.2808),
  ('reserved', 0.2806),
  ('place', 0.2734),
  ('mail', 0.267),
  ('recording', 0.2667),
  ('post', 0.2588),
  ('site', 0.2526),
  ('planing', 0.2523),
  ('ceiling', 0.2498),
  ('check', 0.2468),
  ('mistake', 0.2401),
  ('arrival', 0.2388),
  ('contained', 0.2304),
  ('floor', 0.2273),
  ('order', 0.2267),
  ('closed', 0.2251),
  ('evening', 0.225),
  ('btw', 0.2203),
  ('sort', 0.2191),
  ('change', 0.219),
  ('garden', 0.2174),
  ('disturbing', 0.215),
  ('park', 0.2143),
  ('mind', 0.2133),
  ('window', 0.2122),
  ('trip', 0.21),
  ('stayed', 0.2082),
  ('night', 0.2079),
  ('wanted', 0.206),
  ('fridge', 0.2035),
  ('got', 0.2016),
  ('placed', 0.2),
  ('holiday', 0.1979),
  ('asked', 0.1934),
  ('volume', 0.1861),
  ('vibrating', 0.185),
  ('outside', 0.1846),
  ('belonging', 0.1845),
  ('tired', 0.1807),
  ('wife', 0.1793),
  ('send'

In [51]:
stop_words = set(stopwords.words('english'))

len(stop_words)

191

In [None]:
# flatten column 'keywords' to get a list of keywords
list_list = test_df['keywords'].tolist()

In [None]:
# flatten the list of list
one_list = [item for sublist in list_list for item in sublist]
one_list


In [None]:
# convert the list to a DataFrame
keyword_df = pd.DataFrame(one_list, columns=['keyword', 'score'])
keyword_df

In [None]:
# get the top 10 keywords by groupby keyword and get the sum of the score
keyword_df.groupby('keyword').sum().sort_values(by='score', ascending=False).head(10)




### may not be useful

In [43]:
import spacy
def remove_adj_adv(sentence):
    

    #load small english model from spacy: https://spacy.io/models/en
    nlp = spacy.load('en_core_web_sm') #you can use other methods
    # excluded tags
    excluded_tags = {"ADJ", "ADV"} #"NOUN", "VERB", "ADJ", "ADV", "ADP", "PROPN"

    new_sentence = []
    for token in nlp(sentence):
        if token.pos_ not in excluded_tags:
            new_sentence.append(token.text)
    new_sentence_str = " ".join(new_sentence)
    return new_sentence_str

#print and compare the original and the new sentence
print("Before:", test_df['Reviews_clean'][0])
print("After:", remove_adj_adv(test_df['Reviews_clean'][0]))
print()

Before: I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to b

In [None]:
#load small english model from spacy: https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm') #you can use other methods
nlp(test_df['Reviews_clean'][0])

I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to begin you