### 1. Read the training data and apply the basic cleaning function


In [None]:
# Load the raw hotel reviews from the CSV file and keep only the 4 columns we need
import pandas as pd
df = pd.read_csv('../data/Hotel_Reviews.csv')

# .copy() detaches the selection from `df`, so later column assignments on `review_df`
# work on an independent frame instead of a slice of `df`.
review_df = df[['Hotel_Name', 'Review_Date', 'Positive_Review', 'Negative_Review']].copy()
# Preview the first rows only instead of rendering the whole frame
review_df.head()


In [None]:
# Pipe-separated strings that mark a review text as invalid in the TEST data.
# NOTE(review): presumably intended as the invalid-content arguments of the cleaning function; none of the cells shown here reads these two variables - confirm.
# Positive_Review: reviews identified by any of these strings are invalid
test_pos_invalid_content = 'there are no comments available for this review|everything'
# Negative_Review: reviews identified by any of these strings are invalid
test_neg_invalid_content = 'nothing|n/a|none'

In [None]:

# Pipe-separated strings that mark a review text as invalid in the TRAIN data (passed to brc.clean_hotel_reviews further down).
# Positive_Review: invalid reviews are identified by: 'nothing', 'everything', 'no positive'
train_pos_invalid_content = 'nothing|everything|no positive'
# Negative_Review: invalid reviews are identified by: 'nothing', 'everything', 'anything', 'no negative'
train_neg_invalid_content = 'nothing|everything|anything|no negative'


In [None]:

import sys
from pathlib import Path

# Add the project root to the Python path so the local `app` package can be imported.
# The root is derived from the working directory instead of a user-specific absolute
# path: the data is read from '../data', i.e. the notebook runs one level below the root.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(str(PROJECT_ROOT))

# Import the module that contains the cleaning function
import app.basic_review_clean as brc



In [None]:
# Keep only the rows that belong to a single hotel
review_df_now = review_df.loc[review_df['Hotel_Name'].eq('Hotel Arena')]
review_df_now

In [None]:
## test the function ###
# Run the basic cleaning on the single-hotel sample, using the train invalid-content strings
cleaned_review_df = brc.clean_hotel_reviews(review_df_now, train_pos_invalid_content, train_neg_invalid_content)
# A bare last expression gets the notebook's rich table rendering; print() only gives plain text
cleaned_review_df.head()

In [None]:
# Summary of the cleaned sample (pandas DataFrame.info)
cleaned_review_df.info()

### 1.2 Read the test data (already cleaned, so the basic cleaning function is not called again)


In [None]:
import pandas as pd

# this is cleaned data, so no need to basic clean again
# The path is relative to the notebook's working directory, like the CSV path of the
# train data, so the notebook does not depend on one user's home directory.
test_data_path = '../data/cleaned_test_data_5.pkl' # test data pkl file path
# Read the DataFrame from the pickle file.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
review_df = pd.read_pickle(test_data_path)
review_df['Hotel_Name'][2]

In [None]:
# get the reviews for one hotel
hotel_name = 'Ibis Budget Singapore Pearl'
# .copy() turns the filtered rows into an independent frame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
cleaned_review_df = review_df[review_df['Hotel_Name'] == hotel_name].copy()
cleaned_review_df

In [147]:
# NOTE(review): only the last expression of a cell is rendered, so this line displays nothing
cleaned_review_df['Positive_Review']
# Lookup by index label 12; the recorded output lists several hotels for that one label, i.e. the index of review_df is not unique
review_df['Hotel_Name'][12]

12    Holiday Inn Express Singapore Katong, an IHG H...
12                          Ibis Budget Singapore Pearl
12                         ibis Styles Singapore Albert
12    Village Hotel Albert Court by Far East Hospita...
Name: Hotel_Name, dtype: object

### 2. Preprocessing

In [None]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag


def preprocessing(text):
    """Normalise one review text into a space-separated string of lemmas.

    Steps, in order: replace punctuation with spaces, lower-case, tokenize,
    drop adjectives and adverbs (by POS tag), drop non-alphabetic tokens,
    drop stop words (English list plus a few domain/number words), lemmatize.

    Args:
        text: The review text as a string.

    Returns:
        The remaining lemmas joined by single spaces ('' when nothing is left).
    """
    # Every punctuation character becomes a space, then the text is lower-cased
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    normalized = text.translate(punctuation_to_space).lower()
    tokens = word_tokenize(normalized)

    # POS tags of adjectives (JJ*) and adverbs (RB*), which are removed
    dropped_tags = ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
    kept_tokens = [token for token, pos in pos_tag(tokens) if pos not in dropped_tags]

    # English stop words plus extra words that are added on top of them
    extra_stop_words = ['hotel', 'booking', 'one', 'two', 'three', 'four', 'five', 'six',
                        'seven', 'eight', 'nine', 'ten']
    ignored_words = set(stopwords.words('english')) | set(extra_stop_words)

    lemmatizer = WordNetLemmatizer()
    # isalpha() filters out numbers and any other non-alphabetic token
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in kept_tokens
        if token.isalpha() and token not in ignored_words
    )

# Try the function on one review. NOTE(review): with an integer index, [1] looks up index label 1, not the second row - confirm that this label exists for the selected hotel
preprocessing(cleaned_review_df['Positive_Review'][1])

In [None]:
# defining a function to preprocess the positive and negative reviews and merge them into one column
# drop those rows with empty processed_text
def preprocess_all(cleaned_review_df):
    """Preprocess both review columns and merge them into one text per review.

    Args:
        cleaned_review_df: DataFrame with ``Positive_Review`` and ``Negative_Review``
            columns; a missing value (NaN/None) means that side has no usable text.

    Returns:
        A new DataFrame with the single column ``processed_text``. It keeps the
        index labels of the input and contains only the reviews whose merged text
        is non-empty. The input frame is not modified.
    """
    def _processed(column):
        # A missing review contributes an empty string, so the other side of the same
        # review is kept instead of the whole row turning into NaN and being dropped.
        return pd.Series(
            ['' if pd.isna(review) else preprocessing(review) for review in column],
            index=column.index,
            dtype=object,
        )

    pos_text = _processed(cleaned_review_df['Positive_Review'])
    neg_text = _processed(cleaned_review_df['Negative_Review'])

    # Join with a space: plain concatenation fuses the last positive word with the
    # first negative word ("staff" + "noise" -> "staffnoise") and corrupts both tokens.
    merged = (pos_text + ' ' + neg_text).str.strip()

    # drop those rows with empty processed_text
    df_now = merged[merged != ''].to_frame(name='processed_text')
    return df_now

# Build the one-column frame of merged, preprocessed review texts for the selected hotel
doc_df = preprocess_all(cleaned_review_df)
doc_df

In [None]:
# convert the df series to a list of documents
doc_list = doc_df['processed_text'].tolist()
# Every document must be a plain string. The check runs over the whole list rather than
# on index label 1, which is not guaranteed to exist once rows have been dropped.
assert all(isinstance(doc, str) for doc in doc_list)
# Preview the first documents only - the full list can be very long
doc_list[:5]

### 3. Keyword Extraction using KeyBERT

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# function to get the keywords from the text using KeyBERT
def keywords_extract(text, top_n=5, kw_model=None):
    """Extract single-word keywords from ``text`` with KeyBERT.

    Args:
        text: One document (str) or a list of documents.
        top_n: Number of keywords requested per document.
        kw_model: Optional object exposing ``extract_keywords`` (normally a
            ``KeyBERT`` instance). When omitted, a KeyBERT model backed by the
            "all-MiniLM-L6-v2" sentence transformer is built on the first call
            and reused afterwards instead of being reloaded on every call.

    Returns:
        The result of ``kw_model.extract_keywords``. Per the KeyBERT documentation
        this is a list of ``(keyword, score)`` tuples for a single document and a
        list of such lists when a list of documents is passed.
    """
    if kw_model is None:
        # Loading the sentence transformer is expensive, so the default model is
        # cached on the function object after the first call.
        kw_model = getattr(keywords_extract, "_default_model", None)
        if kw_model is None:
            sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
            kw_model = KeyBERT(model=sentence_model)
            keywords_extract._default_model = kw_model
    # keyphrase_ngram_range=(1, 1) restricts the keywords to single words
    return kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), top_n=top_n)

# Request the top 100 single-word keywords for the documents in doc_list
keywords = keywords_extract(doc_list, 100)
keywords

In [145]:
# Collapse the per-document keyword lists into one flat list of (keyword, score) pairs
one_list = []
for sublist in keywords:
    one_list.extend(sublist)

# One row per extracted keyword occurrence
keyword_df = pd.DataFrame(one_list, columns=['keyword', 'score'])

# Sum the scores per keyword and keep the 5 keywords with the highest totals
top_keyword = (
    keyword_df
    .groupby('keyword')
    .sum()
    .sort_values(by='score', ascending=False)
    .head(5)
)

# The keyword labels only, without their scores
top_keyword.index.tolist()



['room', 'location', 'staff', 'bus', 'bed']

In [None]:
# Top 10 keywords: sum the scores per keyword, sort by the total (descending) and keep the first 10 rows
keyword_df.groupby('keyword').sum().sort_values(by='score', ascending=False).head(10)




### Appendix: removing adjectives and adverbs with spaCy (experimental, may not be useful)

In [None]:
import spacy
def remove_adj_adv(sentence, nlp=None):
    """Return ``sentence`` without the tokens tagged as adjective or adverb.

    Args:
        sentence: The text to filter.
        nlp: Optional callable that turns a string into an iterable of tokens
            with ``text`` and ``pos_`` attributes (normally a loaded spaCy
            pipeline). When omitted, the small English model 'en_core_web_sm'
            (https://spacy.io/models/en) is loaded on the first call and reused
            afterwards instead of being reloaded on every call.

    Returns:
        The texts of the remaining tokens joined by single spaces.
    """
    if nlp is None:
        # Loading a spaCy model is expensive, so the default pipeline is cached
        # on the function object after the first call.
        nlp = getattr(remove_adj_adv, "_default_nlp", None)
        if nlp is None:
            nlp = spacy.load('en_core_web_sm') #you can use other methods
            remove_adj_adv._default_nlp = nlp

    # excluded tags
    excluded_tags = {"ADJ", "ADV"} #"NOUN", "VERB", "ADJ", "ADV", "ADP", "PROPN"

    return " ".join(token.text for token in nlp(sentence) if token.pos_ not in excluded_tags)

# Print the original sentence next to its filtered version to compare them.
# NOTE(review): `test_df` is not defined in any cell shown in this notebook, so on a fresh kernel this cell presumably raises NameError - confirm where it comes from.
print("Before:", test_df['Reviews_clean'][0])
print("After:", remove_adj_adv(test_df['Reviews_clean'][0]))
print()

In [None]:
#load small english model from spacy: https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm') #you can use other methods
# Run the loaded pipeline on one review text. NOTE(review): `test_df` is not defined in the cells shown here - confirm where it comes from
nlp(test_df['Reviews_clean'][0])