In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the already-cleaned hotel reviews and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/Hotel_Reviews_Cleaned.csv')

df.head(n=5)

In [None]:
# import re
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# def preprocess_text(text):
#   text = re.sub(r'\d+', '', text)  # Remove numbers
#   text = text.lower()  # Convert to lowercase
#   text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
#   text = ' '.join([word for word in word_tokenize(text) if word.isalpha() and word not in stopwords.words('english')])
#   return text

# df['Cleaned_Positive_Reviews'] = df['Positive_Review'].apply(preprocess_text)
# df['Cleaned_Negative_Reviews'] = df['Negative_Review'].apply(preprocess_text)

In [None]:
# df.to_csv('data/Hotel_Reviews_Cleaned.csv', index=False)

In [None]:
# Replace missing cleaned positive reviews with '' so the later string joins
# and vectorizers only ever see str values.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained form warns in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['Cleaned_Positive_Reviews'] = df['Cleaned_Positive_Reviews'].fillna('')


In [None]:
# Replace missing cleaned negative reviews with '' so the later string joins
# and vectorizers only ever see str values.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained form warns in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['Cleaned_Negative_Reviews'] = df['Cleaned_Negative_Reviews'].fillna('')

In [None]:
# One row per hotel: all of its cleaned positive reviews joined with spaces.
positive_hotel_reviews = (
    df.groupby('Hotel_Name')['Cleaned_Positive_Reviews']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# One row per hotel: all of its cleaned negative reviews joined with spaces.
negative_hotel_reviews = (
    df.groupby('Hotel_Name')['Cleaned_Negative_Reviews']
    .agg(' '.join)
    .reset_index()
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_top_keywords(text, n=5):
  """Return up to `n` of the most frequent terms in `text`, most frequent first.

  The text is tokenized and counted with a default `CountVectorizer`.

  Args:
    text: A single document (string) to extract keywords from.
    n: Maximum number of keywords to return. Values <= 0 yield an empty list.

  Returns:
    A list of at most `n` terms ordered by descending count. An empty list is
    returned when `n` is not positive or when the text has no usable tokens.
  """
  # Guard first: with n == 0 the slice [-0:] below would select *every* term,
  # and a negative n would drop the least frequent terms instead of limiting.
  if n <= 0:
    return []

  vectorizer = CountVectorizer()
  try:
    X = vectorizer.fit_transform([text])
  except ValueError:
    # CountVectorizer raises ValueError when the vocabulary is empty (e.g. the
    # text is '' after cleaning); treat that as "no keywords" so one empty
    # hotel does not abort the whole .apply().
    return []

  terms = vectorizer.get_feature_names_out()
  # argsort is ascending, so the last n positions hold the highest counts.
  ascending_order = X.toarray().argsort()[0]
  keywords = [terms[i] for i in ascending_order[-n:]][::-1]
  return keywords

# Count-based top keywords for every hotel's joined positive / negative text.
positive_text = positive_hotel_reviews['Cleaned_Positive_Reviews']
positive_hotel_reviews['Top_Positive_Keywords'] = positive_text.map(extract_top_keywords)

negative_text = negative_hotel_reviews['Cleaned_Negative_Reviews']
negative_hotel_reviews['Top_Negative_Keywords'] = negative_text.map(extract_top_keywords)

In [None]:
# Interactive explorer: show a few hotels, let the user pick one and one of
# its top keywords, then list the original reviews whose cleaned text
# contains that keyword.

# Sample from the distinct hotel names so the list has no duplicates, and
# clamp the sample size so a dataset with fewer than 10 hotels does not raise.
unique_hotels = df['Hotel_Name'].drop_duplicates()
random_hotels = unique_hotels.sample(n=min(10, len(unique_hotels))).values

print("List of Hotels:\n")
print(random_hotels)
print("*"*50)

# Strip stray whitespace so a copy-pasted name still matches exactly.
selected_hotel = input("\nEnter the name of the hotel: ").strip()
print(f"\nYou selected {selected_hotel}")
print("*"*50)

if selected_hotel in positive_hotel_reviews['Hotel_Name'].values:
    top_positive_keywords = positive_hotel_reviews[positive_hotel_reviews['Hotel_Name'] == selected_hotel]['Top_Positive_Keywords'].values[0]
    top_negative_keywords = negative_hotel_reviews[negative_hotel_reviews['Hotel_Name'] == selected_hotel]['Top_Negative_Keywords'].values[0]

    # De-duplicate, then sort so the displayed order is stable between runs
    # (iteration order of a set of strings is not).
    keywords = sorted(set(top_positive_keywords + top_negative_keywords))

    print(f"\nTop keywords for the hotel '{selected_hotel}':\n")
    print(keywords)

    selected_keyword = input("\nEnter a keyword: ").strip()
    print(f"\nYou selected {selected_keyword}")
    print("*"*50)

    # All rows of the selected hotel, selected by boolean mask so the lookup
    # does not depend on the index being a default RangeIndex.
    hotel_rows = df.loc[df['Hotel_Name'] == selected_hotel]
    selected_hotel_indexes_list = hotel_rows.index.to_list()

    # Original (uncleaned) reviews that mention the selected keyword
    similar_positive_reviews = []
    similar_negative_reviews = []

    if selected_keyword in keywords:
        # Plain substring test (regex=False), i.e. the same semantics as
        # `keyword in text`; missing text counts as "no match".
        # NOTE(review): this also matches inside longer words
        # (e.g. 'room' in 'bathroom') - confirm that is intended.
        in_positive = hotel_rows['Cleaned_Positive_Reviews'].str.contains(selected_keyword, regex=False, na=False)
        in_negative = hotel_rows['Cleaned_Negative_Reviews'].str.contains(selected_keyword, regex=False, na=False)

        similar_positive_reviews = hotel_rows.loc[in_positive, 'Positive_Review'].tolist()
        similar_negative_reviews = hotel_rows.loc[in_negative, 'Negative_Review'].tolist()

        print(f"\n {len(similar_positive_reviews)}/{len(selected_hotel_indexes_list)} positive reviews contain the keyword '{selected_keyword}' for the hotel '{selected_hotel}':\n")
        for review in similar_positive_reviews:
            print("-", review)

        print(f"\n {len(similar_negative_reviews)}/{len(selected_hotel_indexes_list)} negative reviews contain the keyword '{selected_keyword}' for the hotel '{selected_hotel}':\n")
        for review in similar_negative_reviews:
            print("-", review)
    else:
        print("Invalid keyword.")
else:
    print("Invalid hotel name.")

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a separate positive and negative TF-IDF model for every hotel and keep
# both the fitted vectorizers and the resulting matrices, keyed by hotel name.
def _fit_tfidf(reviews):
  """Fit a fresh TfidfVectorizer on `reviews`; return (vectorizer, matrix)."""
  vectorizer = TfidfVectorizer()
  return vectorizer, vectorizer.fit_transform(reviews)

hotel_tfidf_matrices = {}
for hotel_name, hotel_rows in df.groupby('Hotel_Name'):
  pos_vectorizer, pos_matrix = _fit_tfidf(hotel_rows['Cleaned_Positive_Reviews'])
  neg_vectorizer, neg_matrix = _fit_tfidf(hotel_rows['Cleaned_Negative_Reviews'])
  hotel_tfidf_matrices[hotel_name] = {
    'positive_vectorizer': pos_vectorizer,
    'negative_vectorizer': neg_vectorizer,
    'tfidf_matrix_positive': pos_matrix,
    'tfidf_matrix_negative': neg_matrix,
  }

def extract_top_keywords_for_hotel(document, hotel_name, top_n=5, positive=True):
  """Return the highest-scoring TF-IDF terms of `document` for one hotel.

  The document is scored with the vectorizer stored for `hotel_name` in
  `hotel_tfidf_matrices`.

  Args:
    document: Text to score.
    hotel_name: Key into `hotel_tfidf_matrices`; a missing key raises KeyError.
    top_n: Maximum number of keywords to return.
    positive: Use the hotel's positive vectorizer when True, otherwise the
      negative one.

  Returns:
    A list of up to `top_n` terms ordered by descending TF-IDF score.
  """
  hotel_models = hotel_tfidf_matrices[hotel_name]
  vectorizer = hotel_models['positive_vectorizer'] if positive else hotel_models['negative_vectorizer']

  scores = vectorizer.transform([document])
  vocabulary = vectorizer.get_feature_names_out()

  # `scores` is sparse: .data holds the non-zero scores and .indices the
  # matching vocabulary positions, so rank only those (ascending -> reversed).
  ranked_positions = scores.indices[np.argsort(scores.data)][::-1]
  return [vocabulary[position] for position in ranked_positions[:top_n]]

In [None]:
# TF-IDF keywords per hotel: score each hotel's joined text with that hotel's
# own fitted vectorizer (positive and negative handled separately).
positive_hotel_reviews['Top_Positive_Keywords_TFIDF'] = pd.Series(
    [
        extract_top_keywords_for_hotel(text, hotel)
        for text, hotel in zip(positive_hotel_reviews['Cleaned_Positive_Reviews'], positive_hotel_reviews['Hotel_Name'])
    ],
    index=positive_hotel_reviews.index,
    dtype=object,
)
negative_hotel_reviews['Top_Negative_Keywords_TFIDF'] = pd.Series(
    [
        extract_top_keywords_for_hotel(text, hotel, positive=False)
        for text, hotel in zip(negative_hotel_reviews['Cleaned_Negative_Reviews'], negative_hotel_reviews['Hotel_Name'])
    ],
    index=negative_hotel_reviews.index,
    dtype=object,
)

### KeyBERT

In [None]:
from keybert import KeyBERT

In [None]:
# Shared KeyBERT extractor, constructed once with the 'all-mpnet-base-v2' model.
kw_model = KeyBERT(model='all-mpnet-base-v2')

In [None]:
def extract_top_keywords_keybert(document, top_n=5):
  """Return the keywords KeyBERT extracts from `document`, without scores.

  Uses the module-level `kw_model`, restricted to single-word keyphrases and
  English stop-word filtering.

  Args:
    document: Text to extract keywords from.
    top_n: Number of keywords requested from the model.

  Returns:
    A list with the first element of each item returned by the model, in the
    order the model returned them.
  """
  scored_keywords = kw_model.extract_keywords(
    document,
    keyphrase_ngram_range=(1, 1),
    stop_words='english',
    top_n=top_n,
  )
  # Each item is indexed as (keyword, ...); keep only the keyword itself.
  return [item[0] for item in scored_keywords]

In [None]:
# KeyBERT keywords for every hotel's joined positive / negative text.
positive_text = positive_hotel_reviews['Cleaned_Positive_Reviews']
positive_hotel_reviews['Top_Positive_Keywords_KeyBERT'] = positive_text.map(extract_top_keywords_keybert)

negative_text = negative_hotel_reviews['Cleaned_Negative_Reviews']
negative_hotel_reviews['Top_Negative_Keywords_KeyBERT'] = negative_text.map(extract_top_keywords_keybert)

In [None]:
# Export the first 15 hotels' negative-review keywords to Excel (no index column).
first_fifteen_hotels = negative_hotel_reviews.iloc[:15]
first_fifteen_hotels.to_excel('data/negative_hotel_reviews.xlsx', index=False)