In [1]:
import pandas as pd
import gensim
import nltk
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel, Phrases
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import re

In [2]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_csv('dataset_expedia-hotels-com-reviews-scraper_2024.csv')

In [4]:
data.head()

Unnamed: 0,__typename,brandType,contentDirectFeedbackPromptId,customData/doWeEndorseIt,customData/hotel,hotelId,id,impressionAnalytics,impressionAnalytics/__typename,impressionAnalytics/event,...,translationInfo/loadingTranslationText,translationInfo/seeOriginalText,translationInfo/targetLocale,translationInfo/translatedBy/__typename,translationInfo/translatedBy/description,translationInfo/translationCallToActionLabel,travelers/0,travelers/1,travelers/2,travelers/3
0,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b63b958397934320ad7f75,,,,...,,,,,,,Traveled with family and small children,Traveled with family,,
1,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b503566a7cfe1353bdb91e,,,,...,,,,,,,Traveled with family,,,
2,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b4d8d6e6073643fd1b9cdc,,,,...,,,,,,,Traveled with family and small children,,,
3,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b3c0f33677b72a4489b05f,,,,...,,,,,,,Traveled with family and small children,,,
4,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b36f292fa7b179c05b6637,,ClientSideImpressionEventAnalytics,Scroll reviews,...,,,,,,,,,,


In [5]:
# Initialize stopwords and lemmatizer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# 1. Preprocessing Function (Simplified with Bigram within the function)
def preprocess(text):
    if not isinstance(text, str):
        text = ''
    
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Convert to lowercase and tokenize
    tokens = simple_preprocess(text.lower(), deacc=True)
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Create bigrams
    bigram = Phrases([tokens], min_count=5, threshold=100)
    tokens = bigram[tokens]
    
    return tokens

In [6]:
# Apply preprocessing
data['processed_text'] = data['text'].apply(preprocess)

In [7]:
print(data['text'][0])

The property was too large for my liking. The food was okay some days and not great at all some days. Definitely the food needs a upgrade especially the Italian food. The pool area was beautiful, but the pools had mold. Pools need to be cleaned better. and the Beach lets talk about the beach. Its horrible it has tons of sea weed. If you are going to this hotel for the beach forget it. reroute yourself else were. I would give this hotel a 3 out of 5. Maybe next time I would prefer the Grand. Its newer.


In [8]:
print(data['processed_text'][0])

['property', 'large', 'liking', 'food', 'okay', 'day', 'great', 'day', 'definitely', 'food', 'need', 'upgrade', 'especially', 'italian', 'food', 'pool', 'area', 'beautiful', 'pool', 'mold', 'pool', 'need', 'cleaned', 'better', 'beach', 'let', 'talk', 'beach', 'horrible', 'ton', 'sea', 'weed', 'going', 'hotel', 'beach', 'forget', 'reroute', 'else', 'would', 'give', 'hotel', 'maybe', 'next', 'time', 'would', 'prefer', 'grand', 'newer']


In [9]:
# Create Dictionary and Corpus
dictionary = corpora.Dictionary(data['processed_text'])
# Filter out extremes
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

In [10]:
# Build LDA Model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=10, 
                     random_state=100,
                     update_every=1,
                     chunksize=500,
                     passes=100,
                     alpha='auto',
                     eta='auto',
                     per_word_topics=True)

In [11]:
# 4. Print the 25 most significant terms per topic
for i in range(10):
    print(f'Topic {i+1}:')
    print([term for term, _ in lda_model.show_topic(i, topn=25)])
    print('\n')

Topic 1:
['great', 'resort', 'palace', 'food', 'place', 'staff', 'moon', 'amazing', 'stay', 'family', 'back', 'kid', 'everything', 'friendly', 'option', 'sunrise', 'experience', 'pool', 'best', 'clean', 'activity', 'would', 'stayed', 'definitely', 'recommend']


Topic 2:
['room', 'staff', 'like', 'would', 'made', 'horrible', 'make', 'bar', 'getting', 'asked', 'three', 'almost', 'came', 'even', 'feel', 'working', 'member', 'care', 'accommodating', 'nothing', 'second', 'review', 'tip', 'else', 'cleaned']


Topic 3:
['beach', 'good', 'water', 'nice', 'food', 'pool', 'ocean', 'really', 'dirty', 'clean', 'hotel', 'like', 'ok', 'beautiful', 'blue', 'group', 'sand', 'sea', 'see', 'went', 'use', 'disappointing', 'amenity', 'color', 'selection']


Topic 4:
['staff', 'helpful', 'friendly', 'perfect', 'kind', 'airport', 'super', 'last', 'happy', 'transportation', 'nice', 'loved', 'coming', 'hr', 'door', 'back', 'friend', 'got', 'amenity', 'ground', 'request', 'soon', 'thanks', 'everything', 'aski

In [12]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics using pyLDAvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Display the pyLDAvis visualization
pyLDAvis.display(vis)

In [13]:
# Get the topic distribution for each document
topic_distributions = []
for bow in corpus:
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0)
    topic_distributions.append([weight for _, weight in topic_distribution])

# Create a DataFrame
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(lda_model.num_topics)])

# Add the processed text to the DataFrame
topic_df.insert(0, 'Processed_Text', data['processed_text'])

# Add a "Dominant_Topic" column with the name of the topic with the highest weight
topic_df['Dominant_Topic'] = topic_df.iloc[:, 1:].idxmax(axis=1)

topic_df


Unnamed: 0,Processed_Text,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Dominant_Topic
0,"[property, large, liking, food, okay, day, gre...",0.110946,0.106699,0.214798,0.002954,0.003155,0.051218,0.431310,0.005594,0.001979,0.071345,Topic_7
1,"[stuff, friendly, food, excellent, variety, fo...",0.079735,0.006889,0.343126,0.005857,0.006221,0.006482,0.470554,0.070269,0.003904,0.006962,Topic_7
2,"[big, organized, transportation, short, staffe...",0.109832,0.017049,0.018680,0.223693,0.015252,0.015849,0.546587,0.026432,0.009571,0.017054,Topic_7
3,[good],0.176678,0.044700,0.290976,0.037792,0.040364,0.041943,0.229772,0.067405,0.025329,0.045039,Topic_3
4,"[really, pushy, come, selling, resort, example...",0.130394,0.009945,0.011191,0.008275,0.008838,0.009187,0.229511,0.014968,0.005546,0.572143,Topic_10
...,...,...,...,...,...,...,...,...,...,...,...,...
2407,"[el, complejo, de, lujo, la, atencion, del, pe...",0.025106,0.006352,0.007025,0.005370,0.824656,0.083938,0.027976,0.009578,0.003599,0.006400,Topic_5
2408,"[tres, bien, entretenu, personnel, tres, ecout...",0.138639,0.035076,0.038793,0.029655,0.031674,0.248258,0.154489,0.052893,0.019876,0.250647,Topic_10
2409,"[se, hicieron, validos, los, usd, que, habian,...",0.036491,0.009427,0.010211,0.064459,0.631586,0.178708,0.040663,0.013922,0.005232,0.009302,Topic_5
2410,[],0.243504,0.061608,0.068136,0.052086,0.055631,0.057807,0.271343,0.092900,0.034910,0.062075,Topic_7


In [14]:
topic_df['Dominant_Topic'].value_counts()

Dominant_Topic
Topic_7     1415
Topic_1      450
Topic_5      199
Topic_8      100
Topic_3       88
Topic_10      50
Topic_4       47
Topic_2       34
Topic_6       17
Topic_9       12
Name: count, dtype: int64

In [15]:
# Function to print sample documents for a given topic
def print_sample_documents(topic_name, n_samples=5):
    # Filter the DataFrame based on the topic
    filtered_df = topic_df[topic_df['Dominant_Topic'] == topic_name]
    
    # Sample n_samples documents from the filtered DataFrame
    sample_documents = filtered_df['Processed_Text'].sample(n_samples, random_state=1)
    
    # Print the sampled documents
    for idx, text in enumerate(sample_documents, 1):
        print(f"Document {idx}: {text}\n")

In [17]:
print_sample_documents('Topic_5')

Document 1: ['todo', 'en', 'general', 'gusto', 'la', 'atencion', 'buenisima', 'te', 'hacen', 'sentir', 'como', 'en', 'casa']

Document 2: ['todo', 'fue', 'excelente']

Document 3: ['hotel', 'bonito', 'lo', 'mejor', 'la', 'comida', 'tristemente', 'la', 'amabilidad', 'el', 'trato', 'ha', 'decaido', 'mucho', 'toco', 'una', 'habitacion', 'con', 'la', 'coladera', 'de', 'la', 'regadera', 'tapada', 'por', 'lo', 'que', 'podiamos', 'banarnos', 'sin', 'que', 'se', 'rebosara', 'la', 'regadera', 'muy', 'incomodo', 'tener', 'que', 'llamar', 'para', 'destaparlo', 'el', 'chico', 'saco', 'la', 'bola', 'de', 'pelos', 'la', 'avento', 'la', 'taza', 'dejando', 'ahi', 'la', 'suciedad', 'de', 'huespedes', 'anteriores', 'todo', 'salpicado', 'enlodado', 'nuevamente', 'hubo', 'que', 'llamar', 'solicitar', 'limpieza', 'desinfeccion', 'se', 'tuvo', 'acceso', 'al', 'bano', 'en', 'ma', 'de', 'horas', 'en', 'lo', 'que', 'se', 'atendia', 'la', 'situacion', 'ellos', 'mismos', 'te', 'canalizan', 'al', 'departamento', 