In [51]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import math

In [52]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T901068\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T901068\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
# Load the dataset
file_path = 'dataset_expedia-hotels-com-reviews-scraper_2024.csv'
df = pd.read_csv(file_path)

In [54]:
df.head

<bound method NDFrame.head of           __typename brandType  contentDirectFeedbackPromptId  \
0     PropertyReview   Expedia                            NaN   
1     PropertyReview   Expedia                            NaN   
2     PropertyReview   Expedia                            NaN   
3     PropertyReview   Expedia                            NaN   
4     PropertyReview   Expedia                            NaN   
...              ...       ...                            ...   
2407  PropertyReview    Hotels                            NaN   
2408  PropertyReview   Expedia                            NaN   
2409  PropertyReview    Hotels                            NaN   
2410  PropertyReview    Hotels                            NaN   
2411  PropertyReview   Expedia                            NaN   

     customData/doWeEndorseIt      customData/hotel  hotelId  \
0                  no opinion  Prague Hotel Krystal   428588   
1                  no opinion  Prague Hotel Krystal   428588 

In [55]:
# Extract the 'text' column (assuming this is the review text)
reviews = df['text'].dropna().tolist()  # Drop any missing values

In [56]:
# Preprocess the text: tokenization, lowercasing, removing stopwords, filtering non-alphabetic words
stop_words = set(stopwords.words('english'))

In [57]:
def preprocess(doc):
    tokens = word_tokenize(doc.lower())  # Tokenize and lowercase
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and non-alphabetic words
    return tokens

In [58]:
# Apply preprocessing to each review
processed_reviews = [preprocess(review) for review in reviews]

In [59]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_reviews)

In [60]:
# Convert documents into the bag-of-words format
corpus = [dictionary.doc2bow(review) for review in processed_reviews]

In [61]:
# Set the number of topics
num_topics = 5

In [62]:
# Train the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)


In [63]:
# Create the DataFrame with Topic Distribution and Dominant Topic
topic_distributions = []
for bow in corpus:
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0)
    topic_distributions.append([weight for _, weight in topic_distribution])

In [64]:
# Create a DataFrame
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(lda_model.num_topics)])

In [65]:
# Add the processed text to the DataFrame
topic_df['Processed_Text'] = [' '.join(review) for review in processed_reviews]

In [66]:
# Reorder columns to have 'Processed_Text' at the beginning
topic_df = topic_df[['Processed_Text'] + [f'Topic_{i+1}' for i in range(lda_model.num_topics)]]

In [67]:
# Add a "Dominant_Topic" column with the name of the topic with the highest weight
topic_df['Dominant_Topic'] = topic_df.iloc[:, 1:].idxmax(axis=1)

In [68]:
# Display the DataFrame
print(topic_df.head())

                                      Processed_Text   Topic_1   Topic_2  \
0  property large liking food okay days great day...  0.004181  0.004097   
1  stuff friendly food excellent variety food poo...  0.010145  0.010001   
2  big organized transportation short staffed get...  0.020389  0.020008   
3                                               good  0.100527  0.100002   
4  really pushy comes selling resorts example day...  0.945881  0.013341   

    Topic_3   Topic_4   Topic_5 Dominant_Topic  
0  0.174878  0.812664  0.004180        Topic_4  
1  0.010102  0.959453  0.010299        Topic_4  
2  0.480722  0.020666  0.458216        Topic_3  
3  0.101449  0.596597  0.101424        Topic_4  
4  0.013569  0.013745  0.013464        Topic_1  


In [69]:
# Print the topics in a more human-readable format
for idx, topic in lda_model.print_topics(-1):
    print(f"\nTopic {idx + 1}:")
    print("="*30)
    words = topic.split(" + ")
    for word in words:
        weight, term = word.split("*")
        print(f"{term.strip()} ({float(weight):.4f})")


Topic 1:
"us" (0.0130)
"hotel" (0.0110)
"would" (0.0110)
"room" (0.0100)
"resort" (0.0100)
"property" (0.0080)
"back" (0.0080)
"great" (0.0080)
"one" (0.0070)
"staff" (0.0070)

Topic 2:
"de" (0.0350)
"la" (0.0310)
"el" (0.0280)
"muy" (0.0210)
"que" (0.0210)
"en" (0.0190)
"los" (0.0140)
"es" (0.0130)
"hotel" (0.0130)
"todo" (0.0120)

Topic 3:
"room" (0.0250)
"service" (0.0100)
"time" (0.0090)
"hotel" (0.0080)
"food" (0.0070)
"resort" (0.0070)
"back" (0.0070)
"experience" (0.0060)
"palace" (0.0060)
"would" (0.0060)

Topic 4:
"food" (0.0190)
"resort" (0.0180)
"staff" (0.0160)
"good" (0.0140)
"beach" (0.0120)
"great" (0.0120)
"pool" (0.0100)
"service" (0.0090)
"nice" (0.0090)
"get" (0.0090)

Topic 5:
"great" (0.0340)
"staff" (0.0290)
"service" (0.0210)
"excellent" (0.0190)
"amazing" (0.0180)
"food" (0.0170)
"place" (0.0160)
"property" (0.0130)
"everything" (0.0110)
"friendly" (0.0100)


In [70]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # For gensim 4.x and later

# Visualize the topics using pyLDAvis
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)
