In [1]:
import pandas as pd
import gensim
import nltk
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel, Phrases
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import re

In [2]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/Ollie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/Ollie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Ollie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd
data = pd.read_csv('dataset_expedia-hotels-com-reviews-scraper_2024.csv')

In [4]:
data.head()

Unnamed: 0,__typename,brandType,contentDirectFeedbackPromptId,customData/doWeEndorseIt,customData/hotel,hotelId,id,impressionAnalytics,impressionAnalytics/__typename,impressionAnalytics/event,...,translationInfo/loadingTranslationText,translationInfo/seeOriginalText,translationInfo/targetLocale,translationInfo/translatedBy/__typename,translationInfo/translatedBy/description,translationInfo/translationCallToActionLabel,travelers/0,travelers/1,travelers/2,travelers/3
0,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b63b958397934320ad7f75,,,,...,,,,,,,Traveled with family and small children,Traveled with family,,
1,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b503566a7cfe1353bdb91e,,,,...,,,,,,,Traveled with family,,,
2,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b4d8d6e6073643fd1b9cdc,,,,...,,,,,,,Traveled with family and small children,,,
3,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b3c0f33677b72a4489b05f,,,,...,,,,,,,Traveled with family and small children,,,
4,PropertyReview,Expedia,,no opinion,Prague Hotel Krystal,428588,66b36f292fa7b179c05b6637,,ClientSideImpressionEventAnalytics,Scroll reviews,...,,,,,,,,,,


In [5]:
data.shape

(2412, 120)

In [6]:
# Initialize stopwords and lemmatizer
stop_words = stopwords.words('english','spanish')
lemmatizer = WordNetLemmatizer()

# 1. Preprocessing Function (Simplified with Bigram within the function)
def preprocess(text):
    if not isinstance(text, str):
        text = ''
    
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Convert to lowercase and tokenize
    tokens = simple_preprocess(text.lower(), deacc=True)
    
    # # Add custom stopwords
    # custom_stopwords = {'la', 'de','el','muy','que','en','los','e','todo','lo','para','un','u','del','con','no','se','por','al'}  # Replace with your words
    # stop_words.update(custom_stopwords) 
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 4]
    
    # Create bigrams
    bigram = Phrases([tokens], min_count=5, threshold=100)
    tokens = bigram[tokens]
    
    return tokens

In [7]:
# Apply preprocessing
data['processed_text'] = data['text'].apply(preprocess)

In [8]:
print(data['text'][0])

The property was too large for my liking. The food was okay some days and not great at all some days. Definitely the food needs a upgrade especially the Italian food. The pool area was beautiful, but the pools had mold. Pools need to be cleaned better. and the Beach lets talk about the beach. Its horrible it has tons of sea weed. If you are going to this hotel for the beach forget it. reroute yourself else were. I would give this hotel a 3 out of 5. Maybe next time I would prefer the Grand. Its newer.


In [9]:
print(data['processed_text'][0])

['property', 'large', 'liking', 'great', 'definitely', 'need', 'upgrade', 'especially', 'italian', 'beautiful', 'pool', 'pool', 'cleaned', 'better', 'beach', 'beach', 'horrible', 'going', 'hotel', 'beach', 'forget', 'reroute', 'would', 'hotel', 'maybe', 'would', 'prefer', 'grand', 'newer']


In [10]:
# Create Dictionary and Corpus
dictionary = corpora.Dictionary(data['processed_text'])
# Filter out extremes
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

In [11]:
# Build LDA Model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=4, 
                     random_state=100,
                     update_every=1,
                     chunksize=4000,
                     passes=10,
                     alpha='auto',
                     eta='auto',
                     per_word_topics=True)

In [12]:
# 4. Print the 25 most significant terms per topic
for i in range(3):
    print(f'Topic {i+1}:')
    print([term for term, _ in lda_model.show_topic(i, topn=25)])
    print('\n')

Topic 1:
['staff', 'great', 'amazing', 'service', 'resort', 'property', 'friendly', 'excellent', 'would', 'helpful', 'everything', 'clean', 'place', 'experience', 'beautiful', 'vacation', 'really', 'pool', 'people', 'wonderful', 'restaurant', 'overall', 'sale', 'sunrise', 'room']


Topic 2:
['restaurant', 'resort', 'check', 'service', 'reservation', 'hotel', 'great', 'room', 'would', 'expedia', 'credit', 'beach', 'property', 'night', 'water', 'palace', 'grand', 'never', 'booked', 'building', 'ocean', 'front', 'overall', 'hour', 'could']


Topic 3:
['resort', 'beach', 'hotel', 'palace', 'great', 'restaurant', 'staff', 'place', 'water', 'option', 'service', 'property', 'sunrise', 'clean', 'family', 'would', 'grand', 'stayed', 'beautiful', 'cancun', 'inclusive', 'thing', 'experience', 'nizuc', 'activity']




In [13]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics using pyLDAvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Display the pyLDAvis visualization
pyLDAvis.display(vis)

In [14]:
# Get the topic distribution for each document
topic_distributions = []
for bow in corpus:
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0)
    topic_distributions.append([weight for _, weight in topic_distribution])

# Create a DataFrame
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(lda_model.num_topics)])

# Add the processed text to the DataFrame
topic_df.insert(0, 'Processed_Text', data['processed_text'])

# Add a "Dominant_Topic" column with the name of the topic with the highest weight
topic_df['Dominant_Topic'] = topic_df.iloc[:, 1:].idxmax(axis=1)

topic_df

Unnamed: 0,Processed_Text,Topic_1,Topic_2,Topic_3,Topic_4,Dominant_Topic
0,"[property, large, liking, great, definitely, n...",0.227459,0.005071,0.762640,0.004830,Topic_3
1,"[stuff, friendly, excellent, variety, restaura...",0.016593,0.012226,0.959524,0.011657,Topic_3
2,"[organized, transportation, short, staffed, re...",0.038200,0.028277,0.906618,0.026905,Topic_3
3,[],0.289236,0.216254,0.288090,0.206420,Topic_1
4,"[really, pushy, come, selling, resort, example...",0.022982,0.017088,0.943666,0.016264,Topic_3
...,...,...,...,...,...,...
2407,"[complejo, atencion, personal, lugar, primera,...",0.016330,0.012209,0.016265,0.955195,Topic_4
2408,"[entretenu, personnel, ecoute, environnement, ...",0.108292,0.706563,0.107861,0.077284,Topic_2
2409,"[hicieron, validos, habian, cuando, preguntaba...",0.048097,0.035960,0.047906,0.868036,Topic_4
2410,[],0.289236,0.216254,0.288090,0.206420,Topic_1


In [15]:
topic_df['Dominant_Topic'].value_counts()

Dominant_Topic
Topic_1    1406
Topic_3     466
Topic_2     293
Topic_4     247
Name: count, dtype: int64