In [1]:
#import data from a csv file and store in a dataframe
import pandas as pd
df = pd.read_csv('../raw_data/Hotel_Reviews.csv')

# merger two 'Negative_Review', 'Positive_Review' columns and save to a new column 'Reviews'
df['Reviews'] = df['Negative_Review'] + df['Positive_Review']

# get the X_train data, only one column 'Reviews'
X_train = df['Reviews']


In [3]:
X_train.shape, df.shape

((515738,), (515738, 18))

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

stop_words = set(stopwords.words("english"))
# Convert set to list, remove 'again', and convert back to set
word_list = list(stop_words)
word_list.append('negative')
stop_words = set(word_list)

# Function to preprocess sentences
def preprocess(sentences):
    lemmatizer = WordNetLemmatizer()
    word_frequencies = defaultdict(int)
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [lemmatizer.lemmatize(word) for word in words if word.isalnum()]
        for word in words:
            if word not in stop_words:
                word_frequencies[word] += 1
    return word_frequencies

In [5]:
X_preprocessed = preprocess(X_train)

In [6]:
# Latent Dirichlet Allocation(LDA) model

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(X_preprocessed)

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

In [7]:
#Visualize potential topics
#a function that prints the words associated with the potential topics.
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

In [8]:
print_topics(lda_model, vectorizer)

Topic 0:
[('fraudulent', 1.4767379919740853), ('canid', 1.4767379889367038), ('locatioin', 1.4767379804035135), ('beingva', 1.4767379758721062), ('touristed', 1.4767379700347414), ('nettet', 1.476737948201854), ('intence', 1.4767379387176947), ('otber', 1.476737936793093), ('908', 1.4767379353385723), ('botany', 1.4767379321897975)]
Topic 1:
[('extrememly', 1.4765976829346326), ('arriv', 1.4765976602245097), ('452', 1.4765976482030905), ('retuning', 1.4765976435568486), ('fnloor', 1.476597643510929), ('transpires', 1.4765976390709634), ('asterisk', 1.4765976353816783), ('2913', 1.4765976349007235), ('conevenient', 1.4765976248050152), ('gbp40', 1.4765976213280811)]


In [None]:
# 