In [None]:
# 4_topic_modeling_lda.ipynb
# Purpose: Perform LDA topic modeling on Chinese hotel reviews and visualize results

import ast
import os
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora, models

In [None]:
# Load the review texts. Each non-empty line of a sample file is parsed as a
# Python literal and its "text" entry is kept.
# NOTE(review): the one-dict-literal-per-line format is inferred from the
# parsing below -- confirm against whatever writes these files.
def _load_review_texts(path):
    """Return the "text" field of every non-empty record line in `path`.

    Lines are parsed with ast.literal_eval, which only accepts plain literals
    (dicts, lists, strings, numbers, ...). Anything else in the data file
    raises ValueError/SyntaxError instead of being executed as code.

    Raises:
        OSError: if `path` cannot be opened.
        ValueError, SyntaxError: if a line is not a valid Python literal.
        KeyError: if a record has no "text" key.
    """
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record; skip it
                # rather than fail to parse an empty string.
                continue
            texts.append(ast.literal_eval(line)["text"])
    return texts


positive_texts = _load_review_texts("positive_samples.txt")
negative_texts = _load_review_texts("negative_samples.txt")

# Combine all reviews (positive samples first, then negative samples)
all_texts = positive_texts + negative_texts

In [None]:
# Text preprocessing and tokenization
# Load the stop-word list: one entry per line, surrounding whitespace stripped.
# The context manager closes the file handle as soon as loading finishes.
with open("data/stopwords.txt", encoding="utf-8") as stopword_file:
    stopwords = {line.strip() for line in stopword_file}

def clean_and_tokenize(text):
    """Strip punctuation/whitespace from `text` and segment it with jieba.

    Runs of the matched punctuation and whitespace characters are deleted
    outright (replaced with the empty string), so the characters on either
    side of them are adjacent when jieba segments the result.

    A token is kept only if it is longer than one character and is not in
    the module-level `stopwords` set.

    Returns:
        list: the surviving tokens, in segmentation order.
    """
    stripped = re.sub(r"[\s+\.!/_,$%^*(+\"']+|[+\-\-！，。？、~@#￥%……&*（）]+", "", text)
    kept = []
    for token in jieba.lcut(stripped):
        # Drop single-character tokens and stop words.
        if len(token) <= 1 or token in stopwords:
            continue
        kept.append(token)
    return kept


In [None]:
# Tokenize every review (one token list per review, same order as all_texts)
tokenized_texts = list(map(clean_and_tokenize, all_texts))

# Build the gensim dictionary from the token lists, then encode each
# review as a bag-of-words vector against that dictionary
dictionary = corpora.Dictionary(tokenized_texts)
corpus = list(map(dictionary.doc2bow, tokenized_texts))

In [None]:
# Fit the LDA topic model on the bag-of-words corpus
num_topics = 5
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    alpha='auto',
    per_word_topics=True,
    random_state=42,  # fixed seed
)

# Print the 10 top keywords reported for each topic
topic_summaries = lda_model.print_topics(num_words=10)
for i, topic in topic_summaries:
    print(f"Topic {i}: {topic}")

In [None]:
# Visualize with pyLDAvis
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Save a standalone HTML copy of the visualization. Create the output
# directory first so the write does not fail when results/ is missing.
os.makedirs("results", exist_ok=True)
pyLDAvis.save_html(lda_vis, "results/lda_visualization.html")

# Render inline. A notebook cell only displays the value of its final
# expression, so this call must be the last statement in the cell;
# anywhere earlier its return value is silently discarded.
pyLDAvis.display(lda_vis)
