### Load and label the data

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("hotel_reviews.csv", header=0)
df.columns = ["review", "rating"]

# Keep only 1, 2, 4, and 5 star ratings
df = df[df["rating"].isin([1, 2, 4, 5])]

# Create binary labels: 1 = positive review (4-5 stars), 0 = negative review (1-2 stars)
df["label"] = df["rating"].apply(lambda x: 1 if x >= 4 else 0)


### Text preprocessing (including tokenization, stopword removal, and stemming)

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define preprocessing function:
# - lowercase and tokenize
# - remove stopwords
# - apply stemming
def preprocess(text):
    tokens = simple_preprocess(text, deacc=True)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to each review
df["tokens"] = df["review"].astype(str).apply(preprocess)

[nltk_data] Downloading package stopwords to /Users/adam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Split into training and test sets

In [3]:
from sklearn.model_selection import train_test_split

X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(
    df["tokens"], df["label"], test_size=0.2, random_state=38)

### Create dictionary and corpus (based only on the training set)

In [4]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Create a dictionary and bag-of-words corpus for the training set
dictionary = Dictionary(X_train_tokens)
train_corpus = [dictionary.doc2bow(tokens) for tokens in X_train_tokens]
test_corpus = [dictionary.doc2bow(tokens) for tokens in X_test_tokens]

### Train multiple LDA models with different numbers of topics

In [10]:
topic_nums = [5, 10, 15, 20, 25, 30, 35, 40]
lda_models = {}

for num_topics in topic_nums:
    lda = LdaModel(corpus=train_corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=10,
                   random_state=38)
    lda_models[num_topics] = lda
    print(f"Finish training lda_models {num_topics}")

Finish training lda_models 5
Finish training lda_models 10
Finish training lda_models 15
Finish training lda_models 20
Finish training lda_models 25
Finish training lda_models 30
Finish training lda_models 35
Finish training lda_models 40


### Convert each review to a topic distribution vector

In [11]:
import numpy as np

def get_topic_vector(model, corpus, num_topics):
    topic_vecs = []
    for doc_bow in corpus:
        doc_topics = model.get_document_topics(doc_bow, minimum_probability=0)
        vec = [prob for _, prob in sorted(doc_topics)]
        topic_vecs.append(vec)
    return np.array(topic_vecs)

### Train classifiers and evaluate performance for each topic model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

results = []

for num_topics, lda_model in lda_models.items():
    X_train_vec = get_topic_vector(lda_model, train_corpus, num_topics)
    X_test_vec = get_topic_vector(lda_model, test_corpus, num_topics)
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    y_prob = clf.predict_proba(X_test_vec)[:, 1]

    results.append({
        "num_topics": num_topics,
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_prob)
    })

# Show results sorted by F1-score
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="f1_score", ascending=False))

   num_topics  accuracy  f1_score       auc
5          30  0.933097  0.960350  0.963578
4          25  0.930912  0.959081  0.961322
7          40  0.926543  0.956592  0.959256
3          20  0.925724  0.956157  0.956010
6          35  0.925724  0.956115  0.959468
0           5  0.924085  0.955103  0.955224
2          15  0.922447  0.954075  0.950704
1          10  0.921354  0.953443  0.951551


When number of topics equal to 30, the accuracy and the f1 score is the highest.

### Check predicted class distribution to make sure the classifier isn't biased

In [8]:
import numpy as np

print(np.unique(y_pred, return_counts=True))

(array([0, 1]), array([ 522, 3140]))


### Topic model visualisation

In [20]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # 注意是 gensim_models，不是 gensim


In [None]:
# Choose one model for visualization, e.g., num_topics = 10
chosen_model = lda_models[30]

# Prepare the visualization
vis_data = gensimvis.prepare(chosen_model, train_corpus, dictionary)

# Display in notebook
pyLDAvis.display(vis_data)

In [None]:
# Save as HTML
#pyLDAvis.save_html(vis_data, 'lda_visualization_10topics.html')

In [23]:
# Print top 10 words per topic
for topic_id in range(chosen_model.num_topics):
    print(f"\nTopic #{topic_id}:")
    print(chosen_model.show_topic(topic_id, topn=10))



Topic #0:
[('cocot', 0.010610378), ('slowli', 0.009639268), ('katrina', 0.009598335), ('rubbish', 0.008875811), ('patch', 0.008671292), ('sireni', 0.008627781), ('citizen', 0.008506678), ('someday', 0.008429657), ('festiv', 0.008162036), ('rowdi', 0.008051225)]

Topic #1:
[('benjamin', 0.057020172), ('everyon', 0.031447753), ('accid', 0.025301378), ('awsom', 0.022461075), ('belief', 0.014585127), ('circu', 0.012831604), ('playa', 0.012489509), ('carmen', 0.0121043185), ('picadilli', 0.010693822), ('wilson', 0.010256878)]

Topic #2:
[('park', 0.1777788), ('car', 0.098721504), ('valet', 0.036798026), ('francisco', 0.031740576), ('san', 0.031019807), ('street', 0.025542337), ('cabl', 0.024974858), ('union', 0.02111635), ('garag', 0.02030634), ('squar', 0.018852714)]

Topic #3:
[('day', 0.01787855), ('drink', 0.011916044), ('bar', 0.01167472), ('water', 0.011211767), ('restaur', 0.009445181), ('food', 0.00942003), ('pool', 0.009395457), ('towel', 0.009384993), ('dinner', 0.009162327), ('r