# Hotel Review Analysis

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from pprint import pprint
import time
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.matutils import corpus2csc
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date and skipped.
nltk.download('stopwords')   # stop-word list used when filtering tokens
nltk.download('wordnet')     # lexical database behind WordNetLemmatizer
# word_tokenize needs the Punkt sentence-tokenizer data; without it the first
# call raises LookupError. Newer NLTK releases look for 'punkt_tab', older
# ones for 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 載入資料

In [4]:
# Read the raw review table and the hotel ("offering") table; hotels without
# a star class are dropped right away because the analysis groups by it.
review = pd.read_csv("./raw_data/reviews.csv")
offering = pd.read_csv("./raw_data/offerings.csv").loc[
    lambda df: df['hotel_class'].notna()
]

# Attach hotel attributes to every review (inner join, the pandas default).
# Both tables carry an `id` column, so the suffixes keep them apart as
# `id_review` / `id_hotel`.
data = pd.merge(
    review,
    offering,
    left_on="offering_id",
    right_on="id",
    suffixes=("_review", "_hotel"),
)
data.head(2)

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id_review,via_mobile,hotel_class,region_id,url,phone,details,address,type,id_hotel,name
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon


# 清洗資料
- 詞形還原（lemmatization）、移除停用字、轉為小寫、移除只有單一字元的詞

In [12]:
# Shared text-cleaning resources, built once and reused for every review.
# The stop words are kept in a set so membership checks stay cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize a raw review into a list of lemmatized tokens.

    The text is lower-cased, every run of characters outside ``a-z`` is
    replaced by a single space, the result is tokenized with
    ``word_tokenize``, stop words are removed and the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Review text as a ``str``.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    text = text.lower()
    # Replace non-letters with a space instead of deleting them. Deleting
    # glued together words that were separated only by a newline, tab or
    # punctuation mark ("clean.\nstaff" became the bogus token "cleanstaff").
    text = re.sub(r'[^a-z]+', ' ', text)
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Tokenize every review. Missing text is turned into an empty string first;
# otherwise astype(str) would convert NaN into the literal token "nan".
data['tokens'] = data['text'].fillna('').astype(str).apply(preprocess)

# preprocess() returns a list of tokens, so iterate each list directly
# (calling .split() on a list raises AttributeError). Stop words are filtered
# once more after lemmatization and single-character tokens are dropped.
docs = [
    [word for word in doc if word not in stop_words and len(word) > 1]
    for doc in data['tokens']
]
len(docs)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001EEB0E93750>>
Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


NameError: name 'word_tokenize' is not defined

In [None]:
# Learn word-pair phrases from the tokenized reviews (min_count=20) and
# append each detected phrase to its own document, next to the original
# unigrams.
# NOTE(review): the LDA cell further down reads data['tokens'], not `docs`,
# so these phrase tokens do not reach the topic models as written - confirm
# whether that is intended.
bigram = Phrases(docs, min_count=20)
for idx, doc in enumerate(docs):
    # bigram[doc] builds a new token list, so extending `doc` here is safe.
    # Phrase tokens are recognised by the '_' that joins the two words.
    phrase_tokens = [token for token in bigram[doc] if '_' in token]
    doc.extend(phrase_tokens)

## LDA模型
- 將hotel分類為 1-2 / 2-3 / 3-4 / 4-5 星，新增欄位hotel_class_group
- 對這些欄位對應的評論做LDA

In [None]:
# 分區間定義函式
def get_hotel_class_group(hotel_class):
    """Map a numeric hotel class to the label of its one-star-wide bucket.

    Args:
        hotel_class: Star rating as a number.

    Returns:
        str: "1~2" for [1, 2), "2~3" for [2, 3), "3~4" for [3, 4),
        "4~5" for [4, 5] (5.0 included) and "other" for any value outside
        [1, 5], including NaN, for which every comparison is false.
    """
    # Reject everything outside the rated range first (NaN lands here too).
    if not (1.0 <= hotel_class <= 5.0):
        return "other"
    # The value is now known to be in [1, 5]; walk up the upper bounds.
    if hotel_class < 2.0:
        return "1~2"
    if hotel_class < 3.0:
        return "2~3"
    if hotel_class < 4.0:
        return "3~4"
    return "4~5"
    
# Label every review with the star bucket of its hotel.
data['hotel_class_group'] = data['hotel_class'].apply(get_hotel_class_group)

# Train one LDA topic model per star bucket and keep the model together with
# its dictionary and bag-of-words corpus for the later cells.
t0 = time.time()
groups = ["1~2", "2~3", "3~4", "4~5"]
lda_results = {}

for group in groups:
    group_data = data[data['hotel_class_group'] == group]
    texts = list(group_data['tokens'])

    # Buckets with fewer than 10 reviews are not modelled.
    if len(texts) < 10:
        print(f"Group {group} has too few samples, skipping...")
        continue

    # Vocabulary for this bucket: keep terms that occur in at least 5
    # documents and in no more than 50% of them.
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(dictionary)

    # Candidate topic counts. The original notes say each bucket does not
    # necessarily need 5 topics; the range is not consumed yet and every
    # model is trained with num_topics=5.
    topic_range = range(2, 11)

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,
        passes=5,
        alpha='auto',
        per_word_topics=True,
    )

    lda_results[group] = {"model": lda_model, "dictionary": dictionary, "corpus": corpus}

    # Show the ten highest-weighted words of each topic, numbered from 1.
    print(f"\n=== LDA Topics for Hotel Class {group} ===")
    topics = lda_model.print_topics(num_words=10)
    for topic_num, topic_words in topics:
        print(f"Topic {topic_num + 1}: {topic_words}")

# Total wall-clock time spent on all buckets.
print(f"花費時間: {time.time() - t0} sec")

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

#### 視覺化Perplexity 和 PMI 評估主題模型表現
+ **Pointwise Mutual Information (PMI)** : <br>
自然語言處理中，想要探討兩個字之間是否存在某種關係。<br>
例如：某些字會一起出現，可能帶有某些訊息，因此這個可以用 PMI 來計算，數字越大越好。
+ **perplexity** :<br>
perplexity 也是評估的指標之一，廣泛用於語言模型的評估，意思為複雜度，因此數字要越小越好。

## 3. LDAvis視覺化結果

LDAvis 是我們經常會使用的視覺化工具，目的為幫助我們解釋主題模型中，在我們建構好主題模型得到 θ(文件的主題分佈) 跟 φ(主題的字分佈)，透過 pyLDAvis 將主題降維成二維，以網頁的形式供我們查看。

+ 模型設定了五個主題（num_topics=5），因此有五個圈圈
+ 圓越大代表該主題在整體語料中所占的比例越高
+ 右邊可以看到主題的字分佈
+ 右上方有一個 bar 可以調整 lambda：當 lambda=1 也就是代表本來的字分佈 φ，將 lambda 縮越小可以看到越唯一的字，好的分佈是 φ 值高且唯一，因此我們要在這兩者間取平衡
  - λ = 1.0 👉 根據 詞在該主題中出現的機率 排序（也就是根據φ值）
  - λ = 0.0 👉 根據 詞在主題中「相對其他主題」的特異性 排序
+ 圓心越相近，代表主題會越相似；反之，圓心分越開代表主題有唯一性<br>
  --> 假設詞彙本來有 100 字，維度應該是 100，假如本來維度接近(相近)的話，降維後也會接近(相近)

In [None]:
# Pick the star bucket to inspect (currently the 1~2 star hotels) and pull
# its trained model, corpus and dictionary out of lda_results.
group = '1~2'
lda_model, corpus, dictionary = (
    lda_results[group][key] for key in ('model', 'corpus', 'dictionary')
)

# Render the interactive pyLDAvis view inline in the notebook.
pyLDAvis.enable_notebook()
graph = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
graph

# for group in lda_results:
#     model = lda_results[group]['model']
#     corpus = lda_results[group]['corpus']
#     dictionary = lda_results[group]['dictionary']
#     vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
#     vis
    # 儲存html
    # pyLDAvis.save_html(vis, f"LDA_visualization_{group}.html")
    # print(f"Saved: LDA_visualization_{group}.html")

## 主題分佈的應用，搭配其他文章資訊

有了前面訓練的主題模型，接下來可以分析每一則評論的主題分佈情況

In [None]:
# Topic distribution of every review in the selected corpus.
# minimum_probability=0 keeps all topics for each document. With the default
# threshold, low-probability topics are dropped from the output, so the
# per-document rows no longer sum to 1 and averages over them are biased.
topics_doc = lda_model.get_document_topics(corpus, minimum_probability=0)

In [None]:
# Convert gensim's per-document (topic_id, probability) pairs into a sparse
# matrix, transpose it so that rows are documents and columns are topics,
# then densify it for pandas.
m_theta = corpus2csc(topics_doc).T.toarray()

# One column per topic, labelled topic_1, topic_2, ...
theta = pd.DataFrame(m_theta)
theta.columns = [f"topic_{i+1}" for i in range(m_theta.shape[1])]
theta

#### 將評論的主題機率分布視覺化（所有評論的平均）

In [None]:
# A stacked bar per review (theta.plot.bar(stacked=True) on a 15x6 figure)
# was tried first, but with this many reviews it ran out of memory.
# Instead, show the average share of each topic over all reviews.
mean_topic_dist = theta.mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
mean_topic_dist.plot.bar(ax=ax, color=plt.cm.Set3.colors)
ax.set_title("整體主題分布平均")
ax.set_ylabel("平均比例")
ax.set_xlabel("Topic")
fig.tight_layout()
plt.show()