# Hotel Review Analysis

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from pprint import pprint
import time
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.matutils import corpus2csc
import pyLDAvis
import pyLDAvis.gensim_models

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 載入資料

In [2]:
review = pd.read_csv("./raw_data/reviews.csv")
offering = pd.read_csv("./raw_data/offerings.csv")
offering = offering[offering['hotel_class'].notna()]
data = review.merge(offering, left_on="offering_id", right_on="id", suffixes=("_review", "_hotel"))
data.head(2)

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id_review,via_mobile,hotel_class,region_id,url,phone,details,address,type,id_hotel,name
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon


# 清洗資料
- 詞幹正規化 & 停用字 & 小寫 & 單一字

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['tokens'] = data['text'].apply(preprocess)
docs = [[word for word in doc.split() if word not in stop_words and len(word)>1] for doc in data['tokens']]
len(docs)

843624

In [4]:
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # 將Token (bigram) 加入到docs裡面
            docs[idx].append(token)

## 新增欄位&訓練分類SVM器
- 將hotel分類為 1-2 / 2-3 / 3-4 / 4-5 星，新增欄位hotel_class_group
- 對這些欄位對應的評論做主題分析

In [5]:
# 分區間定義函式
def get_hotel_class_group(hotel_class):
    if 1.0 <= hotel_class < 2.0:
        return "1~2"
    elif 2.0 <= hotel_class < 3.0:
        return "2~3"
    elif 3.0 <= hotel_class < 4.0:
        return "3~4"
    elif 4.0 <= hotel_class <= 5.0:
        return "4~5"
    else:
        return "other"
    
data['hotel_class_group'] = data['hotel_class'].apply(get_hotel_class_group)

# 做 LDA 主題模型分析（每組跑一次）
t0 = time.time()
groups = ["1~2", "2~3", "3~4", "4~5"]
lda_results = {}

for group in groups:
    group_data = data[data['hotel_class_group'] == group]
    texts = group_data['tokens'].tolist()
    
    if len(texts) < 10:
        print(f"Group {group} has too few samples, skipping...")
        continue
    
    # 建立 docs 的 dictionary物件
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    print(dictionary)

    # Bag-of-words representation of the documents.
    # 用 gensim ldamodel input 需要將文章轉換成 bag of words 
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    
    # 訓練 LDA 模型
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=5,
                         random_state=42,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)
    
    lda_results[group] = {
        "model": lda_model,
        "dictionary": dictionary,
        "corpus": corpus
    }

    # 顯示主題
    print(f"\n=== LDA Topics for Hotel Class {group} ===")
    topics = lda_model.print_topics(num_words=10)
    for topic_num, topic_words in topics:
        print(f"Topic {topic_num + 1}: {topic_words}")
        
print(f"花費時間: {time.time() - t0} sec")

Dictionary<100000 unique tokens: ['able', 'access', 'across', 'adjoining', 'air']...>

=== LDA Topics for Hotel Class 1~2 ===
Topic 1: 0.013*"nice" + 0.012*"good" + 0.011*"breakfast" + 0.010*"bed" + 0.009*"bathroom" + 0.009*"free" + 0.008*"area" + 0.008*"small" + 0.007*"one" + 0.007*"night"
Topic 2: 0.022*"de" + 0.016*"la" + 0.016*"e" + 0.010*"da" + 0.010*"und" + 0.010*"un" + 0.010*"le" + 0.007*"en" + 0.007*"et" + 0.007*"die"
Topic 3: 0.015*"metro" + 0.014*"great" + 0.014*"walk" + 0.013*"location" + 0.012*"block" + 0.011*"restaurant" + 0.011*"walking" + 0.010*"street" + 0.009*"good" + 0.008*"staff"
Topic 4: 0.010*"u" + 0.009*"would" + 0.009*"night" + 0.009*"desk" + 0.009*"one" + 0.008*"front" + 0.007*"get" + 0.007*"front_desk" + 0.007*"day" + 0.007*"time"
Topic 5: 0.020*"staff" + 0.017*"great" + 0.016*"stay" + 0.014*"service" + 0.008*"stayed" + 0.008*"dc" + 0.008*"friendly" + 0.007*"time" + 0.007*"u" + 0.007*"location"
Dictionary<100000 unique tokens: ['able', 'access', 'across', 'adjo

#### 視覺化Perplexity 和 PMI 評估主題模型表現
+ **Pointwise Mutual Information (PMI)** : <br>
自然語言處理中，想要探討兩個字之間是否存在某種關係。<br>
例如：某些字會一起出現，可能帶有某些訊息，因此這個可以用 PMI 來計算，數字越大越好。
+ **perplexity** :<br>
perplexity 也是評估的指標之一，廣泛用於語言模型的評估，意思為複雜度，因此數字要越小越好。

## 3. LDAvis視覺化結果

LDAvis 是我們經常會使用的視覺化工具，目的為幫助我們解釋主題模型中，在我們建構好主題模型得到 θ(文件的主題分佈) 跟 φ(主題的字分佈)，透過 pyLDAvis 將主題降維成二維，以網頁的形式供我們查看。

+ 四個主題數，因此有四個圈圈
+ 圓越大代表 document 越大
+ 右邊可以看到主題的字分佈
+ 右上幫有一個 bar 調整 lambda：當 lambda=1 也就是代表本來的字分佈 φ，將 lambda 縮越小可以看到越唯一的字，好的分佈是 φ 值高且唯一，因此我們要在這兩者間取平衡
  - λ = 1.0 👉 根據 詞在該主題中出現的機率 排序（也就是根據φ值）
  - λ = 0.0 👉 根據 詞在主題中「相對其他主題」的特異性 排序
+ 圓心越相近，代表主題會越相似；反之，圓心分越開代表主題有唯一性<br>
  --> 假設詞彙本來有 100 字，維度應該是 100，假如本來維度接近(相近)的話，降維後也會接近(相近)

In [9]:
# # 只選擇 hotel_class 在 4~5 區間的評論
group = '4~5'
lda_model = lda_results[group]['model']
corpus = lda_results[group]['corpus']
dictionary = lda_results[group]['dictionary']

# 顯示互動視覺化
pyLDAvis.enable_notebook()
graph = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
graph

# for group in lda_results:
#     model = lda_results[group]['model']
#     corpus = lda_results[group]['corpus']
#     dictionary = lda_results[group]['dictionary']
#     vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
#     vis
    # 儲存html
    # pyLDAvis.save_html(vis, f"LDA_visualization_{group}.html")
    # print(f"Saved: LDA_visualization_{group}.html")

## 主題分佈的應用，搭配其他文章資訊

有了前面訓練的主題模型，接下來可以分析每一章節主題的分佈情況

In [10]:
# 取得每章的主題分佈
topics_doc = lda_model.get_document_topics(corpus)

In [11]:
# 將gensim的表示法轉成稀疏矩陣
m_theta = corpus2csc(topics_doc).T.toarray()
theta = pd.DataFrame(m_theta, columns=[f"topic_{i+1}" for i in range(m_theta.shape[1])])
theta

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.414121,0.000000,0.198677,0.119257,0.267642
1,0.156051,0.000000,0.551167,0.000000,0.284519
2,0.583057,0.000000,0.013367,0.000000,0.394022
3,0.678653,0.000000,0.000000,0.021060,0.295933
4,0.428189,0.000000,0.000000,0.277935,0.291875
...,...,...,...,...,...
843619,0.014629,0.919609,0.019336,0.000000,0.037251
843620,0.000000,0.971767,0.000000,0.000000,0.011236
843621,0.000000,0.961703,0.000000,0.000000,0.021525
843622,0.011578,0.932870,0.000000,0.028546,0.018172


#### 將每個章節的主題機率分布視覺化

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
theta.plot.bar(ax=ax, stacked=True, color = plt.cm.Set3.colors)



Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3549, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_14380\305158882.py", line 2, in <module>
    theta.plot.bar(ax=ax, stacked=True, color = plt.cm.Set3.colors)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_core.py", line 1192, in bar
    return self(kind="bar", x=x, y=y, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_core.py", line 1030, in __call__
    return plot_backend.plot(data, kind=kind, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_matplotlib\__init__.py", line 71, in plot
    plot_ob

Error in callback <function _draw_all_if_interactive at 0x0000020BEC9771A0> (for post_execute), with arguments args (),kwargs {}:
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\events.py", line 82, in trigger
    func(*args, **kwargs)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\pyplot.py", line 197, in _draw_all_if_interactive
    draw_all()
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\_pylab_helpers.py", line 132, in draw_all
    manager.canvas.draw_idle()
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backend_bases.py", line 1893, in draw_idle
    self.draw(*args, **kwargs)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backends\backend_agg.py", line 383, in draw
    self.renderer = self.get_renderer()
                    ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backends\backend_agg.py