# Hotel Review Analysis

In [9]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 載入資料

In [10]:
review = pd.read_csv("./raw_data/reviews.csv")
offering = pd.read_csv("./raw_data/offerings.csv")

# 過濾掉 hotel_class 為空值的資料
offering = offering[offering['hotel_class'].notna()]

data = review.merge(offering, left_on="offering_id", right_on="id", suffixes=("_review", "_hotel"))
print(data)

                                                  ratings  \
0       {'service': 5.0, 'cleanliness': 5.0, 'overall'...   
1       {'service': 5.0, 'cleanliness': 5.0, 'overall'...   
2       {'service': 4.0, 'cleanliness': 5.0, 'overall'...   
3       {'service': 5.0, 'cleanliness': 5.0, 'overall'...   
4       {'service': 4.0, 'cleanliness': 5.0, 'overall'...   
...                                                   ...   
843619                                   {'overall': 4.0}   
843620  {'service': 4.0, 'cleanliness': 5.0, 'overall'...   
843621  {'cleanliness': 5.0, 'overall': 5.0, 'rooms': ...   
843622  {'cleanliness': 5.0, 'overall': 5.0, 'rooms': ...   
843623  {'service': 4.0, 'cleanliness': 4.0, 'overall'...   

                                                  title  \
0             “Truly is "Jewel of the Upper Wets Side"”   
1                             “My home away from home!”   
2                                          “Great Stay”   
3                              

# 清洗資料
- 將評論文字轉為小寫
- 移除特殊符號
- 去除停用字
- 詞性還原

In [11]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# from langdetect import detect
# def detect_language(text):
#     try:
#         return detect(text)
#     except:
#         return "unknown"
# # 新增欄位標記語言
# data['lang'] = data['text'].apply(detect_language)
# # 只保留英文
# data = data[data['lang'] == 'en']

data['clean_text'] = data['text'].apply(preprocess)

## 新增欄位&訓練分類SVM器
- 將評論分類為 positive / neutral / negative
- 將 ratings 欄位從字串解析成字典，並取 overall 欄位
- overall ≥ 4 → 正評
- overall = 3 → 中評
- overall ≤ 2 → 負評
- 評論轉成 TF-IDF 向量，考慮 1~2-gram
- 切資料，訓練集:測試集 = 7:3
- SVM模型訓練
- 印出精確率、召回率、F1 分數與混淆矩陣。

In [12]:
def extract_overall_rating(r):
    try:
        rating_dict = ast.literal_eval(r)
        return rating_dict.get('overall', np.nan)
    except Exception:
        return np.nan

data['overall_rating'] = data['ratings'].apply(extract_overall_rating)

# 建立情感標籤
conditions = [data['overall_rating'] >= 4, data['overall_rating'] == 3, data['overall_rating'] <= 2]
choices = ['positive', 'neutral', 'negative']
data['label'] = np.select(conditions, choices, default='neutral')

# TF-IDF 向量化
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000, stop_words="english")
X = vectorizer.fit_transform(data['clean_text'])
y = data['label']

# 分割資料集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 分別使用 Logistic Regression & Linear SVM 模型訓練
models = {
    'SVM': LinearSVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# 訓練 & 輸出結果
for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # 分類器效果
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # 混淆矩陣
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    

scoring = {
    'accuracy': 'accuracy',
    'f1_macro': make_scorer(f1_score, average='macro')
}

for name, model in models.items():
    print(f"\n===== {name} (5-fold Cross-Validation) =====")
    scores = cross_validate(model, X, y, cv=5, scoring=scoring)
    print(f"Accuracy (mean): {scores['test_accuracy'].mean():.4f}")
    print(f"F1 Macro (mean): {scores['test_f1_macro'].mean():.4f}")




===== SVM =====




Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.69      0.70     31903
     neutral       0.58      0.16      0.26     35288
    positive       0.85      0.98      0.91    185897

    accuracy                           0.83    253088
   macro avg       0.72      0.61      0.62    253088
weighted avg       0.80      0.83      0.79    253088

Confusion Matrix:
[[ 22058   1809   8036]
 [  6550   5781  22957]
 [  2248   2322 181327]]

===== Logistic Regression =====
Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.68      0.71     31903
     neutral       0.53      0.26      0.35     35288
    positive       0.87      0.96      0.91    185897

    accuracy                           0.83    253088
   macro avg       0.71      0.63      0.66    253088
weighted avg       0.80      0.83      0.81    253088

Confusion Matrix:
[[ 21608   3374   6921]
 [  5539   9198  20551]
 [



Accuracy (mean): 0.8254
F1 Macro (mean): 0.6237

===== Logistic Regression (5-fold Cross-Validation) =====
Accuracy (mean): 0.8281
F1 Macro (mean): 0.6567


##  LDA 主題模型
- 找出評論中最常見的主題
- 結果有除了英語以外的主題詞彙，可以在Step3濾掉
- 將評論轉為 TF-IDF，再套用 LDA 模型分成 3 個主題
- 印出每個主題的前10個關鍵詞

In [13]:
lda_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
lda_matrix = lda_vectorizer.fit_transform(data['clean_text'])
lda = LatentDirichletAllocation(n_components=3, random_state=2025)
lda.fit(lda_matrix)

terms = lda_vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"\nTopic #{idx+1}:")
    print([terms[i] for i in topic.argsort()[:-11:-1]])


Topic #1:
['hotel', 'great', 'location', 'staff', 'room', 'square', 'clean', 'good', 'helpful', 'stay']

Topic #2:
['und', 'da', 'die', 'ist', 'di', 'der', 'la', 'sehr', 'il', 'zimmer']

Topic #3:
['room', 'hotel', 'great', 'stay', 'staff', 'nice', 'night', 'good', 'service', 'stayed']
