# Hotel Review Analysis

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from pprint import pprint
import time
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.matutils import corpus2csc
import pyLDAvis
import pyLDAvis.gensim_models

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# ËºâÂÖ•Ë≥áÊñô

In [2]:
review = pd.read_csv("./raw_data/reviews.csv")
offering = pd.read_csv("./raw_data/offerings.csv")
offering = offering[offering['hotel_class'].notna()]
data = review.merge(offering, left_on="offering_id", right_on="id", suffixes=("_review", "_hotel"))
data.head(2)

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id_review,via_mobile,hotel_class,region_id,url,phone,details,address,type,id_hotel,name
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","‚ÄúTruly is ""Jewel of the Upper Wets Side""‚Äù",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",‚ÄúMy home away from home!‚Äù,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False,3.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '2130 Broad...",hotel,93338,Hotel Beacon


# Ê∏ÖÊ¥óË≥áÊñô
- Ë©ûÂππÊ≠£Ë¶èÂåñ & ÂÅúÁî®Â≠ó & Â∞èÂØ´ & ÂñÆ‰∏ÄÂ≠ó

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['tokens'] = data['text'].apply(preprocess)
docs = [[word for word in doc.split() if word not in stop_words and len(word)>1] for doc in data['tokens']]
len(docs)

843624

In [4]:
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Â∞áToken (bigram) Âä†ÂÖ•Âà∞docsË£°Èù¢
            docs[idx].append(token)

## Êñ∞Â¢ûÊ¨Ñ‰Ωç&Ë®ìÁ∑¥ÂàÜÈ°ûSVMÂô®
- Â∞áhotelÂàÜÈ°ûÁÇ∫ 1-2 / 2-3 / 3-4 / 4-5 ÊòüÔºåÊñ∞Â¢ûÊ¨Ñ‰Ωçhotel_class_group
- Â∞çÈÄô‰∫õÊ¨Ñ‰ΩçÂ∞çÊáâÁöÑË©ïË´ñÂÅö‰∏ªÈ°åÂàÜÊûê

In [5]:
# ÂàÜÂçÄÈñìÂÆöÁæ©ÂáΩÂºè
def get_hotel_class_group(hotel_class):
    if 1.0 <= hotel_class < 2.0:
        return "1~2"
    elif 2.0 <= hotel_class < 3.0:
        return "2~3"
    elif 3.0 <= hotel_class < 4.0:
        return "3~4"
    elif 4.0 <= hotel_class <= 5.0:
        return "4~5"
    else:
        return "other"
    
data['hotel_class_group'] = data['hotel_class'].apply(get_hotel_class_group)

# ÂÅö LDA ‰∏ªÈ°åÊ®°ÂûãÂàÜÊûêÔºàÊØèÁµÑË∑ë‰∏ÄÊ¨°Ôºâ
t0 = time.time()
groups = ["1~2", "2~3", "3~4", "4~5"]
lda_results = {}

for group in groups:
    group_data = data[data['hotel_class_group'] == group]
    texts = group_data['tokens'].tolist()
    
    if len(texts) < 10:
        print(f"Group {group} has too few samples, skipping...")
        continue
    
    # Âª∫Á´ã docs ÁöÑ dictionaryÁâ©‰ª∂
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    print(dictionary)

    # Bag-of-words representation of the documents.
    # Áî® gensim ldamodel input ÈúÄË¶ÅÂ∞áÊñáÁ´†ËΩâÊèõÊàê bag of words 
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    
    # Ë®ìÁ∑¥ LDA Ê®°Âûã
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=5,
                         random_state=42,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)
    
    lda_results[group] = {
        "model": lda_model,
        "dictionary": dictionary,
        "corpus": corpus
    }

    # È°ØÁ§∫‰∏ªÈ°å
    print(f"\n=== LDA Topics for Hotel Class {group} ===")
    topics = lda_model.print_topics(num_words=10)
    for topic_num, topic_words in topics:
        print(f"Topic {topic_num + 1}: {topic_words}")
        
print(f"Ëä±Ë≤ªÊôÇÈñì: {time.time() - t0} sec")

Dictionary<100000 unique tokens: ['able', 'access', 'across', 'adjoining', 'air']...>

=== LDA Topics for Hotel Class 1~2 ===
Topic 1: 0.013*"nice" + 0.012*"good" + 0.011*"breakfast" + 0.010*"bed" + 0.009*"bathroom" + 0.009*"free" + 0.008*"area" + 0.008*"small" + 0.007*"one" + 0.007*"night"
Topic 2: 0.022*"de" + 0.016*"la" + 0.016*"e" + 0.010*"da" + 0.010*"und" + 0.010*"un" + 0.010*"le" + 0.007*"en" + 0.007*"et" + 0.007*"die"
Topic 3: 0.015*"metro" + 0.014*"great" + 0.014*"walk" + 0.013*"location" + 0.012*"block" + 0.011*"restaurant" + 0.011*"walking" + 0.010*"street" + 0.009*"good" + 0.008*"staff"
Topic 4: 0.010*"u" + 0.009*"would" + 0.009*"night" + 0.009*"desk" + 0.009*"one" + 0.008*"front" + 0.007*"get" + 0.007*"front_desk" + 0.007*"day" + 0.007*"time"
Topic 5: 0.020*"staff" + 0.017*"great" + 0.016*"stay" + 0.014*"service" + 0.008*"stayed" + 0.008*"dc" + 0.008*"friendly" + 0.007*"time" + 0.007*"u" + 0.007*"location"
Dictionary<100000 unique tokens: ['able', 'access', 'across', 'adjo

#### Ë¶ñË¶∫ÂåñPerplexity Âíå PMI Ë©ï‰º∞‰∏ªÈ°åÊ®°ÂûãË°®Áèæ
+ **Pointwise Mutual Information (PMI)** : <br>
Ëá™ÁÑ∂Ë™ûË®ÄËôïÁêÜ‰∏≠ÔºåÊÉ≥Ë¶ÅÊé¢Ë®éÂÖ©ÂÄãÂ≠ó‰πãÈñìÊòØÂê¶Â≠òÂú®ÊüêÁ®ÆÈóú‰øÇ„ÄÇ<br>
‰æãÂ¶ÇÔºöÊüê‰∫õÂ≠óÊúÉ‰∏ÄËµ∑Âá∫ÁèæÔºåÂèØËÉΩÂ∏∂ÊúâÊüê‰∫õË®äÊÅØÔºåÂõ†Ê≠§ÈÄôÂÄãÂèØ‰ª•Áî® PMI ‰æÜË®àÁÆóÔºåÊï∏Â≠óË∂äÂ§ßË∂äÂ•Ω„ÄÇ
+ **perplexity** :<br>
perplexity ‰πüÊòØË©ï‰º∞ÁöÑÊåáÊ®ô‰πã‰∏ÄÔºåÂª£Ê≥õÁî®ÊñºË™ûË®ÄÊ®°ÂûãÁöÑË©ï‰º∞ÔºåÊÑèÊÄùÁÇ∫Ë§áÈõúÂ∫¶ÔºåÂõ†Ê≠§Êï∏Â≠óË¶ÅË∂äÂ∞èË∂äÂ•Ω„ÄÇ

## 3. LDAvisË¶ñË¶∫ÂåñÁµêÊûú

LDAvis ÊòØÊàëÂÄëÁ∂ìÂ∏∏ÊúÉ‰ΩøÁî®ÁöÑË¶ñË¶∫ÂåñÂ∑•ÂÖ∑ÔºåÁõÆÁöÑÁÇ∫Âπ´Âä©ÊàëÂÄëËß£Èáã‰∏ªÈ°åÊ®°Âûã‰∏≠ÔºåÂú®ÊàëÂÄëÂª∫ÊßãÂ•Ω‰∏ªÈ°åÊ®°ÂûãÂæóÂà∞ Œ∏(Êñá‰ª∂ÁöÑ‰∏ªÈ°åÂàÜ‰Ωà) Ë∑ü œÜ(‰∏ªÈ°åÁöÑÂ≠óÂàÜ‰Ωà)ÔºåÈÄèÈÅé pyLDAvis Â∞á‰∏ªÈ°åÈôçÁ∂≠Êàê‰∫åÁ∂≠Ôºå‰ª•Á∂≤È†ÅÁöÑÂΩ¢Âºè‰æõÊàëÂÄëÊü•Áúã„ÄÇ

+ ÂõõÂÄã‰∏ªÈ°åÊï∏ÔºåÂõ†Ê≠§ÊúâÂõõÂÄãÂúàÂúà
+ ÂúìË∂äÂ§ß‰ª£Ë°® document Ë∂äÂ§ß
+ Âè≥ÈÇäÂèØ‰ª•ÁúãÂà∞‰∏ªÈ°åÁöÑÂ≠óÂàÜ‰Ωà
+ Âè≥‰∏äÂπ´Êúâ‰∏ÄÂÄã bar Ë™øÊï¥ lambdaÔºöÁï∂ lambda=1 ‰πüÂ∞±ÊòØ‰ª£Ë°®Êú¨‰æÜÁöÑÂ≠óÂàÜ‰Ωà œÜÔºåÂ∞á lambda Á∏ÆË∂äÂ∞èÂèØ‰ª•ÁúãÂà∞Ë∂äÂîØ‰∏ÄÁöÑÂ≠óÔºåÂ•ΩÁöÑÂàÜ‰ΩàÊòØ œÜ ÂÄºÈ´ò‰∏îÂîØ‰∏ÄÔºåÂõ†Ê≠§ÊàëÂÄëË¶ÅÂú®ÈÄôÂÖ©ËÄÖÈñìÂèñÂπ≥Ë°°
  - Œª = 1.0 üëâ Ê†πÊìö Ë©ûÂú®Ë©≤‰∏ªÈ°å‰∏≠Âá∫ÁèæÁöÑÊ©üÁéá ÊéíÂ∫èÔºà‰πüÂ∞±ÊòØÊ†πÊìöœÜÂÄºÔºâ
  - Œª = 0.0 üëâ Ê†πÊìö Ë©ûÂú®‰∏ªÈ°å‰∏≠„ÄåÁõ∏Â∞çÂÖ∂‰ªñ‰∏ªÈ°å„ÄçÁöÑÁâπÁï∞ÊÄß ÊéíÂ∫è
+ ÂúìÂøÉË∂äÁõ∏ËøëÔºå‰ª£Ë°®‰∏ªÈ°åÊúÉË∂äÁõ∏‰ººÔºõÂèç‰πãÔºåÂúìÂøÉÂàÜË∂äÈñã‰ª£Ë°®‰∏ªÈ°åÊúâÂîØ‰∏ÄÊÄß<br>
  --> ÂÅáË®≠Ë©ûÂΩôÊú¨‰æÜÊúâ 100 Â≠óÔºåÁ∂≠Â∫¶ÊáâË©≤ÊòØ 100ÔºåÂÅáÂ¶ÇÊú¨‰æÜÁ∂≠Â∫¶Êé•Ëøë(Áõ∏Ëøë)ÁöÑË©±ÔºåÈôçÁ∂≠Âæå‰πüÊúÉÊé•Ëøë(Áõ∏Ëøë)

In [9]:
# # Âè™ÈÅ∏Êìá hotel_class Âú® 4~5 ÂçÄÈñìÁöÑË©ïË´ñ
group = '4~5'
lda_model = lda_results[group]['model']
corpus = lda_results[group]['corpus']
dictionary = lda_results[group]['dictionary']

# È°ØÁ§∫‰∫íÂãïË¶ñË¶∫Âåñ
pyLDAvis.enable_notebook()
graph = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
graph

# for group in lda_results:
#     model = lda_results[group]['model']
#     corpus = lda_results[group]['corpus']
#     dictionary = lda_results[group]['dictionary']
#     vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
#     vis
    # ÂÑ≤Â≠òhtml
    # pyLDAvis.save_html(vis, f"LDA_visualization_{group}.html")
    # print(f"Saved: LDA_visualization_{group}.html")

## ‰∏ªÈ°åÂàÜ‰ΩàÁöÑÊáâÁî®ÔºåÊê≠ÈÖçÂÖ∂‰ªñÊñáÁ´†Ë≥áË®ä

Êúâ‰∫ÜÂâçÈù¢Ë®ìÁ∑¥ÁöÑ‰∏ªÈ°åÊ®°ÂûãÔºåÊé•‰∏ã‰æÜÂèØ‰ª•ÂàÜÊûêÊØè‰∏ÄÁ´†ÁØÄ‰∏ªÈ°åÁöÑÂàÜ‰ΩàÊÉÖÊ≥Å

In [10]:
# ÂèñÂæóÊØèÁ´†ÁöÑ‰∏ªÈ°åÂàÜ‰Ωà
topics_doc = lda_model.get_document_topics(corpus)

In [11]:
# Â∞ágensimÁöÑË°®Á§∫Ê≥ïËΩâÊàêÁ®ÄÁñèÁü©Èô£
m_theta = corpus2csc(topics_doc).T.toarray()
theta = pd.DataFrame(m_theta, columns=[f"topic_{i+1}" for i in range(m_theta.shape[1])])
theta

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.414121,0.000000,0.198677,0.119257,0.267642
1,0.156051,0.000000,0.551167,0.000000,0.284519
2,0.583057,0.000000,0.013367,0.000000,0.394022
3,0.678653,0.000000,0.000000,0.021060,0.295933
4,0.428189,0.000000,0.000000,0.277935,0.291875
...,...,...,...,...,...
843619,0.014629,0.919609,0.019336,0.000000,0.037251
843620,0.000000,0.971767,0.000000,0.000000,0.011236
843621,0.000000,0.961703,0.000000,0.000000,0.021525
843622,0.011578,0.932870,0.000000,0.028546,0.018172


#### Â∞áÊØèÂÄãÁ´†ÁØÄÁöÑ‰∏ªÈ°åÊ©üÁéáÂàÜÂ∏ÉË¶ñË¶∫Âåñ

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
theta.plot.bar(ax=ax, stacked=True, color = plt.cm.Set3.colors)



Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3549, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_14380\305158882.py", line 2, in <module>
    theta.plot.bar(ax=ax, stacked=True, color = plt.cm.Set3.colors)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_core.py", line 1192, in bar
    return self(kind="bar", x=x, y=y, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_core.py", line 1030, in __call__
    return plot_backend.plot(data, kind=kind, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\plotting\_matplotlib\__init__.py", line 71, in plot
    plot_ob

Error in callback <function _draw_all_if_interactive at 0x0000020BEC9771A0> (for post_execute), with arguments args (),kwargs {}:
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\events.py", line 82, in trigger
    func(*args, **kwargs)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\pyplot.py", line 197, in _draw_all_if_interactive
    draw_all()
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\_pylab_helpers.py", line 132, in draw_all
    manager.canvas.draw_idle()
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backend_bases.py", line 1893, in draw_idle
    self.draw(*args, **kwargs)
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backends\backend_agg.py", line 383, in draw
    self.renderer = self.get_renderer()
                    ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\site-packages\matplotlib\backends\backend_agg.py