In [1]:
# Scrape up to 100 reviews for each motel in the list.
# A directory is created per motel and its reviews are saved to scrap_data/motel/{motel_name}/reviews.txt.
import os
import time
import random
from collections import Counter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

class MotelReviewScraper:
    """Collect review texts from a review page using headless Chrome.

    Usage: call ``start_browser()`` once, ``scrap_reviews(url)`` for each
    page, and ``stop_browser()`` when done (ideally in a ``finally`` block).
    """

    # CSS selector of the review text paragraphs. It relies on generated
    # class names (css-...), so it has to be updated whenever the page
    # markup changes.
    REVIEW_SELECTOR = (
        "#__next > section > div > div.css-1js0bc8 > div > "
        "div:nth-child(3) > div > div:nth-child(5) > div.css-1kpa3g > p"
    )

    def __init__(self):
        # Selenium WebDriver instance; None while no browser is running.
        self.browser = None

    def start_browser(self):
        """Launch a headless Chrome session and store it in ``self.browser``.

        A browser that is already running is shut down first so that it is
        not leaked when this method is called twice.
        """
        self.stop_browser()
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        service = Service(executable_path=ChromeDriverManager().install())
        self.browser = webdriver.Chrome(service=service, options=options)

    def stop_browser(self):
        """Quit the browser if one is running; safe to call repeatedly."""
        if self.browser:
            try:
                self.browser.quit()
            finally:
                # Drop the handle so later calls do not touch a dead session.
                self.browser = None

    def scrap_reviews(self, url, max_reviews=100, max_scrolls=10):
        """Return up to ``max_reviews`` non-empty review texts from ``url``.

        The page is scrolled to the bottom at most ``max_scrolls`` times to
        trigger loading of further reviews. After every scroll the review
        elements are queried again; elements that were already collected on
        an earlier pass are skipped so that no review is stored twice.

        Args:
            url: Address of the review page to load.
            max_reviews: Upper bound on the number of reviews returned.
            max_scrolls: Upper bound on the number of scroll passes.

        Returns:
            A list of review strings in page order (possibly empty).

        Raises:
            RuntimeError: If ``start_browser()`` has not been called.
        """
        if self.browser is None:
            raise RuntimeError(
                "Browser is not running; call start_browser() first."
            )

        reviews = []
        collected_ids = set()  # WebElement ids of reviews already stored

        self.browser.get(url)
        time.sleep(2)  # give the initial page load some time

        scroll_count = 0
        while len(reviews) < max_reviews and scroll_count < max_scrolls:
            # Scroll to the bottom of the page.
            self.browser.execute_script(
                "window.scrollTo(0, document.documentElement.scrollHeight)"
            )
            # Randomised pause while the page reacts to the scroll.
            time.sleep(random.uniform(1, 2))

            # The query returns every matching element currently in the
            # DOM, including those already handled on previous passes.
            review_elements = self.browser.find_elements(
                By.CSS_SELECTOR, self.REVIEW_SELECTOR
            )
            for element in review_elements:
                if element.id in collected_ids:
                    continue
                review = element.text
                if not review:
                    # Empty for now; not marked as collected so it is
                    # re-checked on the next pass in case text appears.
                    continue
                collected_ids.add(element.id)
                reviews.append(review)
                if len(reviews) >= max_reviews:
                    break

            scroll_count += 1

        return reviews[:max_reviews]

def save_reviews_to_txt(motel_name, reviews):
    """Write one motel's reviews to scrap_data/motel/<motel_name>/reviews.txt.

    The directory is created when missing and an existing file is
    overwritten. Every review is stored as a numbered header line
    (numbering starts at 1), the review text, and one empty line.
    """
    target_dir = f"scrap_data/motel/{motel_name}"
    os.makedirs(target_dir, exist_ok=True)
    target_file = os.path.join(target_dir, "reviews.txt")
    with open(target_file, "w", encoding="utf-8") as out:
        out.writelines(
            f"{number}번째 리뷰:\n{text}\n\n"
            for number, text in enumerate(reviews, 1)
        )

def main():
    """Scrape the reviews of every listed motel and save them as text files."""
    # (motel name, review page URL) pairs, processed in this order.
    targets = (
        ("서귀포 중문 제이힐&시티 호텔", "https://www.yanolja.com/reviews/domestic/26969"),
        ("제주 컬리넌 호텔", "https://www.yanolja.com/reviews/domestic/26526"),
        ("제주 휴", "https://www.yanolja.com/reviews/domestic/3015853"),
        ("서귀포(성산) 호텔 MCC", "https://www.yanolja.com/reviews/domestic/1016239"),
        ("제주 사하라호텔", "https://www.yanolja.com/reviews/domestic/1019375"),
        ("제주(탑동) HOTEL W", "https://www.yanolja.com/reviews/domestic/3015828"),
        ("제주 라임", "https://www.yanolja.com/reviews/domestic/3016692"),
        ("제주 F1", "https://www.yanolja.com/reviews/domestic/27127"),
        ("제주 Fantastic Oceanview 시드니호텔", "https://www.yanolja.com/reviews/domestic/26412"),
        ("제주 오션패밀리호텔", "https://www.yanolja.com/reviews/domestic/1016973"),
    )

    scraper = MotelReviewScraper()
    scraper.start_browser()

    try:
        for name, url in targets:
            collected = scraper.scrap_reviews(url)
            save_reviews_to_txt(name, collected)
            print(f"Reviews for {name} saved to scrap_data/motel/{name}/reviews.txt")
    finally:
        # The browser is shut down even when a scrape fails part-way.
        scraper.stop_browser()


if __name__ == "__main__":
    main()


Reviews for 서귀포 중문 제이힐&시티 호텔 saved to scrap_data/motel/서귀포 중문 제이힐&시티 호텔/reviews.txt
Reviews for 제주 컬리넌 호텔 saved to scrap_data/motel/제주 컬리넌 호텔/reviews.txt
Reviews for 제주 휴 saved to scrap_data/motel/제주 휴/reviews.txt
Reviews for 서귀포(성산) 호텔 MCC saved to scrap_data/motel/서귀포(성산) 호텔 MCC/reviews.txt
Reviews for 제주 사하라호텔 saved to scrap_data/motel/제주 사하라호텔/reviews.txt
Reviews for 제주(탑동) HOTEL W saved to scrap_data/motel/제주(탑동) HOTEL W/reviews.txt
Reviews for 제주 라임 saved to scrap_data/motel/제주 라임/reviews.txt
Reviews for 제주 F1 saved to scrap_data/motel/제주 F1/reviews.txt
Reviews for 제주 Fantastic Oceanview 시드니호텔 saved to scrap_data/motel/제주 Fantastic Oceanview 시드니호텔/reviews.txt
Reviews for 제주 오션패밀리호텔 saved to scrap_data/motel/제주 오션패밀리호텔/reviews.txt


In [2]:
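# Read scrap_data/motel/{motel_name}/reviews.txt,
# count how often each noun and each verb occurs in that motel's reviews (the 50 most common of each are kept),
# and save the counts to scrap_data/motel/{motel_name}/nouns_and_verbs.txt.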
from collections import Counter
from konlpy.tag import Okt
import os

def save_nouns_and_verbs_to_file(motel_name, nouns, verbs):
    """Write noun and verb counts to scrap_data/motel/<motel_name>/nouns_and_verbs.txt.

    ``nouns`` and ``verbs`` are iterables of (word, count) pairs. The file
    holds a noun section and a verb section, each a heading line followed
    by one "word: count" line per pair; an empty line ends each section.
    The directory is created when missing; an existing file is overwritten.
    """
    out_dir = f"scrap_data/motel/{motel_name}"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "nouns_and_verbs.txt")

    # Heading text (including its surrounding newlines) and the pairs
    # listed under it, in output order.
    sections = (
        ("명사 빈도수:\n", nouns),
        ("\n동사 빈도수:\n", verbs),
    )
    with open(out_path, "w", encoding="utf-8") as out:
        for heading, pairs in sections:
            out.write(heading)
            out.writelines(f"{term}: {freq}\n" for term, freq in pairs)
        out.write("\n")

def _is_review_header(line):
    """Return True for a "<number>번째 리뷰:" numbering line of reviews.txt."""
    suffix = "번째 리뷰:"
    return line.endswith(suffix) and line[:-len(suffix)].isdigit()


def main():
    """Count nouns and verbs in every motel's reviews and save the top 50 of each.

    For each motel, scrap_data/motel/<motel_name>/reviews.txt is read line
    by line. Blank lines and the "<number>번째 리뷰:" numbering lines are
    skipped so that only review text reaches the tagger; otherwise the
    numbering words would be counted as nouns once per review. A motel
    whose review file is missing is reported and skipped, and the remaining
    motels are still processed.
    """
    motel_name_lists = [
        "서귀포 중문 제이힐&시티 호텔",
        "제주 컬리넌 호텔",
        "제주 휴",
        "서귀포(성산) 호텔 MCC",
        "제주 사하라호텔",
        "제주(탑동) HOTEL W",
        "제주 라임",
        "제주 F1",
        "제주 Fantastic Oceanview 시드니호텔",
        "제주 오션패밀리호텔"
    ]

    # One tagger instance is shared by all motels instead of being
    # re-created on every loop iteration.
    okt = Okt()

    for motel_name in motel_name_lists:
        review_path = f"scrap_data/motel/{motel_name}/reviews.txt"
        try:
            with open(review_path, "r", encoding="utf-8") as file:
                lines = [line.strip() for line in file]
        except FileNotFoundError:
            print(f"Hotel review file not found for {motel_name}; skipping.")
            continue

        nouns = Counter()
        verbs = Counter()
        for line in lines:
            if not line or _is_review_header(line):
                continue
            for word, pos_tag in okt.pos(line):
                if pos_tag.startswith('Noun'):
                    nouns[word] += 1
                elif pos_tag.startswith('Verb'):
                    verbs[word] += 1

        print(f"Scraped Reviews for {motel_name}:")
        save_nouns_and_verbs_to_file(
            motel_name, nouns.most_common(50), verbs.most_common(50)
        )
        print(f"  - Nouns and verbs saved for {motel_name}")


if __name__ == "__main__":
    main()


Scraped Reviews for 서귀포 중문 제이힐&시티 호텔:
  - Nouns and verbs saved for 서귀포 중문 제이힐&시티 호텔
Scraped Reviews for 제주 컬리넌 호텔:
  - Nouns and verbs saved for 제주 컬리넌 호텔
Scraped Reviews for 제주 휴:
  - Nouns and verbs saved for 제주 휴
Scraped Reviews for 서귀포(성산) 호텔 MCC:
  - Nouns and verbs saved for 서귀포(성산) 호텔 MCC
Scraped Reviews for 제주 사하라호텔:
  - Nouns and verbs saved for 제주 사하라호텔
Scraped Reviews for 제주(탑동) HOTEL W:
  - Nouns and verbs saved for 제주(탑동) HOTEL W
Scraped Reviews for 제주 라임:
  - Nouns and verbs saved for 제주 라임
Scraped Reviews for 제주 F1:
  - Nouns and verbs saved for 제주 F1
Scraped Reviews for 제주 Fantastic Oceanview 시드니호텔:
  - Nouns and verbs saved for 제주 Fantastic Oceanview 시드니호텔
Scraped Reviews for 제주 오션패밀리호텔:
  - Nouns and verbs saved for 제주 오션패밀리호텔


In [4]:
# Create the directories scrap_data/motel/{motel_name}/nouns and scrap_data/motel/{motel_name}/verbs,
# read the noun and verb frequencies from scrap_data/motel/{motel_name}/nouns_and_verbs.txt,
# and save a frequency bar plot and a word cloud for the nouns and for the verbs as
# scrap_data/motel/{motel_name}/nouns/{motel_name}_nouns_word_frequency_plot.png
# scrap_data/motel/{motel_name}/nouns/{motel_name}_nouns_wordcloud.png
# scrap_data/motel/{motel_name}/verbs/{motel_name}_verbs_word_frequency_plot.png
# scrap_data/motel/{motel_name}/verbs/{motel_name}_verbs_wordcloud.png
# respectively.
import os
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from konlpy.tag import Okt

def _save_frequency_plot_and_word_cloud(pairs, out_dir, motel_name, label, color, font_path):
    """Save a bar chart and a word cloud for one list of (word, count) pairs.

    ``label`` ("Nouns" or "Verbs") appears in the plot title; its lower-case
    form is used in the output file names.
    """
    tag = label.lower()
    words, counts = zip(*pairs)

    # Font able to render Hangul labels; also keeps minus signs readable.
    plt.rcParams['font.family'] = 'malgun gothic'
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 6))
    try:
        plt.bar(words, counts, color=color)
        plt.title(f'Word Frequency Plot for {motel_name} ({label})')
        plt.xlabel('Words')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'{out_dir}/{motel_name}_{tag}_word_frequency_plot.png')
    finally:
        # Close the figure even if drawing or saving failed.
        plt.close()

    cloud = WordCloud(
        font_path=font_path,
        max_words=100,
        background_color='white'
    )
    cloud.generate_from_frequencies(dict(pairs)).to_file(
        f'{out_dir}/{motel_name}_{tag}_wordcloud.png'
    )


def generate_word_frequency_plot_and_word_cloud(
    motel_name, nouns, verbs, font_path='C:\\Windows\\Fonts\\malgunbd.ttf'
):
    """Save frequency plots and word clouds for a motel's nouns and verbs.

    The directories scrap_data/motel/<motel_name>/nouns and .../verbs are
    always created. For each non-empty list, two images are written into
    the matching directory:
    ``<motel_name>_<kind>_word_frequency_plot.png`` and
    ``<motel_name>_<kind>_wordcloud.png`` (kind is "nouns" or "verbs").

    Args:
        motel_name: Name used for the directory, file names and plot titles.
        nouns: List of (word, count) pairs; nothing is drawn when empty.
        verbs: List of (word, count) pairs; nothing is drawn when empty.
        font_path: Font file used for the word clouds. The default is a
            Windows path; pass another font when running elsewhere.
    """
    nouns_dir = f"scrap_data/motel/{motel_name}/nouns"
    verbs_dir = f"scrap_data/motel/{motel_name}/verbs"
    os.makedirs(nouns_dir, exist_ok=True)
    os.makedirs(verbs_dir, exist_ok=True)

    if nouns:
        _save_frequency_plot_and_word_cloud(
            nouns, nouns_dir, motel_name, 'Nouns', 'skyblue', font_path
        )
    if verbs:
        _save_frequency_plot_and_word_cloud(
            verbs, verbs_dir, motel_name, 'Verbs', 'salmon', font_path
        )

def _parse_frequency_lines(lines):
    """Split the lines of nouns_and_verbs.txt into noun and verb (word, count) lists.

    A section starts at its heading line and ends at the next empty line;
    lines outside a section are ignored. Each entry is split at its LAST
    colon so that a word containing a colon is still parsed.
    """
    noun_heading = "명사 빈도수:"
    verb_heading = "동사 빈도수:"
    sections = {noun_heading: [], verb_heading: []}
    current = None  # list receiving entries, or None outside a section
    for raw_line in lines:
        line = raw_line.strip()
        if line in sections:
            current = sections[line]
        elif line == "":
            current = None
        elif current is not None:
            word, count = line.rsplit(":", 1)
            current.append((word.strip(), int(count.strip())))
    return sections[noun_heading], sections[verb_heading]


def main():
    """Create frequency plots and word clouds for every listed motel.

    Reads scrap_data/motel/<motel_name>/nouns_and_verbs.txt for each motel
    and passes the parsed noun and verb frequencies on for plotting. A
    motel whose frequency file is missing is reported and skipped so the
    remaining motels are still processed.
    """
    motel_name_lists = [
        "서귀포 중문 제이힐&시티 호텔",
        "제주 컬리넌 호텔",
        "제주 휴",
        "서귀포(성산) 호텔 MCC",
        "제주 사하라호텔",
        "제주(탑동) HOTEL W",
        "제주 라임",
        "제주 F1",
        "제주 Fantastic Oceanview 시드니호텔",
        "제주 오션패밀리호텔"
    ]

    for motel_name in motel_name_lists:
        frequency_path = f"scrap_data/motel/{motel_name}/nouns_and_verbs.txt"
        try:
            with open(frequency_path, "r", encoding="utf-8") as file:
                lines = file.readlines()
        except FileNotFoundError:
            print(f"Noun/verb frequency file not found for {motel_name}; skipping.")
            continue

        nouns, verbs = _parse_frequency_lines(lines)

        # Generate word frequency plot and word cloud
        generate_word_frequency_plot_and_word_cloud(motel_name, nouns, verbs)
        print(f"Word Frequency Plot and Word Cloud saved for {motel_name}")


if __name__ == "__main__":
    main()


Word Frequency Plot and Word Cloud saved for 서귀포 중문 제이힐&시티 호텔
Word Frequency Plot and Word Cloud saved for 제주 컬리넌 호텔
Word Frequency Plot and Word Cloud saved for 제주 휴
Word Frequency Plot and Word Cloud saved for 서귀포(성산) 호텔 MCC
Word Frequency Plot and Word Cloud saved for 제주 사하라호텔
Word Frequency Plot and Word Cloud saved for 제주(탑동) HOTEL W
Word Frequency Plot and Word Cloud saved for 제주 라임
Word Frequency Plot and Word Cloud saved for 제주 F1
Word Frequency Plot and Word Cloud saved for 제주 Fantastic Oceanview 시드니호텔
Word Frequency Plot and Word Cloud saved for 제주 오션패밀리호텔
