# Google 商家評論爬蟲

GitHub 連結: https://github.com/TimLai666/google-maps-store-review-crawler

## 安裝套件

1. 非 Colab 環境:
    ```bash
    pip install -r requirements.txt
    ```
2. Colab 環境:<br/>
    執行下方程式碼⬇️

In [None]:
!pip install beautifulsoup4 requests pandas emoji

## 主要程式碼

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
from IPython.display import display_html
import requests
import json
import emoji
import re
import pandas as pd
import random
import time

class GoogleMapsReviewCrawler:
    """Crawler for Google Maps store reviews.

    The request headers and the URL templates used by every method are
    downloaded from a remote JSON config when an instance is created.
    """

    # Seconds to wait for a single HTTP request before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30
    # Textual shape of a store id as searched for in the fetched pages.
    STORE_ID_PATTERN = r'0x.{16}:0x.{16}'

    def __init__(self):
        """Download the crawler config and keep its headers and URL templates."""
        self.__config_url__ ="https://raw.githubusercontent.com/TimLai666/google-maps-store-review-crawler/refs/heads/main/crawler_config.json"
        res = requests.get(self.__config_url__, timeout=self.REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of a JSON decoding error.
        res.raise_for_status()
        config = res.json()
        self.headers = config.get("headers")
        self.store_id_url = config.get("storeSearchUrl")
        self.store_name_url = config.get("storeNameUrl")
        self.review_url = config.get("reviewUrl")

    @staticmethod
    def _dig(node, *path):
        """Follow ``path`` through nested containers; return None if a step is missing."""
        try:
            for key in path:
                node = node[key]
        except (IndexError, KeyError, TypeError):
            return None
        return node

    @classmethod
    def _parse_review(cls, review_data):
        """Convert one raw review entry into the flat dict returned by get_review.

        Any field whose position is absent from ``review_data`` becomes None,
        so a single incomplete review does not abort the whole crawl.
        """
        stamp = cls._dig(review_data, 0, 2, 2, 0, 1, 21, 6, -1)
        try:
            # stamp is indexed as [year, month, day, hour].
            review_date = datetime(stamp[0], stamp[1], stamp[2], stamp[3]).strftime('%Y/%m/%d %H:%M:%S')
        except (IndexError, KeyError, TypeError, ValueError, OverflowError):
            review_date = None

        return {
            "評論者": cls._dig(review_data, 0, 1, 4, 5, 0),
            "評論者id": cls._dig(review_data, 0, 0),
            "評論者狀態": cls._dig(review_data, 0, 1, 4, 5, 10, 0),
            "評論者等級": cls._dig(review_data, 0, 1, 4, 5, 9),
            "留言時間": cls._dig(review_data, 0, 1, 6),
            "留言日期": review_date,
            "評論": cls._dig(review_data, 0, 2, -1, 0, 0),
            "評論分數": cls._dig(review_data, 0, 2, 0, 0),
        }

    def get_store_id(self, store_name: str):
        """Return the first store id found in the search result for ``store_name``.

        ``store_name`` must match the Google Maps search result exactly,
        e.g. 隱家拉麵 士林店.

        Raises:
            Exception: if the search result contains no store id.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        match = re.search(self.STORE_ID_PATTERN, str(soup))
        if match is None:
            raise Exception('無法取得商家資料')

        return match.group()

    def get_store_name(self, store_id: str):
        """Return the store name published on the page of ``store_id``.

        The name is read from the first ``<meta itemprop="name">`` tag.

        Raises:
            Exception: if the page has no usable ``itemprop="name"`` meta tag.
        """
        url = self.store_name_url.format(store_id=store_id)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        for meta in soup.find_all('meta'):
            meta_html = str(meta)
            if '''itemprop="name"''' not in meta_html:
                continue
            matched = re.search(r'".*·', meta_html)
            if not matched:
                raise Exception('無法取得商家資料')
            # Drop the opening quote and the trailing " ·" separator.
            return matched.group()[1:-2]

        # No meta tag at all, or none carrying the name.
        raise Exception('無法取得商家資料')

    def get_related_store_names(self, store_name):
        """Return the stores most related to the search as ``{store name: store id}``.

        Ids are looked up in the order they appear in the search response.
        Ids whose name cannot be resolved are skipped, and when several ids
        resolve to the same name the first one is kept.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        found_ids = re.findall(self.STORE_ID_PATTERN, str(soup))
        # Strip backslashes first, then de-duplicate; dict.fromkeys keeps the
        # first-seen order, unlike a set.
        store_id_list = list(dict.fromkeys(found.replace('\\', '') for found in found_ids))

        store_dict = {}
        for store_id in store_id_list:
            try:
                resolved_name = self.get_store_name(store_id)
            except Exception:
                # Best effort: an id that cannot be resolved is left out.
                continue
            # Pair each name with the id it was resolved from, so a failed
            # lookup can never shift names onto the wrong ids.
            store_dict.setdefault(resolved_name, store_id)

        return store_dict

    def get_review(self, store_id: str, page_count=1, sorted_by=2, max_waiting_interval=5):
        """Fetch reviews of ``store_id`` page by page and return them as dicts.

        page_count:
            0 - fetch every review
            n - fetch n pages of reviews

        sorted_by:
            1 - Most Relevant
            2 - Newest
            3 - Highest Rating
            4 - Lowest Rating

        max_waiting_interval:
            Upper bound, in seconds, of the random pause between two pages.
            Values below 1 are treated as 1.

        Each page holds 10 reviews unless fewer are available.

        Returns:
            A list with one dict per review; fields missing from the response
            are None.
        """
        next_token = ""
        raw_reviews = []
        page = 1
        # random.randint(1, n) raises for n < 1, so clamp the upper bound.
        max_wait = max(1, int(max_waiting_interval))
        while page_count == 0 or page <= page_count:
            print(f"第 {page} 頁開始抓取")

            # Only the store id, the paging token and the sort order vary;
            # the rest of the "pb" payload is sent unchanged.
            params = {
                "authuser": "0",
                "hl": "zh-TW",
                "gl": "tw",
                "pb": (
                    f"!1m6!1s{store_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s"
                    f"{next_token}"
                    f"!5m2!1s0OBwZ4OnGsrM1e8PxIjW6AI!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m0!13m1!1e{sorted_by}"
                )
            }

            response = requests.get(
                self.review_url,
                params=params,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            # The first four characters of the body are skipped before JSON
            # decoding; emoji are converted to text aliases.
            data = json.loads(emoji.demojize(response.text[4:]))
            print(f"第 {page} 抓取結束")

            if not data:
                print("沒有任何評論，結束抓取")
                break

            next_token = self._dig(data, 1)
            # The review list can be absent (None) when a page has no reviews.
            raw_reviews.extend(self._dig(data, 2) or [])
            if not next_token or page == page_count:
                print(f"所有評論已抓取完成，總共抓取 {len(raw_reviews)} 則評論")
                break

            # Pause between pages to avoid being blocked by Google.
            interval_time = random.randint(1, max_wait)
            print(f"等待 {interval_time} 秒後抓取下一頁")
            time.sleep(interval_time)
            page += 1

        # Keep only the fields of interest from each raw review.
        return [self._parse_review(review_data) for review_data in raw_reviews]

    def display_table(self, reviews):
        """Show ``reviews`` as a table through IPython's ``display_html``."""
        df = pd.DataFrame(reviews)
        display_html(df)

    def save_reviews_to_csv(self, reviews, filename):
        """Write ``reviews`` to ``filename`` as CSV, encoded as UTF-8 with BOM."""
        df = pd.DataFrame(reviews)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"評論已儲存至 {filename}")

# Interactive driver: pick a store, crawl its reviews, show them, and
# optionally save them to a CSV file.
if __name__ == "__main__":
    gmcc = GoogleMapsReviewCrawler()
    store_id, store_name = "", ""
    # Keep prompting until the user confirms one of the candidate stores.
    while not store_id:
        store_name_input = input("請輸入店名: ")
        if not store_name_input:
            print("店名不得為空，請重新輸入店名")
            continue

        store_dict = gmcc.get_related_store_names(store_name_input)
        print(store_dict)
        for candidate_name, candidate_id in store_dict.items():
            is_correct = input(f"是否為您要找的店? [ {candidate_name} ] (y/n): ").lower() == 'y'
            if is_correct:
                store_name, store_id = candidate_name, candidate_id
                break
        else:
            # Every candidate was rejected (or none was found): ask again
            # rather than crawling with an empty store id.
            print("沒有更多商家資料，請重新輸入店名")

    print(f"完整店名: {store_name}")
    # Re-prompt on anything that is not a non-negative integer.
    while True:
        try:
            page_count = int(input("輸入要爬取的頁數，輸入0則爬取所有評論: "))
        except ValueError:
            page_count = -1
        if page_count >= 0:
            break
        print("輸入錯誤，請輸入 0 或正整數")

    commont_dict_list = gmcc.get_review(store_id=store_id, page_count=page_count, sorted_by=2)
    print(commont_dict_list)

    gmcc.display_table(commont_dict_list)

    does_save_to_csv = input("是否要儲存評論至 CSV 檔? (y/n): ").lower() == 'y'
    while does_save_to_csv:
        csv_filepath = input("輸入 CSV 路徑: ")
        try:
            gmcc.save_reviews_to_csv(commont_dict_list, csv_filepath)
            break
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still leave the loop.
            print("輸入錯誤，請重新輸入CSV 路徑")