In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import json
import emoji
import re
import pandas as pd
import random
import time

class GoogleMapSpider:
    """Scraper for store ids, store names and reviews on google.com.tw/maps.

    All lookups are plain HTTP GET requests sent with a desktop-browser
    User-Agent; results are extracted from the returned HTML / JSON text.
    """

    # Textual shape of a store id as it appears in the search page source.
    _STORE_ID_PATTERN = re.compile(r'0x.{16}:0x.{16}')
    # Message carried by the exception raised when a store page cannot be parsed.
    # Callers compare against this exact text, so it must not change.
    _NOT_FOUND_MESSAGE = '無法取得店家資料'
    # Seconds to wait for a response; without a timeout requests can block forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        # The two literals are concatenated by Python, so the first one must end
        # with a space to yield a well-formed User-Agent string.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
        self.store_id_url = "https://www.google.com.tw/maps/search/{store_name}"
        # NOTE(review): the coordinates in this URL are hard-coded; presumably they
        # are ignored when a store id is supplied -- confirm.
        self.store_name_url = "https://www.google.com.tw/maps/place/data=!4m5!3m4!1s{store_id}!8m2!3d25.0564743!4d121.5204167?authuser=0&hl=zh-TW&rclk=1"
        self.comment_url = "https://www.google.com.tw/maps/rpc/listugcposts"

    def get_store_id(self, store_name):
        """Return the first store id found on the search page for *store_name*.

        store_name must exactly match the Google Maps search result,
        e.g. 隱家拉麵 士林店.

        Returns:
            The matched id string, or "" when the page contains no id.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        match = self._STORE_ID_PATTERN.search(str(soup))
        # An empty string (rather than an AttributeError on None) lets callers
        # detect "not found" with a simple truthiness / equality check.
        return match.group() if match else ""

    def get_store_name(self, store_id):
        """Return the store name shown on the place page for *store_id*.

        The name is read from the first <meta> tag whose markup contains
        itemprop="name": everything between the first double quote and the
        last "·" in that tag, minus the final two matched characters
        (presumably the " ·" separator).

        Raises:
            Exception: with message '無法取得店家資料' when the page has no
                <meta> tags, no name tag, or the name tag has no "·".
        """
        url = self.store_name_url.format(store_id=store_id)
        response = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        meta_list = soup.find_all('meta')
        if not meta_list:
            raise Exception(self._NOT_FOUND_MESSAGE)

        for meta in meta_list:
            meta_html = str(meta)
            if 'itemprop="name"' not in meta_html:
                continue
            matched = re.search('".*·', meta_html)
            if not matched:
                raise Exception(self._NOT_FOUND_MESSAGE)
            return matched.group()[1:-2]

        # No name tag at all: report it the same way as every other parse
        # failure instead of leaking an IndexError.
        raise Exception(self._NOT_FOUND_MESSAGE)

    def get_related_store_names(self, store_name):
        """Return the stores most related to the search term *store_name*.

        Every distinct store id on the search page is resolved to its name
        with get_store_name(); ids whose name cannot be resolved are skipped.

        Returns:
            dict mapping store name -> store id, in order of first appearance
            on the search page.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")

        raw_ids = self._STORE_ID_PATTERN.findall(str(soup))
        # dict.fromkeys de-duplicates while keeping page order (a set would not).
        store_ids = dict.fromkeys(raw_id.replace('\\', '') for raw_id in raw_ids)

        store_dict = {}
        for store_id in store_ids:
            try:
                name = self.get_store_name(store_id)
            except Exception:
                # Best effort: one unresolvable id must not abort the listing.
                continue
            # Pair each name with the id it was resolved from, so a skipped
            # id can never shift the remaining names onto the wrong ids.
            # On duplicate names the first (earliest on the page) id is kept.
            store_dict.setdefault(name, store_id)

        return store_dict

    def get_comment(self, store_id, page_count=1, sorted_by=2, max_waiting_interval=5):
        """Fetch reviews of *store_id* and return them as a list of dicts.

        page_count:
            0 - fetch all reviews
            n - fetch n pages of reviews

        sorted_by:
            1 - Most Relevant
            2 - Newest
            3 - Highest Rating
            4 - Lowest Rating

        max_waiting_interval:
            Upper bound, in seconds, of the random pause between two page
            requests (values below 1 are treated as 1).

        Each page holds 10 reviews unless fewer remain.

        Returns:
            One dict per review with the keys 評論者, 評論者id, 評論者狀態,
            評論者等級, 留言時間, 留言日期, 評論 and 評論分數. A field that is
            absent from the response is None.
        """
        next_token = ""
        raw_comments = []
        page = 1
        while page_count == 0 or page <= page_count:
            print(f"第 {page} 頁開始抓取")

            params = {
                "authuser": "0",
                "hl": "zh-TW",
                "gl": "tw",
                "pb": (
                    f"!1m6!1s{store_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s"
                    f"{next_token}"
                    f"!5m2!1s0OBwZ4OnGsrM1e8PxIjW6AI!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m0!13m1!1e{sorted_by}"
                )
            }

            response = requests.get(
                self.comment_url,
                params=params,
                headers=self.headers,
                timeout=self._REQUEST_TIMEOUT,
            )
            # The first four characters are dropped before parsing (presumably a
            # non-JSON guard prefix); emoji are replaced by their text aliases.
            data = json.loads(emoji.demojize(response.text[4:]))
            print(f"第 {page} 抓取結束")

            next_token = data[1]
            # The review list may be null when a page carries no reviews.
            raw_comments.extend(data[2] or [])
            if not next_token:
                print(f"所有評論已抓取完成，總共抓取 {len(raw_comments)} 則評論")
                break

            if page == page_count:
                # Last requested page: nothing follows, so do not wait.
                break

            # Pause between pages to avoid being blocked by Google.
            interval_time = random.randint(1, max(1, max_waiting_interval))
            print(f"等待 {interval_time} 秒後抓取下一頁")
            time.sleep(interval_time)
            page += 1

        return [self._parse_comment(comment_data) for comment_data in raw_comments]

    @staticmethod
    def _dig(data, *path):
        """Follow *path* through nested containers; None if any step is missing."""
        current = data
        for key in path:
            try:
                current = current[key]
            except (IndexError, KeyError, TypeError):
                return None
        return current

    @staticmethod
    def _parse_comment(comment_data):
        """Convert one raw review entry into the flat dict returned by get_comment."""
        dig = GoogleMapSpider._dig
        entry = dig(comment_data, 0)

        # The date is stored as a sequence starting with year, month, day, hour.
        date_parts = dig(entry, 2, 2, 0, 1, 21, 6, -1)
        try:
            comment_date = datetime(
                date_parts[0], date_parts[1], date_parts[2], date_parts[3]
            ).strftime('%Y/%m/%d %H:%M:%S')
        except (IndexError, KeyError, TypeError, ValueError, OverflowError):
            comment_date = None

        # Every field is optional in the response, so each lookup falls back to
        # None rather than aborting the whole scrape on one incomplete review.
        return {
            "評論者": dig(entry, 1, 4, 5, 0),
            "評論者id": dig(entry, 0),
            "評論者狀態": dig(entry, 1, 4, 5, 10, 0),
            "評論者等級": dig(entry, 1, 4, 5, 9),
            "留言時間": dig(entry, 1, 6),
            "留言日期": comment_date,
            "評論": dig(entry, 2, -1, 0, 0),
            "評論分數": dig(entry, 2, 0, 0),
        }

    def save_comments_to_csv(self, comments, filename):
        """Write *comments* (a list of review dicts) to *filename* as CSV.

        The file is encoded as utf-8-sig (UTF-8 with a byte-order mark) and
        has no index column.
        """
        df = pd.DataFrame(comments)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"評論已儲存至 {filename}")

if __name__ == "__main__":
    # Interactive driver: resolve a store from user input, scrape its reviews
    # (newest first) and save them to a CSV file chosen by the user.
    gms = GoogleMapSpider()
    store_id, store_name = "", ""
    while True:
        store_name = input("請輸入店名: ").strip()
        if not store_name:
            print("店名不得為空，請重新輸入店名")
            continue

        try:
            store_id = gms.get_store_id(store_name)
        except AttributeError:
            # Raised when no store id could be matched on the search page;
            # handle it like an empty result so the user can retry.
            store_id = ""

        if not store_id:
            print("無法取得店家資料，請重新輸入店名")
            continue
        print(f"店家id: {store_id}")

        try:
            store_name = gms.get_store_name(store_id)
        except Exception as e:
            # An IndexError means the page had no name entry at all, which is
            # the same "store not found" situation as the explicit message.
            if isinstance(e, IndexError) or str(e) == '無法取得店家資料':
                print("無法取得店家資料，請重新輸入店名")
                continue
            # Anything else is unexpected: re-raise with the original traceback.
            raise
        break
    print(f"店名: {store_name}")

    # NOTE(review): store_dict is not used below; the lookup is kept so the
    # name stays defined at module level. It costs one request per related
    # store -- consider removing it.
    store_dict = gms.get_related_store_names(store_name)

    # Ask until the answer is a non-negative integer; a negative page count
    # would silently fetch nothing.
    while True:
        try:
            page_count = int(input("輸入要爬取的頁數，輸入0則爬取所有評論: "))
        except ValueError:
            page_count = -1
        if page_count >= 0:
            break
        print("頁數必須為大於或等於 0 的整數，請重新輸入")

    commont_dict_list = gms.get_comment(store_id=store_id, page_count=page_count, sorted_by=2)
    print(commont_dict_list)

    while True:
        csv_filepath = input("輸入 CSV 路徑: ")
        try:
            gms.save_comments_to_csv(commont_dict_list, csv_filepath)
            break
        except (OSError, ValueError):
            # Only path / write problems are retried; Ctrl-C and programming
            # errors must still be able to leave the loop.
            print("輸入錯誤，請重新輸入CSV 路徑")