In [1]:
# 具有分類路徑、出版日期與ISBN13的爬蟲程式
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

# 設置日誌記錄
logging.basicConfig(filename='/content/drive/My Drive/Colab Notebooks/sanmin_scraper.log', level=logging.INFO,
                    format='%(asctime)s - %(level)s - %(message)s')

# 基本設置
base_url = "https://www.sanmin.com.tw/promote/top/?id=yy&item=11403"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# 使用 session 來減少重複連線開銷
session = requests.Session()
data = []

# 設置自動重試機制
def fetch_with_retry(url, max_retries=3):
    """嘗試請求 URL，最多重試 max_retries 次"""
    for attempt in range(max_retries):
        try:
            response = session.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
            else:
                print(f"⚠️ Attempt {attempt+1} failed: {response.status_code}")
                time.sleep(2)  # 延遲再試
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Attempt {attempt+1} failed: {e}")
            time.sleep(2)
    return None  # 如果三次都失敗，返回 None

# 爬取每一頁的資料
for page in range(1, 26):
    url = f"{base_url}&pi={page}"
    response = session.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    logging.info(f"Processing page {page}")
    print(f"Processing page {page}")

    # 查找所有產品資訊
    products = soup.find_all('div', class_='sProduct')

    for product in products:
        try:
            rank = product.find('div', class_='Title').get_text(strip=True).split('.')[0]
            name = product.find('h3').get_text(strip=True)

            # 提取商品ID
            product_id = product['class'][1].replace('Prod', '')
            detail_url = f"https://www.sanmin.com.tw/product/index/{product_id}"

            # 提取作者
            author_tags = product.select('div.Author span.text-green a')
            authors = '; '.join([tag.get_text(strip=True) for tag in author_tags]) if author_tags else '未知'

            # 提取出版社
            publisher_tag = product.select_one('div.Author span.text-green a[href*="/search/index/?pu="]')
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else '未知'

            # 提取價格
            price_tag = product.select_one('.Price')
            price = price_tag.get_text(strip=True) if price_tag else '未知'

            # 訪問商品詳細頁面
            detail_response = fetch_with_retry(detail_url)
            if detail_response is None:
                print(f"❌ Failed to fetch {detail_url} after retries")
                logging.error(f"Failed to fetch {detail_url} after retries")
                continue  # 跳過此書，避免影響後續爬取

            detail_soup = BeautifulSoup(detail_response.text, 'html.parser')


            # 提取分類路徑
            breadcrumb_tags = detail_soup.select('#breadcrumb-trail a')  # 直接選擇所有 <a> 標籤
            category_path = '-'.join([tag.get_text(strip=True) for tag in breadcrumb_tags if tag.get_text(strip=True) != "三民網路書店"]) if breadcrumb_tags else '未知'

            # 提取 ISBN13
            try:
                isbn_tag = next((li for li in detail_soup.find_all('li', class_='mainText ga') if "ISBN13：" in li.text), None)
                isbn13 = isbn_tag.get_text(strip=True).replace('ISBN13：', '').strip() if isbn_tag else '未知'
            except Exception as e:
                logging.error(f"Error extracting ISBN13 for product {product_id}: {e}")
                isbn13 = '未知'

            # 提取出版日期
            try:
                pub_date_tag = next((li for li in detail_soup.find_all('li', class_='mainText ga') if "出版日：" in li.text), None)
                pub_date = pub_date_tag.get_text(strip=True).replace('出版日：', '').strip() if pub_date_tag else '未知'
            except Exception as e:
                logging.error(f"Error extracting PubDate for product {product_id}: {e}")
                pub_date = '未知'

            # 提取多個定位關鍵字的內容
            schemes = []
            keywords = ["三民出版品", "親子館", "中文圖書分類", "得獎作品"]
            for scheme_type in keywords:
                scheme_tag = detail_soup.find('a', class_='text-secondary bold', text=scheme_type)
                if scheme_tag:
                    main_title = scheme_tag.get_text(strip=True)
                    sub_titles = scheme_tag.find_parent('div', class_='text-secondary py3').find_all('h3', class_='d-inline fs-14')
                    for sub_title in sub_titles:
                        sub_text = sub_title.get_text(strip=True)
                        schemes.append(f"{main_title}-{sub_text}")

            scheme_text = '; '.join(schemes) if schemes else '未知'

            # 按指定順序添加資料
            data.append({
                "名次": rank,
                "ISBN13": isbn13,
                "名稱": name,
                "作者": authors,
                "出版社": publisher,
                "出版日期": pub_date,
                "價格": price,
                "分類路徑": category_path,
                "其他路徑": scheme_text,
                "商品ID": product_id,
                "詳細頁面URL": detail_url
            })

            logging.info(f"Extracted product: Rank={rank}, ISBN13={isbn13}, Name={name}, Authors={authors}, Publisher={publisher}, PubDate={pub_date}, Price={price}, CategoryPath={category_path}, SchemeText={scheme_text}, ProductID={product_id}")
            print(f"Extracted product: Rank={rank}, ISBN13={isbn13}, Name={name}, Authors={authors}, Publisher={publisher}, PubDate={pub_date}, Price={price}, CategoryPath={category_path}, SchemeText={scheme_text}, ProductID={product_id}")
        except AttributeError as e:
            logging.error(f"Error parsing product: {e}")
            logging.error(f"Failed product HTML: {product}")
            print(f"Error parsing product: {e}")
            print(f"Failed product HTML: {product}")

    # 暫停以避免被伺服器禁止
    time.sleep(2)

# 保存資料到CSV文件
df = pd.DataFrame(data)
output_path = '/content/drive/My Drive/Colab Notebooks/sanminranking11401-1.csv' # 修改過的路徑
df.to_csv(output_path, index=False, encoding='utf-8-sig')

logging.info(f"資料已保存到 {output_path}")
print(f"資料已保存到 {output_path}")



Processing page 1


  scheme_tag = detail_soup.find('a', class_='text-secondary bold', text=scheme_type)


Extracted product: Rank=1, ISBN13=9786269864188, Name=夢 游記【首刷限量隨書贈送：柯佳嬿手寫「夢游小卡」一張（兩款照片隨機附贈），送完為止】, Authors=柯佳嬿; 重版出版, Publisher=重版出版, PubDate=2025/03/14, Price=未知, CategoryPath=中文書-文學作品-文學-東方文學-遠東各地文學, SchemeText=中文圖書分類-小說, ProductID=014043798
Extracted product: Rank=2, ISBN13=9786263124455, Name=新大滿貫複習講義：物理（下）－高中選修物理I～V, Authors=陳世清; 鄭光泓; 顏銘裕; 陳禹潔-編; 翰林, Publisher=翰林, PubDate=2024/10/01, Price=437, CategoryPath=教科考用-升學應試-升大學/插大-自然, SchemeText=未知, ProductID=010138105
Extracted product: Rank=3, ISBN13=9786263124448, Name=贏戰關鍵60天：數學甲, Authors=殷灝; 翰林, Publisher=翰林, PubDate=2024/12/01, Price=428, CategoryPath=教科考用-升學應試-升大學/插大-數學/電腦, SchemeText=未知, ProductID=012738160
Extracted product: Rank=4, ISBN13=9786267436493, Name=哥吉拉大解剖圖鑑：西川伸司解構怪獸深淵, Authors=西川伸司; 奇幻基地, Publisher=奇幻基地, PubDate=2025/03/06, Price=749, CategoryPath=中文書-藝術設計-戲劇-電影, SchemeText=中文圖書分類-各種電影片, ProductID=014017192
Extracted product: Rank=5, ISBN13=9786263125636, Name=贏戰關鍵60天：公民與社會, Authors=廖翠雰; 陳昺泰; 于子芸; 李文正; 田單; 單媛; 翰林, Pub

OSError: Cannot save file into a non-existent directory: '/content/drive/My Drive/Colab Notebooks'