In [2]:
import requests
from bs4 import BeautifulSoup

# Target article URL
url = "https://www.ithome.com.tw/news/152373"

# Fetch the page. Without a timeout, requests waits indefinitely on a
# stalled connection, so bound the wait explicitly.
response = requests.get(url, timeout=10)
response.raise_for_status()  # abort on HTTP 4xx/5xx responses

# Parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the article title; fail with a clear message instead of an
# AttributeError when the expected <h1> is missing.
title_tag = soup.find('h1', class_='page-header')
if title_tag is None:
    raise ValueError("無法找到文章標題")
title = title_tag.get_text(strip=True)

# Extract the article body. Looking only at the first `div.field-items`
# container can yield no paragraphs when that container is not the article
# body, so gather <p> tags from every such container and drop empty ones.
paragraphs = soup.select('div.field-items p')
paragraph_texts = (p.get_text(strip=True) for p in paragraphs)
content = '\n'.join(text for text in paragraph_texts if text)

# Print the result
print("文章標題：", title)
print("\n文章內容：")
print(content)


文章標題： 目標未來3～5年全集團100套系統上雲，國泰金控實現大規模上雲的關鍵策略

文章內容：



In [5]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import os
import time
import logging

class iThomeNewsScraper:
    """Scraper for iThome news article pages.

    Downloads an article with ``requests``, extracts the title, author,
    publish time, body text and image URLs using BeautifulSoup CSS
    selectors, and can write the result to a JSON file under ``output/``.
    """

    # Candidate CSS selectors for each field, tried in the listed order.
    _TITLE_SELECTORS = ('h1.page-header', 'h1.title')
    _AUTHOR_SELECTORS = ('div.author a', 'div.created a', 'span.author')
    _PUBLISH_TIME_SELECTORS = ('div.created', 'div.meta-created')
    _CONTENT_SELECTORS = (
        'div.field-items div.field-item p',
        'div.article-content p',
        'div.content p',
    )
    _IMAGE_SELECTORS = (
        'div.field-items div.field-item img',
        'div.article-content img',
        'div.content img',
    )

    def __init__(self, debug=False):
        """Create a scraper.

        Args:
            debug: When true, log at DEBUG level instead of INFO.
        """
        # Request headers that mimic a regular desktop browser.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
        }

        self.setup_logging(debug)

    def setup_logging(self, debug):
        """Configure root logging (file + console) and bind ``self.logger``.

        Args:
            debug: When true, use DEBUG level; otherwise INFO.

        Note:
            ``logging.basicConfig`` does nothing if the root logger already
            has handlers, so only the first configuration in a process
            (or notebook kernel) takes effect.
        """
        level = logging.DEBUG if debug else logging.INFO
        logging.basicConfig(
            level=level,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraper.log', encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_article_content(self, url):
        """Scrape the news article at ``url``.

        Args:
            url (str): URL of the news article.

        Returns:
            dict | None: A dictionary with the keys ``title``, ``author``,
            ``publish_time``, ``content``, ``url``, ``scrape_time`` and
            ``images``; ``None`` when the request fails, the title cannot
            be found, or any other error occurs (the error is logged).
        """
        try:
            self.logger.info("開始爬取文章: %s", url)
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')

            # A page without a recognizable title is treated as unparsable.
            title = self._safe_get_text(soup, self._TITLE_SELECTORS)
            if not title:
                raise ValueError("無法找到文章標題")

            author = self._safe_get_text(soup, self._AUTHOR_SELECTORS)
            publish_time = self._safe_get_text(soup, self._PUBLISH_TIME_SELECTORS)

            content = self._collect_paragraphs(soup)
            if not content:
                self.logger.warning("無法找到文章內容，嘗試使用備用方法")
                # Fallback: take every non-empty paragraph on the page.
                content = self._non_empty_texts(soup.find_all('p'))

            article_data = {
                'title': title,
                'author': author,
                'publish_time': publish_time,
                'content': '\n'.join(content),
                'url': url,
                'scrape_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'images': self._collect_image_urls(soup),
            }

            self.logger.info("成功爬取文章: %s", title)
            return article_data

        except requests.exceptions.RequestException as e:
            self.logger.error("請求錯誤: %s", e)
            return None
        except ValueError as e:
            self.logger.error("解析錯誤: %s", e)
            return None
        except Exception as e:
            # Keep the best-effort contract, but record the traceback so
            # unexpected failures can actually be diagnosed.
            self.logger.exception("未預期的錯誤: %s", e)
            return None

    @staticmethod
    def _non_empty_texts(elements):
        """Return the stripped text of each element, skipping blank ones."""
        stripped = (element.text.strip() for element in elements)
        return [text for text in stripped if text]

    def _collect_paragraphs(self, soup):
        """Return paragraph texts from the first selector that yields any.

        A selector whose matches are all blank is skipped, so a later, more
        specific candidate still gets a chance before the generic fallback.
        """
        for selector in self._CONTENT_SELECTORS:
            texts = self._non_empty_texts(soup.select(selector))
            if texts:
                return texts
        return []

    def _collect_image_urls(self, soup):
        """Return the unique, non-empty image ``src`` values in page order.

        The candidate selectors overlap (an image can match more than one of
        them), so URLs are de-duplicated while preserving first-seen order.
        """
        unique_sources = {}
        for selector in self._IMAGE_SELECTORS:
            for img in soup.select(selector):
                src = img.get('src')
                if src:
                    unique_sources.setdefault(src, None)
        return list(unique_sources)

    def _safe_get_text(self, soup, selectors):
        """Return the first non-blank text found among ``selectors``.

        Args:
            soup: Parsed document exposing ``select_one``.
            selectors: CSS selectors to try, in priority order.

        Returns:
            str: The stripped text of the first matching element that has
            non-whitespace text, or ``""`` when none does.
        """
        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            # Strip before testing so a whitespace-only match does not
            # shadow a later selector that has real text.
            text = (element.text or "").strip()
            if text:
                return text
        return ""

    def save_to_json(self, data, filename):
        """Write ``data`` as UTF-8 JSON to ``output/<filename>``.

        The ``output`` directory is created if needed. Errors are logged
        rather than raised.

        Args:
            data: JSON-serializable object to store.
            filename: File name (relative to ``output/``).
        """
        try:
            os.makedirs('output', exist_ok=True)
            output_path = os.path.join('output', filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            self.logger.info("資料已儲存至 %s", output_path)
        except Exception as e:
            self.logger.exception("儲存檔案時發生錯誤: %s", e)

def main():
    """Scrape a fixed list of iThome articles and save each one as JSON.

    Each successfully scraped article is written to ``output/`` under a
    file name derived from the first 30 characters of its title. A
    two-second pause is inserted between requests to avoid hitting the
    site too frequently.
    """
    # Enable debug-level logging.
    scraper = iThomeNewsScraper(debug=True)

    # Articles to scrape.
    urls = [
        "https://www.ithome.com.tw/news/152373",
        "https://www.ithome.com.tw/news/159391"
    ]

    # Path separators and characters that are reserved in file names on
    # common platforms; a title containing any of them would otherwise
    # produce an invalid or misplaced output path.
    unsafe_chars = str.maketrans({char: '_' for char in '/\\:*?"<>|'})

    for index, url in enumerate(urls):
        # Throttle between requests only; no need to wait after the last one.
        if index:
            time.sleep(2)
        try:
            article_data = scraper.get_article_content(url)
            if article_data:
                safe_title = article_data['title'][:30].translate(unsafe_chars)
                filename = f"{safe_title}.json"
                scraper.save_to_json(article_data, filename)
        except Exception as e:
            scraper.logger.exception("爬取文章 %s 時發生錯誤: %s", url, e)

# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()