# In [2]:  (Jupyter notebook cell marker)
import datetime
from selenium.webdriver.edge.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

def find_TVBS_news(n_seconds_ago, scroll_pause=10):
    """Scrape recent headlines and article bodies from the TVBS realtime page.

    Opens an Edge browser, reads the realtime news list, keeps every item
    whose timestamp is newer than the cutoff, scrolls for more items until a
    pass yields nothing new, then visits each kept article to fetch its body.

    Args:
        n_seconds_ago: Cutoff for "recent" news. Either a ``datetime`` (items
            strictly newer than it are kept, as before) or a plain number of
            seconds, which is converted to ``now - seconds``.
        scroll_pause: Seconds to wait after each scroll so the page can load
            more items. Defaults to the original fixed wait of 10 seconds.

    Returns:
        dict mapping a 1-based running index to
        ``[title, link, absolute_time, category, content]``.
    """
    # Imported locally so the block brings its own dependency into scope.
    from selenium.common.exceptions import WebDriverException

    # Accept a number of seconds in addition to the original datetime cutoff.
    if isinstance(n_seconds_ago, (int, float)):
        cutoff = datetime.datetime.now() - datetime.timedelta(seconds=n_seconds_ago)
    else:
        cutoff = n_seconds_ago

    options = Options()
    driver = webdriver.Edge(options=options)

    # try/finally guarantees the browser is closed even when scraping fails.
    try:
        # News site URL.
        driver.get("https://news.tvbs.com.tw/realtime")

        # Strip the inline style from every <li> that carries one.
        # NOTE(review): presumably this un-hides list items - confirm.
        for li_element in driver.find_elements(By.CSS_SELECTOR, "li[style]"):
            driver.execute_script("arguments[0].removeAttribute('style')", li_element)

        news = {}
        next_index = 1
        # Titles already collected, so re-scanned items are not added twice.
        seen_titles = set()

        # Keep scrolling until a pass finds no new item inside the time range.
        while True:
            news_items = driver.find_elements(
                By.CSS_SELECTOR, "div.news_list > div.list > ul > li"
            )
            if not news_items:
                break

            found_new_news = False

            for item in news_items:
                try:
                    # find_element raises when nothing matches, so no
                    # None/empty guards are needed on the results.
                    news_title = item.find_element(By.CSS_SELECTOR, "h2.txt").text

                    # Skip titles that were already processed.
                    if news_title in seen_titles:
                        continue

                    news_link = item.find_element(
                        By.CSS_SELECTOR, "a"
                    ).get_attribute("href")

                    relative_time_str = item.find_element(
                        By.CSS_SELECTOR, "div.time"
                    ).text
                    absolute_time = convert_relative_time_to_absolute(relative_time_str)

                    news_category = item.find_element(
                        By.CSS_SELECTOR, "a > div.type"
                    ).text
                except (WebDriverException, ValueError):
                    # Best effort: skip items missing a field or carrying an
                    # unparseable time, without swallowing KeyboardInterrupt.
                    continue

                # Stop scanning this pass at the first item outside the range.
                # NOTE(review): this relies on the list being newest-first.
                if absolute_time <= cutoff:
                    break

                seen_titles.add(news_title)
                found_new_news = True
                news[next_index] = [
                    news_title,
                    news_link,
                    absolute_time,
                    news_category,
                ]
                next_index += 1

            # Nothing new in range during this pass: stop scrolling.
            if not found_new_news:
                break

            # Scroll to the bottom and give the page time to respond.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

        # Visit each article page and append its body text to the entry.
        for entry in news.values():
            link = entry[1]
            if not link:
                # No href was available; keep the entry shape consistent.
                entry.append("")
                continue
            driver.get(link)
            content_elements = driver.find_elements(
                By.CSS_SELECTOR, "div#news_detail_div"
            )
            entry.append("\n".join(element.text for element in content_elements))
    finally:
        # Close the browser.
        driver.quit()

    return news

def convert_relative_time_to_absolute(relative_time_str, now=None):
    """Convert a relative time label such as "5分鐘前" into a datetime.

    The units are checked in a fixed order: seconds (秒), minutes (分鐘),
    hours (小時), days (天). For the first unit marker found in the text, the
    number directly before it is subtracted from ``now``. Text around the
    amount (e.g. "約 5 分鐘前") is tolerated.

    Args:
        relative_time_str: Label text to parse.
        now: Reference time to subtract from. Defaults to the current local
            time (naive ``datetime.datetime.now()``).

    Returns:
        ``now`` minus the parsed offset, or ``now`` itself when the text is
        empty or contains no known unit marker.

    Raises:
        ValueError: A unit marker is present but no number precedes it.
    """
    import re

    if now is None:
        now = datetime.datetime.now()

    # Empty or missing text carries no offset.
    if not relative_time_str:
        return now

    # (marker in the label, timedelta keyword); order sets match precedence.
    unit_fields = (
        ("秒", "seconds"),
        ("分鐘", "minutes"),
        ("小時", "hours"),
        ("天", "days"),
    )

    for marker, field in unit_fields:
        if marker not in relative_time_str:
            continue
        # Take the digits right before the marker instead of stripping an
        # exact suffix, so extra surrounding text does not break parsing.
        match = re.search(r"(\d+)\s*" + marker, relative_time_str)
        if match is None:
            raise ValueError(
                f"no numeric amount before {marker!r} in {relative_time_str!r}"
            )
        return now - datetime.timedelta(**{field: int(match.group(1))})

    # Unknown format: fall back to the reference time.
    return now
