In [6]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
import json

# 允许嵌套 asyncio 事件循环（适用于 Jupyter Notebook）
nest_asyncio.apply()

# 期刊主页 URL 对应表
JOURNAL_URLS = {
    "PANS": "https://www.pnas.org",
    "Nature": "https://www.nature.com/nature/articles",
    "Science": "https://www.science.org/journal/science",
    "PRL": "https://journals.aps.org/prl/recent",
    "JCP": "https://pubs.aip.org/aip/jcp",
    "JFM": "https://www.cambridge.org/core/journals/journal-of-fluid-mechanics/latest-issue",
    "PRF": "https://journals.aps.org/prfluids/recent",
}

async def fetch_articles(journal, mode="new", keyword=None, days=7):
    """爬取指定期刊的文章信息"""
    if journal not in JOURNAL_URLS:
        print(f"❌ 不支持的期刊: {journal}")
        return

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=100)
        context = await browser.new_context()
        page = await context.new_page()

        url = JOURNAL_URLS[journal]
        print(f"🌍 访问期刊: {journal} ({url})")
        await page.goto(url)
        await page.wait_for_load_state("networkidle")

        articles = []
        
        # **模式 A: 新文推荐**
        if mode == "new":
            selectors = ["article", "ol li.ResultItem", "div.js-article"]
            for selector in selectors:
                try:
                    await page.wait_for_selector(selector, timeout=5000)
                    found_elements = await page.query_selector_all(selector)
                    if found_elements:
                        print(f"✅ 选择器 `{selector}` 找到 {len(found_elements)} 篇文章")
                        for el in found_elements[:10]:  # 限制 10 篇文章
                            title = await el.inner_text() or "未知标题"
                            link = await el.get_attribute("href") or "未知链接"
                            articles.append({"title": title, "link": link})
                        break
                except asyncio.TimeoutError:
                    print(f"❌ 选择器 `{selector}` 超时")

        # **模式 B: 关键词搜索**
        elif mode == "search" and keyword:
            search_url = f"{url}/search?q={keyword.replace(' ', '+')}"
            print(f"🔍 关键词搜索: {keyword} ({search_url})")
            await page.goto(search_url)
            await page.wait_for_load_state("networkidle")

            search_selectors = ["article", "ol li.ResultItem", "div.js-article"]
            for selector in search_selectors:
                try:
                    await page.wait_for_selector(selector, timeout=5000)
                    found_elements = await page.query_selector_all(selector)
                    if found_elements:
                        print(f"✅ 搜索 `{keyword}` 找到 {len(found_elements)} 篇文章")
                        for el in found_elements[:10]:  # 限制 10 篇文章
                            title = await el.inner_text() or "未知标题"
                            link = await el.get_attribute("href") or "未知链接"
                            articles.append({"title": title, "link": link})
                        break
                except asyncio.TimeoutError:
                    print(f"❌ 选择器 `{selector}` 超时")

        # **模式 C: 高被引/编辑推荐**
        elif mode == "recommended":
            print(f"📈 获取 {journal} 高被引/推荐文章（功能待完善）")
            # 这里可以添加高被引文章的爬取逻辑

        # **输出结果**
        if articles:
            print(f"📄 获取到 {len(articles)} 篇文章:")
            for i, article in enumerate(articles, 1):
                print(f"{i}. {article['title']} ({article['link']})")
            
            # 保存到 JSON 文件
            with open(f"{journal}_articles.json", "w", encoding="utf-8") as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)
            print(f"✅ 文章已保存: {journal}_articles.json")
        
        else:
            print("⚠️ 未找到相关文章，请检查选择的模式和关键词")

        await browser.close()

# **使用示例**
journal_name = "JCP"   # 选择期刊
mode = "new"           # "new"（新文推荐） | "search"（关键词搜索） | "recommended"（高被引）
keyword = "fluid dynamics"  # 关键词搜索时使用

await fetch_articles(journal_name, mode, keyword)






🌍 访问期刊: JCP (https://pubs.aip.org/aip/jcp)


TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://pubs.aip.org/aip/jcp", waiting until "load"
