# Agent that retrieves papers from the Internet based on certain keywords.

In [None]:
import asyncio, random, json, base64
from pathlib import Path
from dotenv import load_dotenv
from pydantic import BaseModel
from browser_use import Agent, BrowserSession, BrowserProfile, Controller, ActionResult
from browser_use.browser.types import Page
from browser_use.llm import ChatOpenAI

In [None]:
# from browser_use.file_system import InMemoryFileSystem
# Load variables from a local .env file into the environment before any client is built
# (presumably this is where the OpenAI API key comes from — verify against your setup).
load_dotenv()

doi = '10.1016/j.ijggc.2012.07.024' # This is a sample DOI showing how the system works.

# Directory that receives every saved/downloaded PDF. The path is relative, so it is
# resolved against the current working directory; expanduser() only affects paths that
# start with "~" and is therefore a no-op here.
DOWNLOADS_DIR = Path("./downloads/").expanduser()
DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True)

# Chat model that drives the agent.
llm = ChatOpenAI(model="gpt-4.1-2025-04-14")

# Browser behaviour settings: pacing/wait times, download directory, and the
# user agent / locale / timezone the browser reports.
profile = BrowserProfile(
    stealth=True,
    wait_between_actions=0.5,
    minimum_wait_page_load_time=0.25,
    wait_for_network_idle_page_load_time=2.5,
    maximum_wait_page_load_time=8,
    downloads_path=str(DOWNLOADS_DIR),
    user_agent=("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"),
    headless=False,          # keep headful; PDFs are produced via CDP, not page.pdf()
    slow_mo=120,
    timezone_id="Asia/Kolkata",
    locale="en-US",
)

# Browser session shared by the agent. NOTE(review): executable_path is a macOS-specific
# Brave install location and must be adjusted on other machines; headless=False repeats
# the value already set on `profile`.
browser_session = BrowserSession(
    executable_path="/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
    user_data_dir=str(Path("~/.config/browseruse/profiles/default").expanduser()),
    accept_downloads=True,
    headless=False,
    browser_profile=profile,
    # optional: ["--kiosk-printing","--disable-print-preview"] if you ever do window.print()
)

# ---------- Output State ----------

class ResearchPaper(BaseModel):
    # Structured result of one run; handed to Controller as its output_model.
    # Every field is a required string.
    authors: str
    publish_date: str
    title: str
    doi: str
    paper_platform: str
    pdf_file_path: str

# ---------- Controller ----------

# Registry that the custom actions in this file attach to via @controller.action;
# ResearchPaper is passed as the output model for the agent's final result.
controller = Controller(output_model=ResearchPaper)

# ---------- Utils ----------
async def _close_modal_if_present(page: Page, selector="button[aria-label='close window']"):
    """Click the first element matching *selector*, if any, then pause 300 ms.

    Does nothing when no element on the page matches the selector.
    """
    close_buttons = page.locator(selector)
    if not await close_buttons.count():
        return
    await close_buttons.first.click()
    # Short pause after the click before the caller continues.
    await page.wait_for_timeout(300)

async def _ensure_full_render(page: Page, pages_to_scroll=35):
    """Scroll the page down in steps so lazily loaded content gets a chance to load.

    Performs *pages_to_scroll* mouse-wheel scrolls of 1200 px each, waiting a
    random 150-350 ms after every scroll.
    """
    scroll_step_px = 1200
    for _step in range(pages_to_scroll):
        await page.mouse.wheel(0, scroll_step_px)
        pause_ms = random.randint(150, 350)
        await page.wait_for_timeout(pause_ms)

# ---------- Custom Actions ----------
class SavePagePDFParams(BaseModel):
    # Parameters of the "save page as pdf" action.
    # filename: output file name inside DOWNLOADS_DIR; None falls back to
    #   "article_page_capture.pdf".
    filename: str | None = None
    # print_background: forwarded as "printBackground" to CDP Page.printToPDF.
    print_background: bool = True
    # margin_mm: page margin in millimetres, applied to all four sides
    #   (converted to inches before being sent to CDP).
    margin_mm: int = 10

def _safe_pdf_filename(name: str | None, default: str = "article_page_capture.pdf") -> str:
    """Turn an arbitrary (often LLM-supplied) name into a safe ``*.pdf`` file name.

    Path separators, characters that are illegal on common filesystems and
    control characters are replaced with ``_`` so the result is always a single
    path component. The stem is capped at 150 characters and a ``.pdf`` suffix
    is guaranteed. Returns *default* when nothing usable remains.
    """
    # Function-scope import: `re` is not among the file's top-level imports.
    import re

    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", (name or "").strip())
    stem = cleaned[:-4] if cleaned.lower().endswith(".pdf") else cleaned
    stem = stem.strip(" .")[:150].rstrip(" .")
    return f"{stem}.pdf" if stem else default


@controller.action("save page as pdf", param_model=SavePagePDFParams)
async def save_page_as_pdf(params: SavePagePDFParams, page: Page) -> ActionResult:
    """Print the current page to a PDF file in DOWNLOADS_DIR via CDP ``Page.printToPDF``.

    Before printing, a modal close button (if present) is clicked and the page
    is scrolled to trigger lazy-loaded content.

    Args:
        params: file name, background printing flag and margin (millimetres).
        page: the page to capture.

    Returns:
        ActionResult whose ``extracted_content`` is the JSON object
        ``{"pdf_file_path": "<path of the written file>"}``.
    """
    await _close_modal_if_present(page)
    await _ensure_full_render(page)   # load lazy stuff

    # CDP expects margins in inches; a negative margin is clamped to zero.
    margins_in = max(params.margin_mm, 0) / 25.4

    # Use CDP printToPDF, and always detach the session so it does not leak.
    cdp = await page.context.new_cdp_session(page)
    try:
        pdf_resp = await cdp.send("Page.printToPDF", {
            "printBackground": params.print_background,
            "displayHeaderFooter": False,
            "marginTop": margins_in,
            "marginBottom": margins_in,
            "marginLeft": margins_in,
            "marginRight": margins_in,
            "scale": 1.0
        })
    finally:
        await cdp.detach()

    data = base64.b64decode(pdf_resp["data"])
    # The file name usually comes from the paper title, which may contain "/" or ":";
    # sanitise it so the write cannot fail on, or escape from, DOWNLOADS_DIR.
    path = DOWNLOADS_DIR / _safe_pdf_filename(params.filename)
    path.write_bytes(data)
    return ActionResult(extracted_content=json.dumps({"pdf_file_path": str(path)}))


class DownloadParams(BaseModel):
    # Parameters of the "download pdf" action. `selector` is tried first when set;
    # `url` is used for the fetch and anchor-click fallbacks.
    selector: str | None = None   # CSS to click (preferred)
    url: str | None = None        # direct PDF URL (fallback)
    filename: str | None = None   # override final filename


@controller.action("download pdf", param_model=DownloadParams)
async def download_pdf(params: DownloadParams, page: Page) -> ActionResult:
    """
    Download a PDF into DOWNLOADS_DIR, trying up to three strategies in order.

    1. Click ``params.selector`` and capture the resulting browser download
       (best for sites with blob URLs or auth).
    2. Fetch ``params.url`` through the browser context's request API.
    3. Inject an ``<a download>`` element for ``params.url``, click it and
       capture the resulting download.

    Returns an ActionResult whose ``extracted_content`` is the JSON object
    {"pdf_file_path": "<path under DOWNLOADS_DIR>"} on success (plus a
    "warning" key when a fetched body does not look like a PDF), or whose
    ``error`` lists why each attempted strategy failed. Unexpected exceptions
    are converted into an error result rather than raised.
    """
    # Function-scope import: urllib.parse is not among the file's top-level imports.
    from urllib.parse import unquote, urlparse

    # One entry per failed strategy, surfaced in the final error message.
    failures: list[str] = []

    def _target_for(name: str | None, fallback: str) -> Path:
        # Keep only the last path component so a crafted name cannot escape DOWNLOADS_DIR.
        leaf = Path((name or "").replace("\\", "/")).name.strip()
        if leaf in ("", ".", ".."):
            leaf = fallback
        return DOWNLOADS_DIR / leaf

    # --- Helper to persist playwright Download object ---
    async def _save_download(d, override_name: str | None) -> Path:
        target = _target_for(override_name or d.suggested_filename, "download.pdf")
        await d.save_as(target)
        return target

    def _success(path: Path, warning: str | None = None) -> ActionResult:
        payload = {"pdf_file_path": str(path)}
        if warning:
            payload["warning"] = warning
        return ActionResult(extracted_content=json.dumps(payload))

    try:
        await _close_modal_if_present(page)

        # --- 1) Selector click path (uses Playwright's download hook) ---
        if params.selector:
            try:
                # Ensure element exists & is visible
                locator = page.locator(params.selector).first
                await locator.wait_for(state="visible", timeout=5_000)

                async with page.expect_download(timeout=15_000) as dl_info:
                    await locator.click()
                download = await dl_info.value
                return _success(await _save_download(download, params.filename))
            except Exception as exc:
                # Best effort: remember why, then fall through to the next strategy.
                failures.append(f"selector click: {exc!r}")

        # --- 2) If we have a direct URL, try a programmatic fetch ---
        if params.url:
            # Path part only: query strings and percent-escapes must not end up in file names.
            url_path = unquote(urlparse(params.url).path)
            try:
                # Prefer Playwright's built-in request context (inherits cookies/session)
                resp = await page.context.request.get(params.url)
                if not resp.ok:
                    raise RuntimeError(f"GET {params.url} -> {resp.status}")
                body = await resp.body()

                # Crude PDF check: the file is still saved, but the caller is warned.
                ctype = resp.headers.get("content-type", "")
                looks_like_pdf = (
                    "pdf" in ctype.lower()
                    or url_path.lower().endswith(".pdf")
                    or body.startswith(b"%PDF")
                )
                warning = None if looks_like_pdf else (
                    f"response may not be a PDF (content-type: {ctype or 'unknown'})"
                )

                target = _target_for(params.filename or url_path, "file.pdf")
                target.write_bytes(body)
                return _success(target, warning)
            except Exception as exc:
                failures.append(f"url fetch: {exc!r}")

            # --- 3) Last resort: force a download via DOM (anchor.click) + expect_download ---
            try:
                async with page.expect_download(timeout=15_000) as dl_info:
                    await page.evaluate(
                        "(url)=>{const a=document.createElement('a');a.href=url;a.download='';"
                        "document.body.appendChild(a);a.click();a.remove();}", params.url
                    )
                download = await dl_info.value
                return _success(await _save_download(download, params.filename))
            except Exception as exc:
                failures.append(f"anchor click: {exc!r}")

        if not (params.selector or params.url):
            failures.append("neither selector nor url was provided")
        detail = f" Details: {'; '.join(failures)}" if failures else ""
        return ActionResult(error="Could not download PDF via selector or url." + detail)

    except Exception as e:
        return ActionResult(error=f"download_pdf failed: {e!r}")

class ResetTabsParams(BaseModel):
    # Parameters of the "reset to single tab" action.
    # url: address opened in the one tab that is kept.
    url: str = "about:blank"

@controller.action("reset to single tab", param_model=ResetTabsParams)
async def reset_tabs(params: ResetTabsParams, page: Page) -> ActionResult:
    """Open a fresh tab at ``params.url`` and close every other tab in the context.

    Errors while closing an individual tab are ignored (best effort).

    Returns:
        ActionResult whose ``extracted_content`` is the JSON object
        ``{"active_url": "<url of the new tab>"}``.
    """
    context = page.context
    fresh_tab = await context.new_page()
    await fresh_tab.goto(params.url)
    await fresh_tab.bring_to_front()

    # Iterate over a snapshot: closing tabs changes context.pages.
    for tab in list(context.pages):
        if tab is fresh_tab or tab.is_closed():
            continue
        try:
            await tab.close()
        except Exception:
            pass

    payload = {"active_url": fresh_tab.url}
    return ActionResult(extracted_content=json.dumps(payload))

from typing import Literal, List, Optional
class MuteUIParams(BaseModel):
    # Parameters of the "mute noisy elements" action.
    # patterns: phrases to kill
    patterns: list[str] = ["download pdf", "view pdf"]
    # mode: CSS hide or DOM remove
    mode: Literal["hide", "remove"] = "hide"
    # min_repeat: auto-detect spam if repeated ≥ N times
    min_repeat: int = 3
    # tag_filter: which tags to scan
    tag_filter: list[str] | None = ["a", "button", "div", "span"]

@controller.action("mute noisy elements", param_model=MuteUIParams)
async def mute_noisy(params: MuteUIParams, page: Page) -> ActionResult:
    """Hide or remove repetitive / unwanted UI elements on the current page.

    The scan covers elements whose tag is in ``params.tag_filter``. An element
    is muted when its normalised text (trimmed, lower-cased, whitespace
    collapsed, at least 3 characters) either

    * occurs at least ``params.min_repeat`` times among the scanned elements, or
    * matches one of ``params.patterns`` (case-insensitive regular expressions)
      and no scanned descendant matches as well, so only the innermost match is
      muted instead of every ancestor container that happens to contain the phrase.

    ``params.mode`` selects between ``display: none !important`` ("hide") and
    removing the node from the DOM ("remove").

    Returns:
        ActionResult whose ``extracted_content`` is the JSON object
        ``{"muted_samples": [...]}`` with up to 50 of the muted texts.
    """
    # Raw string: the JavaScript below contains regex backslashes.
    js_fn = r"""
            (cfg) => {
            const defaultTags = ["a","button","div","span"];
            // An empty list would yield querySelectorAll(''), which throws.
            const tags = (cfg.tag_filter && cfg.tag_filter.length) ? cfg.tag_filter : defaultTags;
            const selector = tags.join(',');
            const nodes = [...document.querySelectorAll(selector)];
            const norm = t => t.trim().toLowerCase().replace(/\s+/g,' ');

            // Snapshot the texts first so later DOM removals cannot change them.
            const texts = nodes.map(n => norm(n.textContent || ""));
            const counts = new Map();
            texts.forEach(t => {
                if (t.length < 3) return;
                counts.set(t, (counts.get(t) || 0) + 1);
            });

            // A pattern that is not a valid regex is matched literally instead of throwing;
            // empty patterns are dropped because they would match every element.
            const escapeRe = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            const toRegex = p => {
                try { return new RegExp(p, 'i'); }
                catch (e) { return new RegExp(escapeRe(p), 'i'); }
            };
            const patternRegexes = (cfg.patterns || []).filter(p => p).map(toRegex);
            const isPatternHit = t => patternRegexes.some(r => r.test(t));
            const hasMatchingDescendant = n =>
                [...n.querySelectorAll(selector)].some(d => isPatternHit(norm(d.textContent || "")));

            const removed = [];
            nodes.forEach((n, i) => {
                const t = texts[i];
                if (t.length < 3) return;
                const repeated = (counts.get(t) || 0) >= cfg.min_repeat;
                // Only the innermost pattern match is muted; otherwise a page-level
                // wrapper containing the phrase would be hidden together with the article.
                const phraseHit = !repeated && isPatternHit(t) && !hasMatchingDescendant(n);
                if (!repeated && !phraseHit) return;
                if (cfg.mode === "remove") n.remove();
                else n.style.setProperty("display","none","important");
                if (removed.length < 50) removed.push(t);
            });
            return removed;
            }
            """.strip()
    # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
    removed = await page.evaluate(js_fn, params.model_dump())
    return ActionResult(extracted_content=json.dumps({"muted_samples": removed}))


# ---------- Hook ----------
async def human_pause(agent: Agent):
    """Sleep for a random 0.4-1.6 seconds; used as the agent's on_step_start hook.

    The *agent* argument is accepted to satisfy the hook signature and is not used.
    """
    delay_s = random.uniform(0.4, 1.6)
    await asyncio.sleep(delay_s)

# ---------- Task ----------

# Task text handed to the agent. Doubled braces are literal braces in the f-string;
# {doi} is interpolated from the module-level `doi`.
# Step 1 uses the https://doi.org/ resolver because a bare DOI is not a navigable URL.
# The final-output keys mirror the ResearchPaper model that the controller enforces.
prompt = f"""
You have these custom actions available:
- "reset to single tab" -> params: {{ "url": "about:blank" }}
- "save page as pdf"   -> params: {{ "filename": str|None, "print_background": bool, "margin_mm": int }}
- "download pdf"       -> params: {{ "selector": str|None, "url": str|None, "filename": str|None }}

Follow the steps EXACTLY:

0. **Immediately call** "reset to single tab" with {{"url":"about:blank"}}. Do this before any browsing.
1. Navigate to https://doi.org/{doi} and wait until it redirects to the publisher's article page.
2. Extract the following from the HTML page (not the PDF):
   - "title": string
   - "authors": string (all author names, comma-separated)
   - "publish_date": string
   - "paper_platform": string (the publisher / hosting platform of the article page)
3. **Do NOT click “View PDF”.** Stay on the HTML article page.
4. Scroll the entire article to trigger all lazy-loaded content (images, references, etc.).
5. Call "save page as pdf" with:
   {{
     "filename": "`use the title of the paper you extracted above`.pdf",
     "print_background": true,
     "margin_mm": 10
   }}
   This should capture the fully rendered HTML page.
6. If (and only if) there is a way to get a *real* PDF file (button/link) without violating step 3:
   - Prefer selector click: call "download pdf" with {{"selector": "<css selector>", "filename": "download.pdf"}}
   - Otherwise if you see a direct .pdf URL, call "download pdf" with {{"url": "<pdf_url>", "filename": "download.pdf"}}
   Skip this if neither is clearly available.
7. Final output: return a single JSON object with keys:
   {{
     "title": "...",
     "authors": "...",
     "publish_date": "...",
     "doi": "{doi}",
     "paper_platform": "...",
     "pdf_file_path": "<from download pdf if step 6 succeeded, otherwise from save page as pdf>"
   }}

Rules:
- Always call "reset to single tab" first.
- Never click "View PDF".
- Be concise in tool outputs; no extraneous text, just JSON values.
"""


async def main():
    """Build the agent from the module-level configuration, run it and print the result.

    The run is limited to 40 steps and calls ``human_pause`` at the start of each step.
    """
    runner = Agent(
        controller=controller,
        browser_session=browser_session,
        llm=llm,
        task=prompt,
    )
    outcome = await runner.run(max_steps=40, on_step_start=human_pause)
    print(outcome)

# Entry point. NOTE(review): a top-level `await` only works where the host already runs
# an event loop (e.g. a Jupyter/IPython cell, which the "In [None]:" markers suggest);
# in a plain .py script this line is a SyntaxError — confirm the intended way to run this.
if __name__ == "__main__":
    await (main())
