In [1]:
!pip install -U crawl4ai

Collecting crawl4ai
  Downloading Crawl4AI-0.4.248-py3-none-any.whl.metadata (29 kB)
Collecting aiosqlite~=0.20 (from crawl4ai)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting litellm>=1.53.1 (from crawl4ai)
  Downloading litellm-1.60.2-py3-none-any.whl.metadata (36 kB)
Collecting pillow~=10.4 (from crawl4ai)
  Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting playwright>=1.49.0 (from crawl4ai)
  Downloading playwright-1.50.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting python-dotenv~=1.0 (from crawl4ai)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting tf-playwright-stealth>=1.1.0 (from crawl4ai)
  Downloading tf_playwright_stealth-1.1.1-py3-none-any.whl.metadata (2.6 kB)
Collecting xxhash~=3.4 (from crawl4ai)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting rank-bm25~=0.2 (from crawl4ai)
  Downloading rank

In [2]:
# Download Playwright's Chromium build together with the OS-level packages it
# needs (--with-deps). playwright itself is pulled in by the crawl4ai install.
!python -m playwright install --with-deps chromium

Installing dependencies...
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [6]:
import asyncio
import nest_asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Patch asyncio so run_until_complete() can be called from inside the notebook's
# already-running event loop (used by the entry-point block at the end of this cell).
nest_asyncio.apply()

# Path of the JSON file that crawl_website() writes the extracted markdown to.
temp_file = "extracted_data.json"

async def crawl_website(url, output_path=None):
    """Crawl ``url`` and save its pruned ("fit") markdown as a JSON document.

    Args:
        url: Address of the page to fetch.
        output_path: Where to write the JSON document. When ``None`` (the
            default) the module-level ``temp_file`` path is used, which is the
            original behaviour.

    Returns:
        None. On success a JSON object with the single key
        ``"extracted_content"`` is written and a confirmation is printed; on
        failure the crawler's error message is printed and no file is written.
    """
    # Resolved at call time, so a later reassignment of ``temp_file`` is honoured.
    destination = temp_file if output_path is None else output_path

    prune_filter = PruningContentFilter(threshold=0.45, threshold_type="dynamic", min_word_threshold=5)
    md_generator = DefaultMarkdownGenerator(content_filter=prune_filter)

    # BYPASS: request a fresh fetch rather than a cached copy.
    config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler() as crawler:
        # NOTE(review): the recorded run of this cell logs a single FETCH/SCRAPE
        # for the start URL only, so max_pages / follow_links do not appear to
        # trigger a multi-page crawl here - confirm against the arun() API.
        result = await crawler.arun(url=url, config=config, max_pages=30, follow_links=True)

        if not result.success:
            print("Error:", result.error_message)
            return

        with open(destination, "w", encoding="utf-8") as f:
            json.dump({"extracted_content": result.markdown_v2.fit_markdown}, f, indent=4)
        print("Crawl complete! Document saved.")

if __name__ == "__main__":
    # Pasted URLs often carry stray spaces or a trailing newline; strip them so
    # the crawler receives a clean address.
    url = input("Enter the homepage URL: ").strip()
    if not url:
        # Fail early with a clear message instead of starting the crawler with
        # an empty address.
        raise ValueError("No URL entered; please provide the homepage URL to crawl.")
    # The notebook kernel already runs an event loop; driving the coroutine with
    # run_until_complete() relies on the nest_asyncio.apply() call earlier in this cell.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawl_website(url))


Enter the homepage URL: https://developer.algorand.org/docs/
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://developer.algorand.org/docs/... | Status: True | Time: 2.98s
[SCRAPE].. ◆ Processed https://developer.algorand.org/docs/... | Time: 351ms
[COMPLETE] ● https://developer.algorand.org/docs/... | Status: True | Total: 3.38s
Crawl complete! Document saved.


In [7]:
import asyncio
import nest_asyncio
from crawl4ai import AsyncWebCrawler

# NOTE(review): this cell drives its coroutine with a top-level `await` rather
# than run_until_complete(), so this patch may not be required here - confirm.
nest_asyncio.apply()

async def main():
    """Fetch the Algorand developer docs landing page and print its markdown."""
    target = "https://developer.algorand.org/docs/"
    # NOTE(review): the recorded output of this cell still contains site
    # navigation links ("Skip to content", "Developer Portal"), so this selector
    # argument may not be restricting the extraction - confirm the expected
    # keyword name and type against the arun() API.
    selectors = [".content-area"]

    async with AsyncWebCrawler() as crawler:
        page = await crawler.arun(url=target, css_selectors=selectors)
        print(page.markdown)

# Top-level await: works in a notebook cell (the kernel provides the running
# event loop) but is not valid syntax in a plain .py script.
await main()


[INIT].... → Crawl4AI 0.4.248
[COMPLETE] ● Database backup created at: /root/.crawl4ai/crawl4ai.db.backup_20250205_082621
[INIT].... → Starting database migration...
[COMPLETE] ● Migration completed. 0 records processed.
[FETCH]... ↓ https://developer.algorand.org/docs/... | Status: True | Time: 2.96s
[SCRAPE].. ◆ Processed https://developer.algorand.org/docs/... | Time: 418ms
[COMPLETE] ● https://developer.algorand.org/docs/... | Status: True | Total: 4.56s
[ Skip to content ](https://developer.algorand.org/docs/<#top-level-sections>)
[ Developer Portal ](https://developer.algorand.org/docs/<https:/developer.algorand.org/>)
Showing results in "Docs"
  * Build 
    * [ Docs Comprehensive Algorand documentation ](https://developer.algorand.org/docs/<https:/developer.algorand.org/docs/>)
    * [ Dev Environment Install AlgoKit and start building ](https://developer.algorand.org/docs/<https:/developer.algorand.org/docs/get-started/algokit/>)
    * [ SDKs SDKs for Python, Javascript, Go, a

In [None]:
''';