In [5]:
from playwright.async_api import async_playwright
from markdownify import markdownify
import re




def clean_text(text: str) -> str:
    """Clean crawled markdown text into plain readable text.

    The cleaning steps run in a fixed order, each on the output of the previous one:

    1. Drop image markdown (``![alt](url)``).
    2. Replace links ``[text](url)`` with ``text``; links whose text is empty
       (e.g. what is left of a linked image after step 1) are removed entirely.
    3. Drop bare ``http(s)://`` and ``www.`` URLs.
    4. Drop empty ``()`` / ``[]`` pairs.
    5. Blank out lines made only of whitespace, ``*`` and ``#``, and delete
       lines that still start with ``![]``.
    6. Keep only the first occurrence of each non-blank line (compared after
       stripping); blank and single-character lines are always kept.
    7. Collapse runs of blank lines and runs of spaces.

    Args:
        text: Markdown text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # 1. Remove image markdown like ![](url)
    text = re.sub(r"!\[.*?\]\(.*?\)", "", text)
    # 2. Replace markdown links [text](url) → keep text only.
    #    `*` (not `+`) so that empty-text links such as the `[](href)` left
    #    behind by a linked image are removed instead of degrading into a
    #    stray "(" or "(/path)" in the later steps.
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)
    # 3. Remove bare URLs (http/https/www)
    text = re.sub(r"http[s]?://\S+|www\.\S+", "", text)
    # 4. Remove dangling empty () or []
    text = re.sub(r"\(\s*\)|\[\s*\]", "", text)
    # 5. Remove lines with only special chars (*, #, spaces)
    text = re.sub(r"^[\s*#]+$", "", text, flags=re.MULTILINE)
    #    ...and leftover image stubs. MULTILINE is required so that `^` anchors
    #    at every line, not just at the very start of the text.
    text = re.sub(r"^!\[\].*\n", "", text, flags=re.MULTILINE)
    # 6. Collapse repeated sections (optional deduplication)
    seen = set()
    deduped = []
    for line in text.splitlines():
        line_stripped = line.strip()
        # Blank / one-character lines bypass deduplication so paragraph
        # breaks and short separators survive.
        if (line_stripped and line_stripped not in seen) or len(line_stripped) < 2:
            deduped.append(line)
            seen.add(line_stripped)
    text = "\n".join(deduped)
    # 7. Normalize whitespace
    text = re.sub(r"\n\s*\n+", "\n\n", text)  # collapse multiple blank lines
    text = re.sub(r" {2,}", " ", text)  # collapse multiple spaces
    return text.strip()





async def playwright(cur_url, timeout=30000):
    """Fetch markdown content for a single URL using playwright.

    Launches headless Chromium, navigates to ``cur_url``, captures the page
    HTML, and converts it to cleaned markdown.

    Args:
        cur_url: URL to open.
        timeout: Navigation timeout in milliseconds passed to ``page.goto``.
            Defaults to 30000, the value that used to be hard-coded.

    Returns:
        A ``(md, html)`` tuple where ``html`` is the value returned by
        ``page.content()`` and ``md`` is that HTML passed through
        ``markdownify`` and then ``clean_text``.

    Raises:
        Any exception raised while launching the browser, navigating, or
        converting the page propagates to the caller unchanged.
    """
    print(f"Playwright: Extracting content from {cur_url}")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(cur_url, timeout=timeout)
            html = await page.content()
        finally:
            # Close the browser even when navigation fails or times out;
            # previously close() was skipped on any exception.
            await browser.close()
    # Conversion only needs the captured HTML, so it runs after the browser
    # has been released.
    md = clean_text(markdownify(html))
    return md, html
    
    
# Try the fetcher on one page. The call is the cell's last expression, so the
# returned (markdown, html) tuple is echoed as the cell output.
# NOTE(review): top-level `await` presumably relies on the notebook kernel's
# running event loop; in a plain script this would need asyncio.run().
await playwright('https://www.nov.com/')

Playwright: Extracting content from https://www.nov.com/


('Global Energy Services | Oilfield Equipment & Technologies | NOV',
 '<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="stylesheet" href="https://www.nov.com/_next/static/css/4747d147ae3ba659.css?dpl=dpl_6qcmvJ8dwNc4yYTLYeYKTzzMVi85" data-precedence="next"><link rel="stylesheet" href="https://www.nov.com/_next/static/css/894eb5afa7495dbe.css?dpl=dpl_6qcmvJ8dwNc4yYTLYeYKTzzMVi85" data-precedence="next"><link rel="stylesheet" href="https://www.nov.com/_next/static/css/3126877277990380.css?dpl=dpl_6qcmvJ8dwNc4yYTLYeYKTzzMVi85" data-precedence="next"><link rel="stylesheet" href="https://www.nov.com/_next/static/css/99d5d33e604b24f0.css?dpl=dpl_6qcmvJ8dwNc4yYTLYeYKTzzMVi85" data-precedence="next"><link rel="stylesheet" href="https://www.nov.com/_next/static/css/7a3713cd5db19bb2.css?dpl=dpl_6qcmvJ8dwNc4yYTLYeYKTzzMVi85" data-precedence="next"><link rel="stylesheet" href="https://www.nov.com/_next/static/

In [54]:
import asyncio
import aiofiles
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator

def remove_duplicate_lines_in_memory(text: str) -> str:
    """Return ``text`` with every repeated line dropped.

    Lines are compared exactly (no stripping); the first occurrence of each
    line is kept and the original order is preserved. The result is joined
    with ``"\\n"`` and carries no trailing newline.
    """
    # dict keys are unique and remember insertion order, which yields
    # first-occurrence deduplication in a single pass.
    first_occurrences = dict.fromkeys(text.splitlines())
    return "\n".join(first_occurrences)

async def quick_parallel_example(urls, output_file="output.txt"):
    """Crawl several URLs, merge their markdown, de-duplicate lines, and save the result.

    Each successful result contributes a ``--- Content from <url> ---``
    separator followed by its markdown. The combined text is passed through
    ``remove_duplicate_lines_in_memory`` (so a line shared by several pages is
    kept only once) and written to ``output_file`` as UTF-8, overwriting any
    existing file. Failed results are reported on stdout and skipped.

    Args:
        urls: URLs handed to ``AsyncWebCrawler.arun_many``.
        output_file: Path of the text file to write. Defaults to
            ``"output.txt"``, the path that used to be hard-coded.

    Returns:
        None. The output is the written file plus progress lines on stdout.
    """
    generator = DefaultMarkdownGenerator(content_source="fit_html")
    crawl_config = CrawlerRunConfig(
        markdown_generator=generator,
        cache_mode=CacheMode.BYPASS,
        stream=False
    )

    all_content = []  # Collect all markdown content first

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=crawl_config)

        for res in results:
            if res.success:
                print(f"[OK] {res.url}, length: {len(res.markdown)}")
                # Add a separator for clarity
                all_content.append(f"\n\n--- Content from {res.url} ---\n\n")
                all_content.append(res.markdown)
            else:
                print(f"[ERROR] {res.url} => {res.error_message}")

    # Combine all content into a single string
    combined_text = "\n".join(all_content)

    # Remove duplicate lines in memory before saving
    cleaned_text = remove_duplicate_lines_in_memory(combined_text)

    # Save the cleaned content to file
    async with aiofiles.open(output_file, 'w', encoding='utf-8') as f:
        await f.write(cleaned_text)

    print(f"\n✅ Cleaned content saved to {output_file}")

# Run the crawl for a small set of nov.com pages and write the merged,
# de-duplicated markdown to disk.
# NOTE(review): top-level `await` presumably relies on the notebook kernel's
# running event loop; in a plain script this would need asyncio.run().

# Pages to crawl; the search URL carries an already percent-encoded query.
urls_to_crawl = [
    "https://www.nov.com/",
    "https://www.nov.com/search?q=Energy%20Transition",
    "https://www.nov.com/products-and-services"
]
# NOTE(review): the recorded run reports "length: 1" for the home page, so its
# markdown was effectively empty — worth checking before relying on the output.
await quick_parallel_example(urls_to_crawl)


[OK] https://www.nov.com/, length: 1
[OK] https://www.nov.com/products-and-services, length: 3476
[OK] https://www.nov.com/search?q=Energy%20Transition, length: 2677

✅ Cleaned content saved to output.txt


In [None]:
async def remove_duplicate_lines_async(filename):
    """Remove duplicate lines from a file asynchronously, rewriting it in place.

    The file is streamed line by line into ``<filename>.tmp``; only the first
    occurrence of each line is copied. Once the copy is complete the temporary
    file replaces the original via ``os.replace``.

    Lines are compared without their trailing newline, so a final line that
    lacks a newline is still recognised as a duplicate of an earlier identical
    line. Lines that are kept are written exactly as read.

    Args:
        filename: Path of the UTF-8 text file to de-duplicate, as a string
            (``".tmp"`` is appended to it to build the temporary path).
    """
    # Local import: `os` is not imported by any cell visible in this notebook,
    # so the original call to os.replace raised NameError.
    import os

    seen = set()
    temp_file = filename + ".tmp"

    async with aiofiles.open(filename, 'r', encoding='utf-8') as infile, \
            aiofiles.open(temp_file, 'w', encoding='utf-8') as outfile:
        async for line in infile:
            # Compare on content only; the raw line may or may not end in "\n".
            key = line.rstrip("\n")
            if key not in seen:
                await outfile.write(line)
                seen.add(key)

    os.replace(temp_file, filename)
    