In [1]:
!pip install google-generativeai
import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin



In [4]:
FIRECRAWL_API_KEY = "API-KEY"  # Replace with your key

session = requests.Session()  # reuse connection for speed

def scrape_static(url, timeout=10):
    """
    Static scrape: HTTP GET + BeautifulSoup parse.
    Returns (title, text snippet, soup).
    Raises RuntimeError on failure or insufficient content.
    """
    try:
        resp = session.get(url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        title = soup.title.string.strip() if soup.title else "No title found"
        text = soup.get_text(separator=' ', strip=True)
        if len(text) < 200:
            raise RuntimeError("Insufficient static content extracted.")
        return title, text[:1500], soup
    except Exception as e:
        raise RuntimeError(f"Static scrape failed: {e}")

def scrape_dynamic(url, timeout=20):
    """
    Dynamic scrape using Firecrawl API for JS-rendered content.
    Returns (title, text snippet, soup).
    Raises RuntimeError on failure.
    """
    endpoint = "https://api.firecrawl.dev/v1/scrape"
    headers = {
        "Authorization": f"Bearer {FIRECRAWL_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {"url": url, "dynamic": True, "extractorOptions": {"type": "html"}}

    try:
        resp = session.post(endpoint, headers=headers, json=payload, timeout=timeout)
        resp.raise_for_status()
        html = resp.json().get("html")
        if not html:
            raise RuntimeError("No HTML content returned by Firecrawl.")
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string.strip() if soup.title else "No title found"
        text = soup.get_text(separator=' ', strip=True)
        return title, text[:1500], soup
    except Exception as e:
        raise RuntimeError(f"Firecrawl scraping failed: {e}")

def extract_headlines(soup):
    """
    Extract and format all h1, h2, h3 headlines.
    """
    headers = soup.find_all(['h1', 'h2', 'h3'])
    if not headers:
        return "No headlines found."
    return "\n".join(f"üîπ {h.get_text(strip=True)}" for h in headers)

def extract_links(soup, limit=50):
    """
    Extract unique hrefs from anchors, up to limit.
    """
    links = {link['href'].strip() for link in soup.find_all('a', href=True) if link['href'].strip()}
    if not links:
        return "No links found."
    return "\n".join(f"üîó {link}" for link in list(links)[:limit])

def extract_images(soup, limit=50):
    """
    Extract unique image src URLs, up to limit.
    """
    images = {img['src'].strip() for img in soup.find_all('img', src=True) if img['src'].strip()}
    if not images:
        return "No images found."
    return "\n".join(f"üñºÔ∏è {img}" for img in list(images)[:limit])

def extract_paragraphs(soup, limit=10):
    """
    Extract text from up to limit paragraphs.
    """
    paragraphs = soup.find_all('p')
    if not paragraphs:
        return "No paragraphs found."
    return "\n\n".join(p.get_text(strip=True) for p in paragraphs[:limit])

def is_internal_link(base_url, link):
    """
    Check if a link is internal (same domain as base_url).
    """
    base_netloc = urlparse(base_url).netloc
    link_netloc = urlparse(urljoin(base_url, link)).netloc
    return base_netloc == link_netloc

def extract_internal_links(soup, base_url):
    """
    Extract unique internal links from soup.
    """
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        full_url = urljoin(base_url, href).split('#')[0]  # strip fragment
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return list(links)

def get_element_path(element):
    """
    Generate CSS-like path with sibling indices for an element.
    """
    path = []
    while element and element.name != '[document]':
        sibling_index = 1
        sibling = element.previous_sibling
        while sibling:
            if getattr(sibling, 'name', None) == element.name:
                sibling_index += 1
            sibling = sibling.previous_sibling
        tag = f"{element.name}[{sibling_index}]" if sibling_index > 1 else element.name
        path.append(tag)
        element = element.parent
    return " > ".join(reversed(path))

def search_keyword_in_soup(soup, keyword):
    """
    Search keyword in <p> tags of soup.
    Returns list of (element_path, snippet) or None.
    """
    keyword_lower = keyword.lower()
    results = []
    for p in soup.find_all('p'):
        text = p.get_text(strip=True)
        text_lower = text.lower()
        if keyword_lower in text_lower:
            idx = text_lower.index(keyword_lower)
            start = max(idx - 40, 0)
            end = min(idx + 110, len(text))
            snippet = text[start:end]
            path = get_element_path(p)
            results.append((path, snippet))
    return results if results else None

def search_keyword_on_page(url, keyword, timeout=10):
    """
    Static scrape keyword search fallback.
    """
    try:
        resp = session.get(url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        return search_keyword_in_soup(soup, keyword)
    except Exception:
        return None

def main():
    url = input("üîó Enter the website URL to scrape: ").strip()
    menu = (
        "üéØ What do you want to extract? Choose one:\n"
        "    1: headlines\n"
        "    2: title\n"
        "    3: text (full page text)\n"
        "    4: links\n"
        "    5: images\n"
        "    6: content previews\n"
        "    7: search keyword in paragraphs\n"
        "Your choice (1-7): "
    )
    choice = input(menu).strip()

    choice_map = {
        '1': 'headlines',
        '2': 'title',
        '3': 'text',
        '4': 'links',
        '5': 'images',
        '6': 'paragraphs',
        '7': 'search'
    }

    target_info = choice_map.get(choice)
    if not target_info:
        print(" Invalid choice. Exiting.")
        return

    try:
        print("\n Attempting dynamic scrape with Firecrawl API...")
        title, text, soup = scrape_dynamic(url)
    except RuntimeError as e:
        print(f" Firecrawl dynamic scrape failed: {e}")
        print(" Falling back to static scrape...")
        try:
            title, text, soup = scrape_static(url)
        except RuntimeError as ex:
            print(f" Both dynamic and static scraping failed: {ex}")
            return

    if target_info == 'search':
        keyword = input(" Enter the keyword to search for in paragraphs: ").strip()
        print(f"\n Searching keyword '{keyword}' across multiple pages (limit 10 pages)...\n")

        internal_links = extract_internal_links(soup, url)
        if url not in internal_links:
            internal_links.insert(0, url)

        found_on_pages = []
        for page_url in internal_links[:10]:
            try:
                _, _, page_soup = scrape_dynamic(page_url)
                results = search_keyword_in_soup(page_soup, keyword)
            except RuntimeError:
                results = search_keyword_on_page(page_url, keyword)
            if results:
                found_on_pages.append((page_url, results))

        if not found_on_pages:
            print("\n Keyword not found on any scanned pages.")
        else:
            print(f"\n Keyword found on {len(found_on_pages)} page(s):\n")
            for page_url, snippets in found_on_pages:
                print(f"üîó Page: {page_url}")
                for _, snippet in snippets[:3]:
                    print(f"  üìù ...{snippet}...\n")

    else:
        extractor_map = {
            'headlines': extract_headlines,
            'title': lambda s: title,
            'links': extract_links,
            'images': extract_images,
            'paragraphs': extract_paragraphs,
            'text': lambda s: text
        }
        extractor = extractor_map.get(target_info)
        extracted = extractor(soup) if extractor else "No data extracted."

        print(f"\n Title: {title}")
        print(f"\n Extracted Content Preview:\n{extracted[:1500]}...\n")
        print(f" Requested Data: {target_info}")

In [5]:
if __name__ == "__main__":
    main()

üîó Enter the website URL to scrape: https://wiki.openstreetmap.org/
üéØ What do you want to extract? Choose one:
    1: headlines
    2: title
    3: text (full page text)
    4: links
    5: images
    6: content previews
    7: search keyword in paragraphs
Your choice (1-7): 6

 Attempting dynamic scrape with Firecrawl API...
 Firecrawl dynamic scrape failed: Firecrawl scraping failed: 402 Client Error: Payment Required for url: https://api.firecrawl.dev/v1/scrape
 Falling back to static scrape...

 Title: OpenStreetMap Wiki

 Extracted Content Preview:
More about OpenStreetMap|How to contribute|Where to get help...

 Requested Data: paragraphs


In [None]:
if __name__ == "__main__":
    main()

üîó Enter the website URL to scrape: https://wiki.openstreetmap.org/
üéØ What do you want to extract? Choose one:
    1: headlines
    2: title
    3: text (full page text)
    4: links
    5: images
    6: paragraphs
    7: search keyword in paragraphs
Your choice (1-7): 5

 Attempting dynamic scrape with Firecrawl API...
 Firecrawl dynamic scrape failed: Firecrawl scraping failed: 402 Client Error: Payment Required for url: https://api.firecrawl.dev/v1/scrape
 Falling back to static scrape...

 Title: OpenStreetMap Wiki

 Extracted Content Preview:
üñºÔ∏è /cc-wiki.png
üñºÔ∏è https://upload.wikimedia.org/wikipedia/commons/thumb/d/d9/Nuvola_web_pen.svg/80px-Nuvola_web_pen.svg.png
üñºÔ∏è https://upload.wikimedia.org/wikipedia/commons/thumb/1/11/Preferences-system.svg/80px-Preferences-system.svg.png
üñºÔ∏è /w/resources/assets/poweredby_mediawiki.svg
üñºÔ∏è https://upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Openstreetmap_logo.svg/100px-Openstreetmap_logo.svg.png
üñºÔ∏è /w/e

In [None]:
if __name__ == "__main__":
    main()

üîó Enter the website URL to scrape: https://www.bbc.com/
üéØ What do you want to extract? Choose one:
    1: headlines
    2: title
    3: text (full page text)
    4: links
    5: images
    6: content previews
    7: search keyword in paragraphs
Your choice (1-7): 7

 Attempting dynamic scrape with Firecrawl API...
 Firecrawl dynamic scrape failed: Firecrawl scraping failed: 429 Client Error: Too Many Requests for url: https://api.firecrawl.dev/v1/scrape
 Falling back to static scrape...
 Enter the keyword to search for in paragraphs: culture

 Searching keyword 'culture' across multiple pages (limit 10 pages)...


 Keyword found on 3 page(s):

üîó Page: https://www.bbc.com/travel/destinations/middle-east
  üìù ...helping to preserve traditional Bedouin culture....

  üìù ...As social distancing lingers, many cultures around the world are adapting their distinct greetings to fit the new normal....

üîó Page: https://www.bbc.com/travel/destinations/europe
  üìù ...neighbourhoo