In [1]:
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urljoin, urlparse
from collections import deque

# Simple breadth-first crawler with depth limit and politeness
def crawl_site(seed_url, max_pages=20, delay=1.0):
    visited = set()
    docs = {}
    queue = deque([seed_url])
    count = 0

    domain = urlparse(seed_url).netloc

    while queue and count < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        try:
            resp = requests.get(url, timeout=5, headers={"User-Agent": "SimpleSearchBot/1.0"})
            if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type',''):
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            title = soup.title.string.strip() if soup.title and soup.title.string else ''
            body = soup.get_text(separator=' ')
            tokens = re.findall(r'\b\w+\b', (title + ' ' + body).lower())
            # basic stopword filtering
            STOPWORDS = {'the','and','is','in','to','of','a','an'}
            tokens = [t for t in tokens if t not in STOPWORDS]

            doc_id = f'doc_live_{count}'
            docs[doc_id] = {
                'title': title,
                'tokens': tokens,
                'url': url
            }
            count += 1
            visited.add(url)

            # extract same-domain links
            for a in soup.find_all('a', href=True):
                href = urljoin(url, a['href'])
                parsed = urlparse(href)
                if parsed.netloc == domain:
                    normalized = parsed.scheme + '://' + parsed.netloc + parsed.path
                    if normalized not in visited:
                        queue.append(normalized)
        except Exception as e:
            # skip failures
            pass
        time.sleep(delay)  # politeness

    return docs

# Example usage: crawl first 10 pages of example.com (replace with your target)
live_docs = crawl_site("https://www.kkwagh.edu.in", max_pages=10, delay=1.0)
print(f"Crawled {len(live_docs)} pages.")
for k,v in live_docs.items():
    print(k, v['title'], v['url'])

Crawled 10 pages.
doc_live_0 KK Wagh https://www.kkwagh.edu.in
doc_live_1 KK Wagh https://www.kkwagh.edu.in/
doc_live_2 Distinguish Faculty https://www.kkwagh.edu.in/faculty
doc_live_3 KK Wagh https://www.kkwagh.edu.in/cpage.aspx
doc_live_4 Contact Us https://www.kkwagh.edu.in/contact-us
doc_live_5 Tender https://www.kkwagh.edu.in/tenders
doc_live_6 Careers https://www.kkwagh.edu.in/careers
doc_live_7 Admissions https://www.kkwagh.edu.in/admissions
doc_live_8 Overview https://www.kkwagh.edu.in/overview
doc_live_9 Our Legacy https://www.kkwagh.edu.in/our-legacy
