In [1]:
from collections import deque
import time
from threading import Thread

# Dummy function to simulate fetching web page links
def fetch_links(url):
    dummy_web = {
        "home": ["about", "services", "contact"],
        "about": ["team", "mission"],
        "services": ["web_dev", "data_science"],
        "contact": [],
        "team": [],
        "mission": [],
        "web_dev": [],
        "data_science": []
    }
    return dummy_web.get(url, [])

# 1. Breadth-First Search (BFS) Crawler
def bfs_crawler(start_url):
    print("\nBFS Crawler")
    queue = deque([start_url])
    visited = set()

    while queue:
        url = queue.popleft()
        if url not in visited:
            print(f"Visiting: {url}")
            visited.add(url)
            queue.extend(fetch_links(url))
            time.sleep(1)

# 2. Depth-First Search (DFS) Crawler
def dfs_crawler(start_url, visited=None):
    if visited is None:
        visited = set()
        print("\nDFS Crawler")

    if start_url not in visited:
        print(f"Visiting: {start_url}")
        visited.add(start_url)
        time.sleep(1)

        for link in fetch_links(start_url):
            dfs_crawler(link, visited)

# 3. Focused Crawler (only searches pages with "data" in the name)
def focused_crawler(start_url, keyword):
    print("\nFocused Crawler")
    queue = deque([start_url])
    visited = set()

    while queue:
        url = queue.popleft()
        if url not in visited:
            visited.add(url)
            if keyword in url:
                print(f"Visiting: {url} (Focused Search)")
            queue.extend(fetch_links(url))  # Continue exploring child links
            time.sleep(1)

# 4. Incremental Crawler (Only visits new/updated pages)
def incremental_crawler(start_url, last_visited):
    print("\nIncremental Crawler")
    queue = deque([start_url])
    visited = set()

    while queue:
        url = queue.popleft()
        if url not in visited and url not in last_visited:
            print(f"Visiting: {url} (New page)")
            visited.add(url)
            queue.extend(fetch_links(url))
            time.sleep(1)

# 5. Parallel Crawler (Simulated with multiple threads)
def parallel_worker(start_url):
    bfs_crawler(start_url)

def parallel_crawler(start_urls):
    print("\nParallel Crawler")
    threads = []
    for url in start_urls:
        thread = Thread(target=parallel_worker, args=(url,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

# Testing the crawlers
bfs_crawler("home")
dfs_crawler("home")
focused_crawler("home", "data")
incremental_crawler("home", {"about", "team"})
parallel_crawler(["home", "services"])



BFS Crawler
Visiting: home
Visiting: about
Visiting: services
Visiting: contact
Visiting: team
Visiting: mission
Visiting: web_dev
Visiting: data_science

DFS Crawler
Visiting: home
Visiting: about
Visiting: team
Visiting: mission
Visiting: services
Visiting: web_dev
Visiting: data_science
Visiting: contact

Focused Crawler
Visiting: data_science (Focused Search)

Incremental Crawler
Visiting: home (New page)
Visiting: services (New page)
Visiting: contact (New page)
Visiting: web_dev (New page)
Visiting: data_science (New page)

Parallel Crawler

BFS Crawler
Visiting: home

BFS Crawler
Visiting: services
Visiting: about
Visiting: web_dev
Visiting: services
Visiting: data_science
Visiting: contact
Visiting: team
Visiting: mission
Visiting: web_dev
Visiting: data_science
