# Step 1: Importing Required Libraries

In [1]:
# Required Libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import networkx as nx
import matplotlib.pyplot as plt
from collections import deque

# Optional: Enable inline plotting for Jupyter
%matplotlib inline

# Step 2: Initialize the Web Crawler Parameters

Here we define the starting URL and domain. We'll also initialize the graph and the BFS queue.

In [2]:
# Starting page URL; the crawl begins here.
start_url = 'https://example.com'

# Derive the domain from the starting URL instead of hard-coding it a second
# time. The crawl loop compares urlparse(url).netloc against this value, so it
# has to equal the start URL's netloc exactly (including any "www." prefix or
# port); otherwise the very first page would be skipped and nothing crawled.
domain = urlparse(start_url).netloc

# Directed graph: nodes are page URLs, edges are hyperlinks between them
G = nx.DiGraph()

# BFS frontier of (url, depth) pairs, seeded with the starting URL at depth 0
queue = deque([(start_url, 0)])

# URLs that have already been processed, so each one is fetched at most once
visited = set()

# Step 3: Web Crawling with BFS

This step will perform the BFS crawling, adding nodes and edges to the graph for the URLs and links found.

In [3]:
# BFS for crawling the web
#
# Consumes `queue`, fills `visited`, and records pages/links in `G`.
# Only URLs whose netloc equals `domain` are fetched or added as edge targets.

# Pages deeper than this (in link hops from the start URL) are never fetched;
# links found on pages at exactly this depth are recorded as edges only.
MAX_DEPTH = 2

# Seconds to wait for a server before giving up. Without a timeout,
# requests.get can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

while queue:
    url, depth = queue.popleft()

    # Skip URLs that are not on the specified domain or that have already been visited
    if urlparse(url).netloc != domain or url in visited:
        continue

    # Mark the current URL as visited
    visited.add(url)

    # Fetch the HTML content of the current URL
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx responses as failures so error pages are not parsed
        # for a title and links as if they were real content.
        response.raise_for_status()
        html = response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        continue

    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')

    # Add a node to the graph for the current URL.
    # soup.title.string is None when <title> is empty or contains nested tags,
    # and is otherwise a bs4 string object; store a plain, trimmed str so the
    # label is always a real string.
    raw_title = soup.title.string if soup.title else None
    title = str(raw_title).strip() if raw_title is not None else ''
    G.add_node(url, label=title)

    # Add edges to the graph for each link on the page
    for link in soup.find_all('a'):
        href = link.get('href')

        # Skip links with no href attribute
        if not href:
            continue

        # Convert relative URLs to absolute URLs
        target = urlparse(urljoin(url, href))

        # Only links that stay on the specified domain are recorded
        if target.netloc != domain:
            continue

        # Drop the #fragment: it addresses a position inside a page, not a
        # different page, so "page#a" and "page#b" must map to one node and
        # must not be fetched twice.
        href = target._replace(fragment='').geturl()
        G.add_edge(url, href)

        # Queue the linked URL if it is within the depth limit. URLs already
        # visited would be discarded when popped anyway, so skip them here to
        # keep the queue small.
        if depth < MAX_DEPTH and href not in visited:
            queue.append((href, depth + 1))

# Step 4: Save and Visualize the Graph

We'll save the graph for future use, as well as visualize it using matplotlib.

In [None]:
# Map every node to its label. Nodes that were only seen as link targets
# (never fetched) have no 'label' attribute and fall back to an empty string.
labels = {url: data.get('label', '') for url, data in G.nodes(data=True)}

# Save the graph so you don't have to re-run the crawl
nx.write_gexf(G, "example_crawl.gexf")

# Visualize the graph with matplotlib
plt.figure(figsize=(12, 12))

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run of the notebook.
pos = nx.spring_layout(G, seed=42)

# Draw nodes and edges
nx.draw_networkx_nodes(G, pos, node_size=300, node_color='blue')
nx.draw_networkx_edges(G, pos, arrowstyle="->", arrowsize=10)

# Draw node labels
nx.draw_networkx_labels(G, pos, labels, font_size=10)

# Show the graph; the title follows the configured `domain` so it stays
# correct when a different site is crawled.
plt.title(f"Web Crawl Graph for {domain}")
plt.show()

# Step 5: Interactivity

Libraries such as pyvis add interactivity, letting you explore the graph dynamically in the browser. If needed, install pyvis:

In [None]:
pip install pyvis

The next cell creates an interactive graph where you can explore the nodes and edges dynamically.

In [None]:
# Imported here rather than in the first cell because pyvis may only have
# been installed by the preceding cell.
from pyvis.network import Network

# Directed pyvis network configured for display inside a notebook
net = Network(
    notebook=True,
    height='750px',
    width='100%',
    directed=True,
)

# Copy the nodes and edges of the crawled networkx graph into the pyvis network
net.from_nx(G)

# Render the interactive view to an HTML file
net.show("example_crawl.html")