# Ping the Server

In [21]:
import requests

START_URL = "http://localhost:3000"

print(f"Pinging {START_URL}...")

try:
    response = requests.get(START_URL)
    response.raise_for_status()

    print("\n--- Raw Server Response Text ---")
    print(response.text)

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error: {e}")
    print("This might be a '404 Not Found', which is useful!")
    print("\n--- Raw Server Response Text (from error) ---")
    print(e.response.text)
    
except requests.exceptions.ConnectionError:
    print(f"Error: Could not connect to {START_URL}")
    print("Please double-check that the Docker container is running.")
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")

Pinging http://localhost:3000...

--- Raw Server Response Text ---

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Crawling Assignment  - Main Portal</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: #f5f7fa;
            min-height: 100vh;
            color: #2c3e50;
            line-height: 1.6;
        }
        
        .header {
            background: #34495e;
            color: white;
            padding: 1rem 0;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        
        .header-content {
            max-width: 1200px;
            margin: 0 auto;
            padding: 0 2rem;
            display: flex;
            justif

In [22]:
import requests
from bs4 import BeautifulSoup
import re # We'll use regex for the node_id

BASE_URL = "http://localhost:3000"

try:
    response = requests.get(BASE_URL)
    response.raise_for_status()
    
    # --- This is where we parse ---
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # 1. Find the Page ID
    page_id_tag = soup.find('div', class_='page-id')
    # The text is "Page ID: page_y5alujtp", so we split it
    page_id = page_id_tag.text.split(':')[-1].strip()
    
    # 2. Find the Node ID
    node_id_tag = soup.find('span', class_='node-id')
    # The text is "Node ID: <b>7er2elqm01lj</b>", so we find the <b> tag
    node_id = node_id_tag.find('b').text.strip()
    
    # 3. Find all Outgoing Links
    outgoing_links = []
    # Find all <a> tags with the class 'file-link'
    link_tags = soup.find_all('a', class_='file-link')
    for tag in link_tags:
        # Get the 'href' attribute, which is the link
        outgoing_links.append(tag['href'])
        
    print("--- HTML Parsing Successful ---")
    print(f"Start Page (Path): /")
    print(f"Page ID: {page_id}")
    print(f"Node ID: {node_id}")
    print(f"Outgoing Links: {outgoing_links}")
    
except requests.exceptions.ConnectionError:
    print(f"Error: Could not connect to {BASE_URL}")
    print("Please ensure the Docker container is running.")
except Exception as e:
    print(f"An error occurred during parsing: {e}")
    print("The HTML structure might have changed.")

--- HTML Parsing Successful ---
Start Page (Path): /
Page ID: page_2xq0nsn7
Node ID: j5p246evofgv
Outgoing Links: ['/page_0lfz4eyh', '/page_qkqgewn3', '/page_9380fo98', '/page_juthp0u7', '/page_5ne7xos4']


# Crawling

In [23]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
import re
import json

BASE_URL = "http://localhost:3000"

def parse_page(html_content):
    """
    Parses the HTML of a page and extracts all required data.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # 1. Find Page ID
        page_id_tag = soup.find('div', class_='page-id')
        page_id = page_id_tag.text.split(':')[-1].strip()
        
        # 2. Find Node ID
        node_id_tag = soup.find('span', class_='node-id')
        node_id = node_id_tag.find('b').text.strip()
        
        # 3. Find Outgoing Links
        outgoing_links = []
        link_tags = soup.find_all('a', class_='file-link')
        for tag in link_tags:
            outgoing_links.append(tag['href'])
            
        # 4. Find Node ID History
        history = []
        details_tag = soup.find('details')
        if details_tag:
            # Find all <div>s with the style 'margin-left: 1rem...'
            history_divs = details_tag.find_all('div', style=re.compile(r'margin-left'))
            for div in history_divs:
                # Text is '‚Ä¢ p5zg2ka84j0e (2025-11-07 16:48:53 UTC)'
                text = div.text.strip('‚Ä¢ ')
                match = re.search(r'^(.*?) \((.*? UTC)\)$', text)
                if match:
                    prev_node_id = match.group(1).strip()
                    timestamp = match.group(2).strip()
                    history.append({'node_id': prev_node_id, 'timestamp': timestamp})

        return {
            'page_id': page_id,
            'node_id': node_id,
            'outgoing_links': outgoing_links,
            'history': history
        }
        
    except Exception as e:
        print(f"Error parsing page: {e}")
        return None

# --- Main Crawler Logic ---

# 1. Data Structures
pages_to_visit = deque(['/'])  # Start at the root
visited_pages = set()          # Keep track of where we've been
page_graph = {}                # This is our final result

print("--- Starting Initial Crawl (BFS) ---")

while pages_to_visit:
    current_path = pages_to_visit.popleft() # Get the next page from the queue
    
    if current_path in visited_pages:
        continue # Skip if we've already been here
        
    # Mark as visited
    visited_pages.add(current_path)
    print(f"Crawling: {current_path}")

    # 2. Fetch the page
    try:
        url = BASE_URL + current_path
        response = requests.get(url)
        response.raise_for_status()
        
        # 3. Parse the page
        page_data = parse_page(response.text)
        
        if page_data:
            # 4. Store the data
            page_graph[current_path] = page_data
            
            # 5. Add new, unvisited links to the queue
            for link in page_data['outgoing_links']:
                if link not in visited_pages:
                    pages_to_visit.append(link)
                    
        # Be a polite crawler (even locally)
        time.sleep(0.05) 

    except requests.exceptions.RequestException as e:
        print(f"Error crawling {current_path}: {e}")
    except Exception as e:
        print(f"An unknown error occurred at {current_path}: {e}")

# --- Crawl is Complete ---
print("\n--- Crawl Complete ---")
print(f"Total unique pages discovered: {len(page_graph)}")

print("\n--- Discovered Page Graph (First 3 Items) ---")
# Print the first 3 items from the graph
preview = {k: page_graph[k] for k in list(page_graph.keys())[:3]}
print(json.dumps(preview, indent=2))

--- Starting Initial Crawl (BFS) ---
Crawling: /
Crawling: /page_0lfz4eyh
Crawling: /page_qkqgewn3
Crawling: /page_9380fo98
Crawling: /page_juthp0u7
Crawling: /page_5ne7xos4
Crawling: /page_2xq0nsn7
Crawling: /page_rbxstjjl
Crawling: /page_vfzvzhyx
Crawling: /page_e9u26xeo
Crawling: /page_hxvqnxxz
Crawling: /page_pz3yh635
Crawling: /page_dndqper7
Crawling: /page_tzj68wez

--- Crawl Complete ---
Total unique pages discovered: 14

--- Discovered Page Graph (First 3 Items) ---
{
  "/": {
    "page_id": "page_2xq0nsn7",
    "node_id": "j5p246evofgv",
    "outgoing_links": [
      "/page_0lfz4eyh",
      "/page_qkqgewn3",
      "/page_9380fo98",
      "/page_juthp0u7",
      "/page_5ne7xos4"
    ],
    "history": []
  },
  "/page_0lfz4eyh": {
    "page_id": "page_0lfz4eyh",
    "node_id": "yfcwvfusn3ci",
    "outgoing_links": [
      "/page_2xq0nsn7",
      "/page_rbxstjjl",
      "/page_vfzvzhyx",
      "/page_5ne7xos4"
    ],
    "history": []
  },
  "/page_qkqgewn3": {
    "page_id": "pa

# Estimate PageRank

In [24]:
import networkx as nx
import json

# --- This script assumes 'page_graph' already exists in memory ---
# Do NOT copy/paste the example dictionary.

print("--- Calculating PageRank ---")

# 1. Check if the page_graph variable exists and is not empty
if 'page_graph' not in locals() or not page_graph:
    print("Error: 'page_graph' is not in memory or is empty.")
    print("Please re-run your crawler script first.")
else:
    # 2. Create a new Directed Graph
    G = nx.DiGraph()

    # 3. Build the graph from your 'page_graph' dictionary
    for page_path, page_data in page_graph.items():
        # Add the page as a node
        G.add_node(page_path)
        
        # Add a directed edge for each outgoing link
        for link in page_data['outgoing_links']:
            # Ensure the link is also a node (for pages that might only be linked to)
            if link not in G:
                G.add_node(link) 
            G.add_edge(page_path, link)

    # 4. Calculate PageRank
    # This returns a dictionary: {page_path: pagerank_score, ...}
    pagerank_scores = nx.pagerank(G)

    # 5. Sort the results for easy viewing
    sorted_pagerank = sorted(pagerank_scores.items(), key=lambda item: item[1], reverse=True)

    print("\n--- PageRank Results (Sorted) ---")
    for page, score in sorted_pagerank:
        print(f"{page:<20} | Score: {score:.4f}")

--- Calculating PageRank ---

--- PageRank Results (Sorted) ---
/page_hxvqnxxz       | Score: 0.1203
/page_5ne7xos4       | Score: 0.1180
/page_juthp0u7       | Score: 0.1162
/page_2xq0nsn7       | Score: 0.0822
/page_rbxstjjl       | Score: 0.0780
/page_0lfz4eyh       | Score: 0.0777
/page_qkqgewn3       | Score: 0.0767
/page_dndqper7       | Score: 0.0733
/page_tzj68wez       | Score: 0.0726
/page_vfzvzhyx       | Score: 0.0615
/page_pz3yh635       | Score: 0.0594
/page_e9u26xeo       | Score: 0.0270
/page_9380fo98       | Score: 0.0265
/                    | Score: 0.0107


# Estimate Change Frequency

In [25]:
import pandas as pd # For easy timestamp parsing
import numpy as np

# --- Assumes 'page_graph' (from the crawl) ---
# --- and 'pagerank_scores' (from PageRank) ---
# --- are both in memory. ---

print("--- Analyzing Node ID Change Frequency ---")

# Store our results here
page_data = []

# 1. Check if the required variables exist
if 'page_graph' not in locals() or 'pagerank_scores' not in locals():
    print("Error: 'page_graph' or 'pagerank_scores' not in memory.")
    print("Please re-run your crawler and PageRank scripts first.")
else:
    for page_path, data in page_graph.items():
        history = data['history']
        
        avg_change_interval = np.nan # Default for pages with < 2 history items
        
        if len(history) >= 2:
            # 2. Parse timestamps
            # We add the 'current' node_id to the history to get the most recent interval
            current_node_info = {'node_id': data['node_id'], 'timestamp': pd.Timestamp.now(tz='UTC')}
            
            # Combine history and current data
            all_node_events = history + [current_node_info]

            # Convert all timestamps to datetime objects
            timestamps = [
                pd.to_datetime(event['timestamp']) for event in all_node_events
            ]
            
            # 3. Calculate time differences (deltas)
            # We sort just in case, though history is likely sorted
            timestamps.sort() 
            deltas_in_seconds = [
                (timestamps[i] - timestamps[i-1]).total_seconds() 
                for i in range(1, len(timestamps))
            ]
            
            # 4. Get the average
            if deltas_in_seconds:
                avg_change_interval = np.mean(deltas_in_seconds)
        
        # 5. Store the results
        page_data.append({
            'page_path': page_path,
            'pagerank': pagerank_scores.get(page_path, 0),
            'avg_interval_sec': avg_change_interval,
            'last_node_id': data['node_id']
        })

    # --- Display the new data ---
    
    # Create a DataFrame for easy viewing
    df_page_data = pd.DataFrame(page_data)
    
    # Calculate a simple "freshness" (1 / interval)
    # Smaller interval = higher freshness
    df_page_data['freshness_score'] = 1 / df_page_data['avg_interval_sec']
    
    # Normalize PageRank and Freshness to 0-1 range for a combined score
    df_page_data['pagerank_norm'] = (
        df_page_data['pagerank'] - df_page_data['pagerank'].min()
    ) / (df_page_data['pagerank'].max() - df_page_data['pagerank'].min())
    
    df_page_data['freshness_norm'] = (
        df_page_data['freshness_score'] - df_page_data['freshness_score'].min()
    ) / (df_page_data['freshness_score'].max() - df_page_data['freshness_score'].min())

    # --- This is the core of your revisit strategy ---
    # We combine PageRank and Freshness
    df_page_data['revisit_priority'] = (
        df_page_data['pagerank_norm'] + df_page_data['freshness_norm']
    )
    
    print("\n--- Page Analysis Complete ---")
    print(df_page_data.sort_values(by='revisit_priority', ascending=False).to_string())

--- Analyzing Node ID Change Frequency ---

--- Page Analysis Complete ---
         page_path  pagerank  avg_interval_sec  last_node_id  freshness_score  pagerank_norm  freshness_norm  revisit_priority
0                /  0.010714               NaN  j5p246evofgv              NaN       0.000000             NaN               NaN
1   /page_0lfz4eyh  0.077657               NaN  yfcwvfusn3ci              NaN       0.610616             NaN               NaN
2   /page_qkqgewn3  0.076651               NaN  ecut9e1oe3r6              NaN       0.601440             NaN               NaN
3   /page_9380fo98  0.026510               NaN  fpbebh02zhi0              NaN       0.144079             NaN               NaN
4   /page_juthp0u7  0.116154               NaN  mzsb3dkawtnf              NaN       0.961764             NaN               NaN
5   /page_5ne7xos4  0.117979               NaN  3aqmqch2ib3t              NaN       0.978413             NaN               NaN
6   /page_2xq0nsn7  0.082201        