# Test the Parser

The parser should extract the following from a page:
- page_id
- node_id
- outgoing links

In [11]:
import requests
import sys

# --- Test Step 1: HTML Parser (v1.2) ---
# Goal: Test the *new* parser on the portal page ('/')
# We expect this to pass, as the portal page contains all the data.

# Load the project-local parser; without it nothing below can run.
try:
    from parser import parse_page
except ImportError:
    print("‚ùå FAILED TO IMPORT 'parser'. Make sure 'parser.py' is in the same folder as this notebook.")
    sys.exit(1)
else:
    # Reached only when the import above succeeded.
    print("Successfully imported parse_page function from parser.py")

# Root address of the server that the parser test fetches from.
BASE_URL = "http://localhost:3000"

def test_step_1_parser():
    """
    Fetch the portal page ('/') and feed its HTML to ``parse_page``.

    On a truthy parse result the page id, node id and outgoing links are
    printed; on a falsy one a failure notice is printed instead. Errors
    raised by ``requests`` are caught and reported on stdout rather than
    propagated. Returns None.
    """
    print("\n--- üöÄ Running Test Step 1: v1.2 Parser Test ---")

    portal_url = f"{BASE_URL}/"

    try:
        print(f"Fetching {portal_url}...")
        reply = requests.get(portal_url, timeout=5)
        # Convert 4xx/5xx answers into an exception handled below.
        reply.raise_for_status()

        print("Fetch successful. HTTP Status:", reply.status_code)

        print("\nParsing HTML content with new parser...")
        page_info = parse_page(reply.text)

        # Guard clause: report a failed parse and leave early.
        if not page_info:
            print("‚ùå‚ùå‚ùå Parse Failed. Check 'parser.py' and its error messages.")
            return

        print("‚úÖ‚úÖ‚úÖ Parse Successful! ‚úÖ‚úÖ‚úÖ")
        print(f"  Page ID:   {page_info.get('page_id')}")
        print(f"  Node ID:   {page_info.get('node_id')}")
        # A missing 'links' key counts as zero links, but is printed as None.
        print(f"  Num Links: {len(page_info.get('links', []))}")
        print(f"  Links:     {page_info.get('links')}")

    except requests.exceptions.ConnectionError:
        # Listed first: ConnectionError is a subclass of RequestException.
        print(
            "\n‚ùå TEST FAILED: Connection Error.",
            f"Could not connect to {BASE_URL}.",
            sep="\n",
        )

    except requests.exceptions.RequestException as request_error:
        print(f"\n‚ùå TEST FAILED: An error occurred: {request_error}")

# --- Run the test ---
# Runs as soon as the cell executes; the outcome is printed, nothing is returned.
test_step_1_parser()

Successfully imported parse_page function from parser.py

--- üöÄ Running Test Step 1: v1.2 Parser Test ---
Fetching http://localhost:3000/...
Fetch successful. HTTP Status: 200

Parsing HTML content with new parser...
‚úÖ‚úÖ‚úÖ Parse Successful! ‚úÖ‚úÖ‚úÖ
  Page ID:   page_ondib3z5
  Node ID:   yrlelpggirpt
  Num Links: 5
  Links:     ['page_ql618swi', 'page_idxild28', 'page_2x0nyoul', 'page_q0mdnv7q', 'page_58bjp2st']


# Test The Crawler

This crawler must:
- Start by visiting / (or /page_xxx)
- Parse page_id, node_id, outgoing links
- Discover new pages via BFS
- Store:
    - graph (adjacency list)
    - latest node_id
    - timestamps
- Track visit count
- Avoid revisiting pages too frequently

And it must use aiohttp + asyncio for speed.

What the crawler provides:
- BFS Discovery
    Fast traversal of the entire graph.
- Node ID Tracking
    Stored in `self.node_data` (the attribute the tests below read).
- Visit Times
    Used for staleness estimation.
- revisit_pages()
    Allows refreshing node_ids with minimal revisits later.

In [12]:
import asyncio
import nest_asyncio

# This patch is needed for Jupyter notebooks to run asyncio
nest_asyncio.apply()

# --- Test Step 2: Asynchronous Crawler ---
# Goal: Run the crawler and discover the entire graph.

try:
    from crawler import Crawler
    print("Successfully imported Crawler class from crawler.py")
except ImportError as e:
    print(f"‚ùå FAILED TO IMPORT 'Crawler'. Make sure 'crawler.py' is in the same folder.")
    print(f"Error details: {e}")
    # Stop execution if the import fails. SystemExit is raised directly
    # (this is what sys.exit(1) does) because this cell does not import
    # `sys` itself and should not depend on another cell having done so.
    raise SystemExit(1)


async def test_step_2_crawler():
    """
    Crawl the site from the portal page and report how much was collected.

    Builds a ``Crawler`` rooted at '/', awaits its ``crawl()`` coroutine and
    then prints the number of entries held in ``graph`` and ``node_data``.
    Returns None.
    """
    # '/' is the portal page, used here as the crawl entry point.
    spider = Crawler(start_page="/")
    await spider.crawl()

    print("\n--- Crawler State (for inspection) ---")
    # Report each container in turn: label shown to the reader -> attribute name.
    for label, attribute in (("Graph size", "graph"), ("Node data size", "node_data")):
        print(f"{label}: {len(getattr(spider, attribute))}")

    # To dump the whole adjacency structure, print `spider.graph` here.

# --- Run the test ---
# Drive the coroutine to completion with asyncio.run(). This cell calls
# nest_asyncio.apply() first; presumably that is what lets asyncio.run() work
# inside the notebook's own event loop — confirm before removing the patch.
print("Starting async crawler test...")
asyncio.run(test_step_2_crawler())
print("Async crawler test finished.")

Successfully imported Crawler class from crawler.py
Starting async crawler test...
--- üöÄ Starting crawl from / ---

--- ‚úÖ Crawl Finished ---
  Total pages found: 55
  Total visits made: 56
  Time taken:        1.40 seconds

--- Node Data (Sample) ---
  page_ondib3z5: yrlelpggirpt
  page_ql618swi: a9y21zjt0crr
  page_idxild28: z39371o0rkua
  page_2x0nyoul: bdwtz31ug689
  page_q0mdnv7q: dgmd3gmf82ee
...

--- Crawler State (for inspection) ---
Graph size: 55
Node data size: 55
Async crawler test finished.


# Test The PageRank

In [13]:
import sys
import asyncio
import nest_asyncio

# Apply the nest_asyncio patch so asyncio can be driven from this notebook cell.
nest_asyncio.apply()

# --- Test Step 3: PageRank Calculator ---
# Goal: crawl first, then feed the crawled graph to the PageRank routine.

# Import the two project modules one at a time so the confirmation messages
# show exactly how far the imports got before any failure.
try:
    from crawler import Crawler
    print("Successfully imported Crawler class from crawler.py")
    from pagerank import calculate_pagerank
    print("Successfully imported calculate_pagerank from pagerank.py")
except ImportError as import_error:
    print(
        "‚ùå FAILED TO IMPORT. Make sure 'crawler.py' and 'pagerank.py' are in the same folder.",
        f"Error details: {import_error}",
        sep="\n",
    )
    sys.exit(1)


async def test_step_3_pagerank():
    """
    Crawl the site, then compute and summarise PageRank for the crawled graph.

    Steps:
      1. Build a ``Crawler`` rooted at '/' and await ``crawl()``.
      2. Pass ``crawler.graph`` to ``calculate_pagerank``.
      3. Print up to ten (page, score) pairs, the sum of all scores, and the
         sizes of the graph and of the score mapping.

    Prints a failure message and stops early when either the graph or the
    score mapping is empty. Returns None.
    """
    print("\n--- üöÄ Running Crawler to get graph... ---")
    spider = Crawler(start_page="/")
    await spider.crawl()

    if not spider.graph:
        print("‚ùå TEST FAILED: Crawler returned an empty graph.")
        return

    print(f"\n--- ‚úÖ Crawler finished. Found {len(spider.graph)} pages. ---")
    print("--- üöÄ Calculating PageRank... ---")

    scores = calculate_pagerank(spider.graph)

    if not scores:
        print("‚ùå TEST FAILED: PageRank calculation returned no scores.")
        return

    print(f"--- ‚úÖ PageRank Calculated for {len(scores)} pages. ---")

    # --- Show only the leading entries of the score mapping ---
    sample_limit = 10
    print("\n--- PageRank Scores (Sample) ---")
    ranked = list(scores.items())
    for page, score in ranked[:sample_limit]:
        print(f"  {page:<15}: {score:.6f}")

    # Add the scores one at a time, in mapping order.
    total_score = 0.0
    for _, score in ranked:
        total_score += score

    if len(scores) > sample_limit:
        print("  ... and so on")

    print("\n--- Validation ---")
    print(f"  Total score (should be ~1.0): {total_score:.6f}")
    print(f"  Pages in graph:   {len(spider.graph)}")
    print(f"  Pages in PageRank: {len(scores)}")


# --- Run the test ---
# asyncio.run() drives the coroutine to completion; the crawl and the PageRank
# computation both happen inside this single call, and results are printed.
print("Starting PageRank test (includes full crawl)...")
asyncio.run(test_step_3_pagerank())
print("PageRank test finished.")

Successfully imported Crawler class from crawler.py
Successfully imported calculate_pagerank from pagerank.py
Starting PageRank test (includes full crawl)...

--- üöÄ Running Crawler to get graph... ---
--- üöÄ Starting crawl from / ---

--- ‚úÖ Crawl Finished ---
  Total pages found: 55
  Total visits made: 56
  Time taken:        1.33 seconds

--- Node Data (Sample) ---
  page_ondib3z5: yrlelpggirpt
  page_ql618swi: a9y21zjt0crr
  page_idxild28: z39371o0rkua
  page_2x0nyoul: bdwtz31ug689
  page_q0mdnv7q: dgmd3gmf82ee
...

--- ‚úÖ Crawler finished. Found 55 pages. ---
--- üöÄ Calculating PageRank... ---
--- ‚úÖ PageRank Calculated for 55 pages. ---

--- PageRank Scores (Sample) ---
  page_yq2gbbd1  : 0.022767
  page_ox8yl94w  : 0.013298
  page_j5vo7ivh  : 0.023040
  page_k38nkc2n  : 0.032661
  page_5ebo1o8u  : 0.015242
  page_y58wfhk5  : 0.017040
  page_rascybsa  : 0.011747
  page_trtuhhex  : 0.023146
  page_qen7f20c  : 0.021183
  page_q0mdnv7q  : 0.003725
  ... and so on

--- Vali

# Final Test

In [14]:
import sys
import asyncio
import nest_asyncio
import aiohttp
import json

# Apply the nest_asyncio patch so asyncio can be driven from this notebook cell.
nest_asyncio.apply()

# --- Test Step 4: First Evaluation Submission ---
# Goal: exercise every component and submit to /evaluate a single time.

# Import the project modules one by one; each success line shows how far
# the imports got before any failure.
try:
    from crawler import Crawler
    print("Successfully imported Crawler.")
    from pagerank import calculate_pagerank
    print("Successfully imported calculate_pagerank.")
    from evaluator import format_payload, submit_evaluation
    print("Successfully imported evaluator functions.")
except ImportError as import_error:
    print(
        "‚ùå FAILED TO IMPORT. Make sure all .py files are in the same folder.",
        f"Error details: {import_error}",
        sep="\n",
    )
    sys.exit(1)


async def test_step_4_submit():
    """
    Runs the full pipeline: Crawl -> PageRank -> Format -> Submit

    Steps:
      1. Crawl from '/' with ``Crawler``; stop if the graph is empty.
      2. Compute PageRank from ``crawler.graph``; stop if no scores come back.
      3. Build the payload with ``format_payload(crawler.node_data, scores)``.
      4. Submit the payload through ``submit_evaluation`` on a fresh
         ``aiohttp.ClientSession`` and print the server's response as JSON.

    The test is reported as passed only when the response is a dict that
    contains an 'mse' key; any other response is reported as a failure
    instead of raising. Returns None.
    """
    print("\n--- üöÄ [1/4] Running Crawler to get graph... ---")
    # Our crawler's first fetch to '/' starts the 60s timer
    crawler = Crawler(start_page="/")
    await crawler.crawl()

    if not crawler.graph:
        print("‚ùå TEST FAILED: Crawler returned an empty graph.")
        return

    print(f"\n--- ‚úÖ [1/4] Crawler finished. Found {len(crawler.graph)} pages. ---")

    print("\n--- üöÄ [2/4] Calculating PageRank... ---")
    pagerank_scores = calculate_pagerank(crawler.graph)
    # Same guard as the PageRank test: never submit a payload built from
    # an empty score set.
    if not pagerank_scores:
        print("‚ùå TEST FAILED: PageRank calculation returned no scores.")
        return
    print(f"--- ‚úÖ [2/4] PageRank Calculated. ---")

    print("\n--- üöÄ [3/4] Formatting payload... ---")
    payload = format_payload(crawler.node_data, pagerank_scores)
    print(f"--- ‚úÖ [3/4] Payload formatted with {len(payload['entries'])} entries. ---")

    # Uncomment this line if you want to inspect the JSON
    # print(json.dumps(payload, indent=2))

    print("\n--- üöÄ [4/4] Submitting to /evaluate... ---")
    async with aiohttp.ClientSession() as session:
        # The crawler already visited the site, so the timer has started
        # and this submission is allowed.
        submission_response = await submit_evaluation(session, payload)

    print("\n--- ‚úÖ [4/4] ...Submission Complete! ---")
    print("\n" + "="*30)
    print(" SERVER EVALUATION RESPONSE ")
    print("="*30)
    print(json.dumps(submission_response, indent=2))
    print("="*30)

    # The isinstance check keeps a None (or other non-dict) response from
    # raising TypeError on the membership test; it is reported as a failure.
    if isinstance(submission_response, dict) and 'mse' in submission_response:
        print("\nüéâüéâüéâ TEST PASSED! We successfully submitted and got scored. üéâüéâüéâ")
    else:
        print("\n‚ùå TEST FAILED. Server returned an error. Check the response above.")


# --- Run the test ---
# asyncio.run() drives the whole pipeline coroutine to completion. Each run
# performs a real submission to the server, so re-running this cell submits again.
print("Starting Full Pipeline test (Crawl, PageRank, Submit)...")
asyncio.run(test_step_4_submit())
print("Full Pipeline test finished.")

Successfully imported Crawler.
Successfully imported calculate_pagerank.
Successfully imported evaluator functions.
Starting Full Pipeline test (Crawl, PageRank, Submit)...

--- üöÄ [1/4] Running Crawler to get graph... ---
--- üöÄ Starting crawl from / ---

--- ‚úÖ Crawl Finished ---
  Total pages found: 55
  Total visits made: 56
  Time taken:        1.33 seconds

--- Node Data (Sample) ---
  page_ondib3z5: yrlelpggirpt
  page_ql618swi: a9y21zjt0crr
  page_idxild28: z39371o0rkua
  page_2x0nyoul: bdwtz31ug689
  page_q0mdnv7q: dgmd3gmf82ee
...

--- ‚úÖ [1/4] Crawler finished. Found 55 pages. ---

--- üöÄ [2/4] Calculating PageRank... ---
--- ‚úÖ [2/4] PageRank Calculated. ---

--- üöÄ [3/4] Formatting payload... ---
--- ‚úÖ [3/4] Payload formatted with 55 entries. ---

--- üöÄ [4/4] Submitting to /evaluate... ---
Submit successful (HTTP 200).

--- ‚úÖ [4/4] ...Submission Complete! ---

 SERVER EVALUATION RESPONSE 
{
  "avg_staleness": 3647.018181818182,
  "coverage": 0.96491228070