# University Academic Web Crawler with Ollama

This notebook implements an intelligent web crawler that uses **Ollama (Llama 3.2)** to filter and extract educational program data from university websites.

**Goal**: Build a database of:
- Educational courses & programs
- Admission criteria & requirements
- Certificates & diplomas
- Academic pathways & progressions

## Step 1: Install Ollama

In [None]:
!sudo apt update
!sudo apt install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh

## Step 2: Start Ollama Server

In [None]:
import threading
import subprocess
import time

def run_ollama_serve():
    """Spawn `ollama serve` as a child process and return without waiting for it."""
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

# Kill any existing Ollama processes
# (`!` is an IPython shell escape, so this cell only runs inside a notebook/IPython session)
!pkill -9 ollama
# Fixed pause after the kill before a new server is launched.
time.sleep(2)

# Start fresh Ollama server
# run_ollama_serve only calls subprocess.Popen, which does not wait for the
# child process, so this thread finishes almost immediately while the server
# process keeps running on its own.
thread = threading.Thread(target=run_ollama_serve)
thread.start()
# Fixed 5 s wait; nothing here checks that the server is actually accepting
# connections, so the message below is printed unconditionally.
time.sleep(5)
print("âœ“ Ollama server started")

## Step 3: Pull Llama Model

In [None]:
!ollama pull llama3.2

## Step 4: Install Dependencies

In [None]:
!pip install -q langchain-ollama beautifulsoup4 lxml requests

## Step 5: Import Libraries

In [None]:
import os
import json
import requests
import time
from pathlib import Path
from typing import Any, Dict, List, Set
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

print("âœ“ All libraries imported successfully")

## Step 6: Initialize Ollama Model

In [None]:
# Initialize Llama model with connection retry
# Each attempt builds the client and sends one tiny prompt; if that raises,
# the Ollama server is force-restarted before the next attempt. On success the
# module-level name `llm` is left bound for the cells that follow.
import time

max_retries = 3
for attempt in range(max_retries):
    try:
        llm = OllamaLLM(model="llama3.2", temperature=0)
        # Test the connection
        # (the reply is stored in `test` but never inspected; only the absence
        # of an exception matters)
        test = llm.invoke("Say OK")
        print("âœ“ Llama 3.2 model initialized successfully")
        break
    except Exception as e:
        # Any exception from construction or the probe call is handled as a
        # connection failure; the exception object itself is not reported.
        if attempt < max_retries - 1:
            print(f"âš  Connection failed (attempt {attempt+1}/{max_retries}), retrying...")
            # Restart Ollama server
            # (`!` is an IPython shell escape, so this cell needs a notebook/IPython kernel)
            !pkill -9 ollama
            time.sleep(2)
            import threading
            import subprocess
            # Popen does not wait for the child, so the thread below ends right
            # away and the server process keeps running independently.
            def run_ollama():
                subprocess.Popen(["ollama", "serve"])
            threading.Thread(target=run_ollama).start()
            # Fixed pause to give the server time to come up before retrying.
            time.sleep(5)
        else:
            print(f"âœ— Failed to initialize Ollama after {max_retries} attempts")
            print("Please manually run: ollama serve")
            # Re-raise the exception from the final attempt so the cell fails visibly.
            raise

## Step 7: Define Utility Functions

In [None]:
def normalize_url(url: str) -> str:
    """Return ``url`` with its query string and fragment stripped.

    The URL is split with ``urlparse``; the query and fragment components are
    blanked and the remaining components are reassembled with ``geturl()``.
    """
    components = urlparse(url)
    without_extras = components._replace(query="", fragment="")
    return without_extras.geturl()

def fetch_page(url: str) -> Dict[str, Any]:
    """Download ``url`` and return the raw HTML together with its extracted text.

    Returns a dict with the keys ``url``, ``status_code``, ``html`` (the
    response body as text) and ``text`` (the body's text content, one
    stripped string per line).

    Exceptions raised by ``requests.get`` propagate to the caller, as does the
    error raised by ``raise_for_status()`` for an unsuccessful response.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()

    raw_html = response.text
    visible_text = BeautifulSoup(raw_html, "lxml").get_text(separator="\n", strip=True)

    result: Dict[str, Any] = {}
    result["url"] = url
    result["status_code"] = response.status_code
    result["html"] = raw_html
    result["text"] = visible_text
    return result

def extract_internal_links_with_anchor(html: str, base_url: str) -> List[Dict[str, str]]:
    """Extract internal links and their anchor text from HTML.

    Args:
        html: Raw HTML of the page.
        base_url: URL the page was fetched from. Relative hrefs are resolved
            against it, and only links whose host equals its host are kept.

    Returns:
        One ``{"url": ..., "anchor": ...}`` dict per distinct link, in order of
        first appearance. URLs have their query string and fragment removed.
        When the same URL is linked several times, the first non-empty anchor
        text is kept, so an image-only or empty link does not erase a
        descriptive label seen earlier.
    """
    soup = BeautifulSoup(html, "lxml")
    base_domain = urlparse(base_url).netloc

    anchor_by_url: Dict[str, str] = {}
    for a in soup.find_all("a", href=True):
        try:
            parsed = urlparse(urljoin(base_url, a["href"]))
        except ValueError:
            # Malformed hrefs (e.g. an unterminated IPv6 host) make urllib
            # raise; skip that one link instead of failing the whole page.
            continue

        if parsed.netloc != base_domain or parsed.scheme not in ("http", "https"):
            continue

        clean_url = parsed._replace(fragment="", query="").geturl()
        anchor = a.get_text(strip=True)

        # Record the URL on first sight; afterwards only fill in an anchor
        # that is still empty. Re-assigning an existing key keeps its original
        # position, so output order is order of first appearance.
        if not anchor_by_url.get(clean_url):
            anchor_by_url[clean_url] = anchor

    return [{"url": u, "anchor": text} for u, text in anchor_by_url.items()]

print("âœ“ Utility functions defined")

## Step 8: AI-Powered Link Filtering

In [None]:
def _parse_relevant_indices(response: str, batch_len: int) -> List[int]:
    """Turn the model's reply into a clean list of 1-based link numbers.

    Accepts a bare JSON array or an array embedded in surrounding text.
    Integers, integral floats and digit-only strings are accepted; everything
    else is ignored. Out-of-range and repeated numbers are dropped, and the
    order given by the model is preserved.

    Raises:
        Whatever ``json.loads`` raises for a malformed array, and
        ``AttributeError`` if ``response`` is not a string.
    """
    import re

    text = response.strip()
    if text.startswith('[') and text.endswith(']'):
        parsed = json.loads(text)
    else:
        # DOTALL lets the array span several lines inside a longer reply.
        match = re.search(r'\[.*?\]', text, re.DOTALL)
        parsed = json.loads(match.group()) if match else []

    indices: List[int] = []
    seen = set()
    for item in parsed:
        if isinstance(item, bool):
            # bool is a subclass of int; true/false are not link numbers.
            continue
        if isinstance(item, str):
            item = item.strip()
            if not item.isdecimal():
                continue
            item = int(item)
        elif isinstance(item, float):
            if not item.is_integer():
                continue
            item = int(item)
        elif not isinstance(item, int):
            continue

        if 1 <= item <= batch_len and item not in seen:
            seen.add(item)
            indices.append(item)
    return indices


def _keyword_filter(batch: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Fallback filter: keep links matching an include keyword and no exclude keyword."""
    include_keywords = ['course', 'program', 'degree', 'admission', 'curriculum']
    exclude_keywords = ['news', 'event', 'staff', 'research', 'login']

    kept = []
    for link in batch:
        text = (link['url'] + ' ' + link['anchor']).lower()
        if not any(kw in text for kw in include_keywords):
            continue
        if any(kw in text for kw in exclude_keywords):
            continue
        kept.append(link)
    return kept


def filter_relevant_urls_with_ai(links: List[Dict[str, str]], llm, batch_size: int = 15) -> List[Dict[str, str]]:
    """
    Use Ollama/Llama to intelligently filter links based on educational content relevance.

    Args:
        links: Dicts with ``"url"`` and ``"anchor"`` keys.
        llm: Object exposing ``invoke(prompt)`` that returns the reply text.
        batch_size: Number of links sent to the model per prompt.

    Returns:
        The links the model marked as relevant, each at most once per batch.
        If the model call or the parsing of its reply fails for a batch, that
        batch is filtered with a simple keyword heuristic instead.
    """
    if not links:
        return []

    filtered = []
    total_batches = (len(links) + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, len(links), batch_size), 1):
        batch = links[i:i + batch_size]

        # Format links for AI analysis
        links_text = "\n".join([
            f"{idx}. URL: {link['url']}\n   Anchor: {link['anchor']}"
            for idx, link in enumerate(batch, 1)
        ])

        prompt = f"""You are filtering web links for an educational database crawler.

INCLUDE links about:
- Courses, programs, degrees, diplomas, certificates
- Admissions, eligibility, requirements, applications
- Curriculum, syllabus, course structures, pathways
- Academic departments with program listings

EXCLUDE links about:
- Staff profiles, news, events, research papers
- Libraries, IT services, student portals
- Administration, governance, about pages
- Login pages, downloads, galleries

Links to analyze:
{links_text}

Return ONLY a JSON array of relevant link numbers, e.g., [1, 3, 5]
If none are relevant, return []

JSON:"""

        try:
            response = llm.invoke(prompt)
            relevant_indices = _parse_relevant_indices(response, len(batch))
        except Exception as e:
            print(f"    âš  AI filter error: {e}")
            # Fallback to simple keyword filtering. Nothing from this batch has
            # been added yet, so the fallback cannot duplicate AI-selected links.
            filtered.extend(_keyword_filter(batch))
        else:
            filtered.extend(batch[idx - 1] for idx in relevant_indices)
            print(f"    [Filter Batch {batch_num}/{total_batches}] {len(relevant_indices)}/{len(batch)} links relevant")

        # Rate limiting
        if batch_num < total_batches:
            time.sleep(1)

    return filtered

print("âœ“ AI link filter defined")

## Step 9: AI-Powered Page Expansion Decision

In [None]:
def _first_json_object(text: str):
    """Return the first JSON object (dict) that can be decoded from ``text``, or None.

    Decoding is attempted at every ``{`` in turn, so an object wrapped in
    prose or a markdown fence, followed by trailing text, or containing
    nested braces is still found.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    return None


def _as_flag(value) -> bool:
    """Interpret a JSON value as a boolean; strings such as "false" or "no" count as False."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "y", "1")
    return bool(value)


def should_expand_page(page_content: str, url: str, llm) -> tuple[bool, str]:
    """
    Use Ollama/Llama to decide if page should be expanded for further crawling.
    Returns (should_expand, reason)

    Args:
        page_content: Text of the page; only the first 3000 characters are sent.
        url: Address of the page, included in the prompt.
        llm: Object exposing ``invoke(prompt)`` that returns the reply text.

    Any failure (model error, unparseable reply) yields ``(False, <message>)``
    rather than raising, so a bad reply never expands a page.
    """
    prompt = f"""Analyze if this webpage contains educational program information worth crawling deeper.

URL: {url}

EXPAND if page contains:
- Course/program listings or descriptions
- Admission requirements or eligibility criteria
- Curriculum details or academic pathways
- Certificate/diploma program information

SKIP if page is about:
- Staff profiles, news, events, research
- Administrative info, IT services, libraries
- About us, history, governance, rankings

Page content (first 3000 chars):
{page_content[:3000]}

Return ONLY a JSON object: {{"expand": true/false, "reason": "brief explanation"}}

JSON:"""

    try:
        response = llm.invoke(prompt)
        result = _first_json_object(response.strip())
        if result is None:
            return False, "Failed to parse AI response"

        # A model may emit the flag as a string; bool("false") would be True,
        # so the value is interpreted explicitly.
        should_expand = _as_flag(result.get("expand", False))

        reason = result.get("reason")
        reason = "No reason provided" if reason is None else str(reason)
        return should_expand, reason

    except Exception as e:
        print(f"    âš  AI expansion error: {e}")
        return False, f"Error: {str(e)}"

print("âœ“ Page expansion function defined")

## Step 10: Academic Crawler Class

In [None]:
class AcademicCrawler:
    """
    Recursive academic web crawler with Ollama/Llama-based filtering.

    Each fetched page is written as a JSON file under
    ``<output_dir>/depth_<n>/``. A page's links are followed only when the
    model judges the page worth expanding and ``max_depth`` has not been
    reached; ``visited`` holds every URL that has been attempted.
    """
    def __init__(self, llm, output_dir: str, max_depth: int = 3, rate_limit: int = 1):
        self.llm = llm
        self.output_dir = output_dir
        self.max_depth = max_depth
        # Seconds to sleep before each recursive fetch.
        self.rate_limit = rate_limit
        # Normalized URLs already attempted (including ones whose fetch failed).
        self.visited: Set[str] = set()

    def recursive_ingest(self, url: str, base_url: str, depth: int,
                         parent_url: "str | None" = None) -> None:
        """Recursively crawl pages starting from the given URL.

        Args:
            url: Page to fetch; it is normalized before use.
            base_url: Crawl root. Only links on its host are followed.
            depth: Depth of ``url`` (0 for the start page).
            parent_url: URL of the page that linked to ``url``. Stored in the
                saved metadata for pages at depth > 0. When omitted,
                ``base_url`` is recorded instead.
        """
        url = normalize_url(url)
        if url in self.visited:
            return
        self.visited.add(url)

        print(f"\n[DEPTH {depth}] Fetching: {url}")
        try:
            page = fetch_page(url)
        except Exception as e:
            print(f"    âœ— ERROR: {e}")
            return

        # Decide if page should be expanded
        should_expand, reason = should_expand_page(page["text"], url, self.llm)

        if should_expand:
            print(f"    âœ“ EXPAND: {reason}")
        else:
            print(f"    âœ— SKIP: {reason}")

        # Save page with metadata. The start page has no parent.
        recorded_parent = (parent_url or base_url) if depth > 0 else None
        self._save_page(page, url, depth, recorded_parent, should_expand, reason)

        # Stop conditions
        if depth >= self.max_depth:
            print(f"    â†’ Max depth reached")
            return

        if not should_expand:
            return

        # Extract and filter links
        links = extract_internal_links_with_anchor(page["html"], url)
        print(f"    â†’ Extracted {len(links)} total links, filtering with AI...")
        relevant_links = filter_relevant_urls_with_ai(links, self.llm, batch_size=15)
        print(f"    â†’ Final: {len(relevant_links)} educational links to explore")

        # Recursively crawl relevant links
        base_netloc = urlparse(base_url).netloc
        for link in relevant_links:
            next_url = normalize_url(link["url"])
            if urlparse(next_url).netloc != base_netloc:
                continue
            if next_url not in self.visited:
                time.sleep(self.rate_limit)
                # Pass the current page as the parent so the saved metadata
                # reflects the real link path, not just the crawl root.
                self.recursive_ingest(next_url, base_url, depth + 1, parent_url=url)

    def _save_page(self, page, url: str, depth: int, parent_url,
                   should_expand: bool, reason: str) -> None:
        """Write one page plus crawl metadata to ``<output_dir>/depth_<depth>/<name>.json``.

        The file name is the URL path with ``/`` and ``.`` replaced by ``_``
        (``root`` for an empty path), so distinct paths such as ``/a/b`` and
        ``/a_b`` map to the same file at the same depth. A failed write is
        reported and skipped so one bad page does not abort the crawl.
        """
        depth_dir = os.path.join(self.output_dir, f"depth_{depth}")
        os.makedirs(depth_dir, exist_ok=True)
        filename = (
            urlparse(url).path.strip("/")
            .replace("/", "_")
            .replace(".", "_")
            or "root"
        )
        out_path = os.path.join(depth_dir, f"{filename}.json")
        page_json = {
            **page,
            "crawl_depth": depth,
            "parent_url": parent_url,
            "should_expand": should_expand,
            "expand_reason": reason
        }
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(page_json, f, ensure_ascii=False, indent=2)
        except OSError as e:
            # e.g. a file name longer than the filesystem allows.
            print(f"    [WARN] Could not save {out_path}: {e}")

print("âœ“ Academic Crawler class defined")

## Step 11: Run the Crawler

In [None]:
# Configuration
START_URL = "https://cmb.ac.lk/"  # Change to your target university
MAX_DEPTH = 2  # Start with depth 2 for testing
RATE_LIMIT = 2  # Seconds between requests
# NOTE(review): "/content" looks like a Google Colab path - adjust when running elsewhere.
OUTPUT_DIR = "/content/crawled_pages"

# Echo the configuration before the (potentially long) crawl begins.
print(f"""
=============================
University Academic Crawler
=============================
[CONFIG]
  start_url  = {START_URL}
  max_depth  = {MAX_DEPTH}
  rate_limit = {RATE_LIMIT}s
  output_dir = {OUTPUT_DIR}
  model      = Llama 3.2 (Ollama)

[GOAL]
  Building database of:
  - Educational courses & programs
  - Admission criteria & requirements
  - Certificates & diplomas
  - Academic pathways & progressions
=============================
""")

# Initialize and run crawler
# (`llm` is the model object created in the initialization cell above)
crawler = AcademicCrawler(
    llm=llm,
    output_dir=OUTPUT_DIR,
    max_depth=MAX_DEPTH,
    rate_limit=RATE_LIMIT
)

# Start crawling
# The start URL is passed both as the first page and as base_url; the crawler
# only follows links whose host matches base_url's host.
crawler.recursive_ingest(START_URL, START_URL, 0)

# NOTE: `visited` also contains URLs whose fetch failed (a URL is added before
# it is fetched), so this total can exceed the number of JSON files written.
print(f"""
=============================
[CRAWL COMPLETE]
=============================
Total pages crawled: {len(crawler.visited)}
Output directory: {OUTPUT_DIR}
=============================
""")

## Step 12: Analyze Crawled Data

In [None]:
import glob
from collections import Counter

# Count files per depth
# Only depth_0 .. depth_MAX_DEPTH are inspected; depths with no directory are omitted.
depth_counts = {}
for depth in range(MAX_DEPTH + 1):
    depth_dir = os.path.join(OUTPUT_DIR, f"depth_{depth}")
    if os.path.exists(depth_dir):
        count = len([f for f in os.listdir(depth_dir) if f.endswith('.json')])
        depth_counts[depth] = count

print("\nðŸ“Š Crawl Statistics:")
print("=" * 40)
for depth, count in sorted(depth_counts.items()):
    print(f"Depth {depth}: {count} pages")
print(f"\nTotal pages: {sum(depth_counts.values())}")

# Sample a crawled page
# Sorted so the same file is shown on every run (glob order is arbitrary).
json_files = sorted(glob.glob(os.path.join(OUTPUT_DIR, "**/*.json"), recursive=True))
if json_files:
    print(f"\nðŸ“„ Sample crawled page:")
    # The crawler writes these files as UTF-8 with ensure_ascii=False, so they
    # must be read back as UTF-8 rather than with the platform default encoding.
    with open(json_files[0], 'r', encoding="utf-8") as f:
        sample = json.load(f)
        print(f"URL: {sample['url']}")
        print(f"Depth: {sample['crawl_depth']}")
        print(f"Should Expand: {sample['should_expand']}")
        print(f"Reason: {sample.get('expand_reason', 'N/A')}")
        print(f"Content length: {len(sample['text'])} chars")

## Step 13: Download Crawled Data (Optional)

In [None]:
# Create a zip file of all crawled data
!zip -r /content/crawled_data.zip {OUTPUT_DIR}

print("âœ“ Crawled data saved to: /content/crawled_data.zip")
print("You can download this file from the Files panel on the left.")