<a href="https://colab.research.google.com/github/Thiwanka-Sandakalum/ETL-pipline/blob/main/Web_Crawlerv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎓 Academic Web Crawler - Clean Pipeline

A modular web crawler for academic websites. Run the cells in order: each step uses variables created by the steps before it.

## Pipeline Steps:
1. **Setup** - Install packages and Ollama, and download the Llama 3.2 model
2. **Initialize** - Start the Ollama server and initialize the LLM
3. **Configure** - Mount Google Drive and set your target URL and parameters
4. **Discover URLs** - Crawl website and find all links
5. **Filter URLs** - Use AI to select relevant academic pages
6. **Download Content** - Get HTML content from filtered URLs
7. **Extract Text** - Clean HTML and save to .txt files

---

## 📦 Step 1: Installation & Setup

Install all required packages and set up Ollama.

In [None]:
# Install Python packages
# (-q keeps pip's output short. The later cells import langchain_ollama,
#  bs4 (with the lxml parser), requests and tqdm from these packages.)
!pip install -q langchain-ollama beautifulsoup4 lxml requests tqdm

# NOTE: this message is printed unconditionally; it does not check pip's exit status.
print("‚úÖ Packages installed successfully!")

In [None]:
# Install Ollama
# Refresh the apt index and install pciutils (output of both is discarded),
# then download the install script from ollama.com and pipe it to sh.
# NOTE(review): pciutils is presumably needed by the installer for GPU detection — confirm.
!sudo apt update > /dev/null 2>&1
!sudo apt install -y pciutils > /dev/null 2>&1
!curl -fsSL https://ollama.com/install.sh | sh

# NOTE: printed unconditionally; the exit status of the commands above is not checked.
print("‚úÖ Ollama installed successfully!")

In [None]:
# Start the Ollama server in the background so this cell returns.
# A bare `!ollama serve` keeps the server in the foreground: the cell never
# finishes on its own, and interrupting it stops the server again, so the
# `ollama pull` in the following cell has nothing to talk to.
import subprocess
import time

subprocess.Popen(
    ["ollama", "serve"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)
time.sleep(5)  # Give the server some time to start

In [None]:
# Download Llama 3.2 model
# NOTE(review): `ollama pull` presumably needs a running Ollama server — confirm
# that one is up before running this cell.
!ollama pull llama3.2

# NOTE: printed unconditionally; the exit status of the pull is not checked.
print("‚úÖ Llama 3.2 model ready!")

## 🚀 Step 2: Initialize Ollama Server & LLM

Start the Ollama server and initialize the language model.

In [None]:
import subprocess
import threading
import time
from langchain_ollama.llms import OllamaLLM

print("üöÄ Starting Ollama server...")

# Kill any existing Ollama processes
subprocess.run(["pkill", "-9", "ollama"], stderr=subprocess.DEVNULL)
time.sleep(2)

# Start Ollama server in background
def run_ollama_serve():
    """Spawn `ollama serve` as a child process without waiting for it.

    Both output streams of the child are sent to the null device. The
    process handle is not kept, so callers cannot inspect or stop the server
    through this function.
    """
    null_stream = subprocess.DEVNULL
    command = ["ollama", "serve"]
    subprocess.Popen(command, stdout=null_stream, stderr=null_stream)

# Start the server for the first time
# (run_ollama_serve only spawns the process, so the thread finishes almost
#  immediately; the fixed sleep below is what actually waits for the server.)
thread = threading.Thread(target=run_ollama_serve, daemon=True)
thread.start()
time.sleep(5) # Give the server some time to start

# Ensure the model is pulled after the server starts.
# subprocess.run blocks until `ollama pull` exits, so inspect its exit status
# instead of reporting success no matter what happened.
print("‚¨áÔ∏è Pulling Llama 3.2 model if not present...")
pull_result = subprocess.run(["ollama", "pull", "llama3.2"],
                             stdout=subprocess.DEVNULL,
                             stderr=subprocess.DEVNULL)
if pull_result.returncode == 0:
    print("‚úÖ Llama 3.2 model pull initiated (or already present).")
else:
    print(f"‚ö†Ô∏è  'ollama pull llama3.2' exited with code {pull_result.returncode}; "
          "the model may be missing.")


# Initialize LLM with retry logic
# Up to three attempts. After a failed attempt the Ollama server is killed,
# relaunched and the model pulled again before the next try.
print("ü§ñ Initializing Llama 3.2...")
llm = None

for attempt in range(3):
    try:
        llm = OllamaLLM(model="llama3.2", temperature=0)
        # A tiny prompt proves that the server answers and the model is loaded.
        test_response = llm.invoke("Say OK")
        print(f"‚úÖ LLM initialized successfully! Test response: {test_response}")
        break
    except Exception as e:
        if attempt >= 2:
            # Out of attempts: chain the original error so its traceback survives.
            raise Exception(f"‚ùå Failed to initialize LLM: {e}") from e

        print(f"‚ö†Ô∏è  Retry {attempt + 1}/3...")
        # Show why the attempt failed instead of retrying silently.
        print(f"   Cause: {e}")
        # Kill and restart server
        subprocess.run(["pkill", "-9", "ollama"], stderr=subprocess.DEVNULL)
        time.sleep(2)
        threading.Thread(target=run_ollama_serve, daemon=True).start()
        time.sleep(5)
        # Re-attempt pulling the model
        print("‚¨áÔ∏è Retrying Llama 3.2 model pull...")
        subprocess.run(["ollama", "pull", "llama3.2"],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
        time.sleep(2) # Give some time for pull

## ⚙️ Step 3: Configuration

Set your target URL and crawling parameters.

In [None]:
# Mount Google Drive at /content/drive; OUTPUT_DIR in the configuration cell
# points below /content/drive/MyDrive. (google.colab is only importable on Colab.)
from google.colab import drive
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted.")

In [None]:
from urllib.parse import urlparse

# ============================================================
# CONFIGURATION - MODIFY THESE VALUES
# ============================================================

# Target website to crawl
START_URL = "https://mgmt.cmb.ac.lk/mgmt_department-of-accounting"

# Maximum crawl depth (0 = only start page, 1 = start + linked pages, etc.)
MAX_DEPTH = 50
# Number of crawler threads.
# NOTE: the URL-discovery cell assigns MAX_WORKERS = 20 again, so the value
# set here is replaced as soon as that cell runs.
MAX_WORKERS = 100

# Output directory for final text files
# Automatically generate OUTPUT_DIR based on START_URL's domain
parsed_url = urlparse(START_URL)
# Only a leading "www." is dropped (str.replace would also cut it out of the
# middle of a host name), and the ":" of an explicit port is replaced so it
# cannot end up in the directory name.
domain = (
    parsed_url.netloc.lower()
    .removeprefix('www.')
    .replace('.', '_')
    .replace(':', '_')
)
OUTPUT_DIR = f"/content/drive/MyDrive/academic_content_output/{domain}"

# Parallel processing settings
MAX_DOWNLOAD_WORKERS = 10  # Concurrent downloads
AI_BATCH_SIZE = 10         # URLs per AI batch
AI_MAX_WORKERS = 5        # Concurrent AI requests

# Minimum text length to save (characters)
MIN_TEXT_LENGTH = 500

print("‚úÖ Configuration set:")
print(f"   Start URL: {START_URL}")
print(f"   Max Depth: {MAX_DEPTH}")
print(f"   Output Dir: {OUTPUT_DIR}")

## 🕷️ Step 4: URL Discovery

Crawl the website and discover all URLs.

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from typing import Set, List, Optional, Dict
from collections import deque # Import deque
import threading # Import threading for locks
from queue import Queue # Import Queue
from concurrent.futures import ThreadPoolExecutor # Import ThreadPoolExecutor
# Number of crawler threads used by run_crawler_and_get_urls().
# NOTE: this reassigns MAX_WORKERS; the configuration cell sets it to 100, so
# whichever of the two cells ran last decides the value.
MAX_WORKERS=20

# Keywords to reject during crawling.
# is_crawlable_link() tests each entry against the lower-cased URL path.
REJECT_KEYWORDS = {
    # static assets / media files
    "assets", "attachments", "audio", "css", "downloads", "favicon", "fonts",
    "images", "img", "js", "media", "misc", "pdf", "photo", "pict", "png",
    "scripts", "static", "styles", "themes", "uploads", "video",
    "wp-content", "wp-includes",
    ".jpg", ".jpeg", ".png", ".gif", ".svg", ".mp4", ".mp3", ".zip", ".tar", ".gz",
    # authentication / portals
    "account", "auth", "authenticate", "authentication", "cas", "dashboard",
    "ezproxy", "forgot", "identity", "login", "logout", "mfa", "my-account",
    "my-profile", "netid", "password", "portal", "proxy", "register", "saml",
    "shibboleth", "signin", "signout", "signup", "sso", "user", "validate",
    # news / marketing / media
    "announcement", "archive", "blog", "calendar", "category", "event",
    "events", "feed", "gallery", "magazine", "news", "newsletter", "press",
    "rss", "schedule", "slideshow", "stories", "tags", "upcoming", "view-event",
    # careers / jobs
    "applicant", "benefits", "career", "careers", "compensation",
    "employment", "hiring", "hr", "human-resources", "internship", "job",
    "jobs", "onboarding", "opportunities", "payroll", "position",
    "recruitment", "staff-training", "vacancy", "vacancies",
    # legal / policy pages
    "accessibility", "ada", "compliance", "cookie", "cookies", "copyright",
    "disclaimer", "legal", "license", "maintainer", "maintenance", "policy",
    "privacy", "security", "terms", "terms-and-conditions",
    # social media / external platforms
    "facebook", "instagram", "linkedin", "pinterest", "share", "snapchat",
    "tiktok", "tumblr", "twitter", "vimeo", "whatsapp", "youtube",
    # tracking / analytics
    "analytics", "fbclid", "ga_", "gclid", "google-analytics", "log", "logs",
    "metrics", "pixel", "stats", "tracker", "tracking", "utm_",
    # search / filters / pagination
    "filter", "limit", "offset", "order", "page", "query", "results",
    "search", "sort", "view", "viewitems",
    "tag/", "tags/",
    "category/", "categories/",
    "author/",
    "page/",          # pagination
    "/202",
    # system / administrative
    "admin", "api", "backup", "bin", "cache", "cgi-bin", "config",
    "configuration", "cron", "devel", "dev", "etc", "install", "modules",
    "node/add", "php", "plugins", "server-status", "settings", "sql",
    "structure", "tmp", "update", "upgrade", "var", "wp-admin", "xmlrpc",
}

# File extensions to skip: is_crawlable_link() rejects any URL whose
# lower-cased path ends with one of these suffixes.
REJECT_EXTENSIONS = (
    # images
    ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp",
    # web assets, documents and archives
    ".css", ".js", ".map", ".pdf", ".zip", ".rar",
    # audio / video and office files
    ".mp4", ".mp3", ".doc", ".docx", ".xls", ".xlsx",
)

# ================= GLOBAL STATE =================
# HTTP session reused for every page fetched by extract_program_links().
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

# URLs already claimed by a worker, and the lock that guards the set.
visited_lock = threading.Lock()
visited: Set[str] = set()

# Parent URL -> links extracted from that page, and its lock.
tree_lock = threading.Lock()
program_tree: Dict[str, List[str]] = dict()

# Pending (url, depth) pairs consumed by the worker threads.
task_queue = Queue()

# ================= UTILITIES =================
def normalize(url: str) -> str | None:
    url, _ = urldefrag(url)
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return None
    return parsed._replace(query="").geturl().rstrip("/")

def is_crawlable_link(url: str, base_domain: str) -> bool:
    parsed = urlparse(url)
    if parsed.netloc != base_domain:
        return False
    path = parsed.path.lower()
    if any(k in path for k in REJECT_KEYWORDS):
        return False
    if any(path.endswith(ext) for ext in REJECT_EXTENSIONS):
        return False
    return True

# ================= EXTRACTION =================
def extract_program_links(page_url: str, base_domain: str) -> List[str]:
    """Fetch *page_url* and return the crawlable links found on it.

    Every `<a href>` is resolved against *page_url*, normalized, and kept when
    it has not been visited yet and passes is_crawlable_link(). Duplicates
    are removed while preserving first-seen order.

    Args:
        page_url: Page to download through the shared session.
        base_domain: Netloc the crawl is restricted to.

    Returns:
        The new same-domain links, or an empty list when the request fails
        or the response status is not 200.
    """
    try:
        r = session.get(page_url, timeout=10)
        if r.status_code != 200:
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        links = []

        for a in soup.find_all("a", href=True):
            try:
                full = normalize(urljoin(page_url, a["href"]))
            except ValueError:
                # Malformed href (urllib raises ValueError, e.g. for a broken
                # IPv6 literal): skip this link instead of failing the page.
                continue
            if not full:
                continue

            # Snapshot check only; the worker re-checks under the lock before
            # it actually claims a URL.
            with visited_lock:
                if full in visited:
                    continue

            if is_crawlable_link(full, base_domain):
                links.append(full)

        return list(dict.fromkeys(links))  # preserve order
    except requests.RequestException:
        return []

# ================= WORKER =================
def worker(base_domain: str):
    """Process (url, depth) tasks from task_queue until it stays empty for 2 s.

    Each URL is claimed once through `visited`. Pages shallower than MAX_DEPTH
    are fetched, their links stored in `program_tree`, and the links queued one
    level deeper. `task_done()` is called for every task, including when
    processing fails, so a single bad page cannot leave the queue's
    unfinished-task count stuck.

    Args:
        base_domain: Netloc the crawl is restricted to.
    """
    # Local import: the notebook cell imports only Queue from the queue module.
    from queue import Empty

    while True:
        try:
            url, depth = task_queue.get(timeout=2)
        except Empty:
            return  # Queue empty -> exit worker

        try:
            with visited_lock:
                if url in visited:
                    continue  # the finally clause still marks the task done
                visited.add(url)

            print(f"[Depth {depth}] {url}")

            if depth < MAX_DEPTH:
                children = extract_program_links(url, base_domain)

                with tree_lock:
                    program_tree[url] = children

                for child in children:
                    task_queue.put((child, depth + 1))
        except Exception as e:
            # Thread boundary: report the failure and keep the worker alive.
            print(f"‚ö†Ô∏è  Failed to crawl {url}: {e}")
        finally:
            task_queue.task_done()

# ================= RUNNER =================
def run_crawler_and_get_urls() -> List[str]:
    """Crawl from START_URL with MAX_WORKERS threads and return all visited URLs.

    The normalized start URL is queued at depth 0 and MAX_WORKERS workers are
    started. The function waits for the worker futures rather than for
    `task_queue.join()`: workers return by themselves once the queue stays
    empty, while `join()` would block forever if a worker died before
    calling `task_done()`. Worker failures are printed instead of being
    silently dropped.

    Returns:
        The sorted visited URLs, or an empty list when START_URL cannot be
        normalized.
    """
    start = normalize(START_URL)
    if not start:
        return []

    base_domain = urlparse(start).netloc
    task_queue.put((start, 0))

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        print("max workers",MAX_WORKERS)
        futures = [executor.submit(worker, base_domain) for _ in range(MAX_WORKERS)]

        for future in futures:
            # exception() blocks until that worker has finished.
            error = future.exception()
            if error is not None:
                print(f"‚ö†Ô∏è  Crawler worker stopped early: {error!r}")

    return sorted(visited)

# ================= EXECUTION =================
# Run the crawl and print a short preview of the results.
# `discovered_urls` becomes a notebook global that the AI-filter cell reads.
if __name__ == "__main__":
    discovered_urls = run_crawler_and_get_urls()

    # Preview: only the first 20 URLs are listed, followed by a count of the rest.
    print("\nDiscovered URLs (first 20):")
    for u in discovered_urls[:20]:
        print(u)

    if len(discovered_urls) > 20:
        print(f"... and {len(discovered_urls) - 20} more")

    # Optional: print program tree structure
    # (first 10 parent pages only, each followed by all of its extracted links)
    print("\nProgram tree (sample):")
    for parent, children in list(program_tree.items())[:10]:
        print(f"{parent}")
        for c in children:
            print(f"  ‚îî‚îÄ {c}")

In [None]:
# Print every parent page with the links extracted from it.
# NOTE: despite the "(sample)" label there is no slice here, so the whole
# program_tree is printed.
print("\nProgram tree (sample):")
for parent, children in list(program_tree.items()):
        print(f"{parent}")
        for c in children:
            print(f"  ‚îî‚îÄ {c}")

## 🔍 Step 5: Filter URLs with AI

Use AI and heuristics to select academically relevant URLs.

In [None]:
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import subprocess

import threading

# Serializes Ollama restarts across the filtering threads and records when the
# most recent restart finished (time.monotonic() value).
_OLLAMA_RESTART_LOCK = threading.Lock()
_last_ollama_restart = float("-inf")


def _restart_ollama_server(failed_call_started: float) -> None:
    """Kill and relaunch `ollama serve`, unless another thread just did so."""
    global _last_ollama_restart
    with _OLLAMA_RESTART_LOCK:
        if _last_ollama_restart >= failed_call_started:
            # The server was restarted after our failing call began, so that
            # restart may be what refused us. Killing it again would only
            # break the other threads; the caller simply retries.
            return
        print("Restarting Ollama server...")
        subprocess.run(["pkill", "-9", "ollama"], stderr=subprocess.DEVNULL)
        time.sleep(2)
        subprocess.Popen(["ollama", "serve"],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
        time.sleep(5)
        subprocess.run(["ollama", "pull", "llama3.2"],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
        time.sleep(2)
        _last_ollama_restart = time.monotonic()


def _parse_url_array(response: str):
    """Return the first JSON list found in *response*, or None if there is none."""
    candidates = []
    first, last = response.find("["), response.rfind("]")
    if 0 <= first < last:
        # Widest span first: copes with "]" inside a URL and with nested lists.
        candidates.append(response[first:last + 1])
    # Then every shortest "[...]" span, so bracketed prose before the real
    # array (e.g. "the [filtered] list: [...]") does not hide it.
    candidates.extend(m.group() for m in re.finditer(r'\[.*?\]', response, re.DOTALL))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return parsed
    return None


def ai_filter_batch(urls_batch: list, llm) -> list:
    """Ask the LLM which URLs of a batch describe academic programs or courses.

    Args:
        urls_batch: URLs to classify.
        llm: Object with an `invoke(prompt)` method that returns the model's
            reply as a string.

    Returns:
        The URLs from the model's JSON array that are also members of
        *urls_batch*, in the model's order. An empty list is returned when
        the reply contains no JSON array, when a non-connection error occurs,
        or when connection errors persist for all three attempts.
    """
    urls_text = "\n".join(f"- {url}" for url in urls_batch)

    prompt = f"""You are a specialized URL classifier for university websites. Your ONLY task is to identify URLs that contain information about academic programs, courses, and degrees.

**INCLUDE URLs that contain:**
- Program listings (undergraduate/postgraduate programs)
- Degree information (BSc, MSc, PhD, etc.)
- Course catalogs or course unit descriptions
- Department program pages (what programs a department offers)
- Admission requirements for academic programs
- Curriculum or syllabus details for degrees
- Academic prospectus pages

**EXCLUDE URLs about:**
- Individual staff profiles or staff listings
- Research projects, publications, or research centers
- News articles, events, or announcements
- Student societies, clubs, or social activities
- Field trips, workshops, or seminars
- Facilities, museums, libraries, or laboratories
- Alumni information or past students
- Awards, scholarships (unless part of program description)
- Contact pages, history pages, or "about us" pages
- Committee information or administrative details
- Date-based URLs (e.g., /2018/05/23/)
- Image attachments or gallery pages

**EXAMPLES:**
‚úì INCLUDE: /undergraduate-courses, /postgraduate, /bsc-special-degree, /mat/program
‚úó EXCLUDE: /staff, /academic-staff, /publications, /news, /alumni, /contact

Analyze these URLs and return ONLY those that describe academic programs or courses.

URLs to evaluate:
{urls_text}

Return your response as a JSON array of selected URLs. If none qualify, return [].

JSON array:"""

    known_urls = set(urls_batch)
    max_retries = 3
    for attempt in range(max_retries):
        call_started = time.monotonic()
        try:
            response = llm.invoke(prompt).strip()

            # Extract JSON array
            selected = _parse_url_array(response)
            if selected is None:
                print(f"‚ö†Ô∏è  AI filter: No JSON array in response (attempt {attempt+1}/{max_retries})")
                break
            # Keep only URLs that were really in the batch, so anything the
            # model invents or rewrites is discarded.
            return [url for url in selected
                    if isinstance(url, str) and url in known_urls]
        except Exception as e:
            error_message = str(e).lower()
            if "connection refused" in error_message or isinstance(e, ConnectionRefusedError):
                print(f"‚ö†Ô∏è  Connection error: {e} (attempt {attempt+1}/{max_retries})")
                if attempt < max_retries - 1:
                    time.sleep(2 * (attempt + 1))
                    # Restart Ollama server (coordinated across threads)
                    _restart_ollama_server(call_started)
                else:
                    print(f"‚ùå Failed after {max_retries} attempts")
                    break
            else:
                print(f"‚ö†Ô∏è  General error: {e} (attempt {attempt+1}/{max_retries})")
                break

    return []

def filter_urls_with_ai(urls: list, llm, batch_size: int, max_workers: int) -> list:
    """Filter URLs with the LLM, processing batches in parallel.

    The URLs are split into batches of *batch_size*, each batch is passed to
    ai_filter_batch() on a thread pool, and the selected URLs are merged.

    Args:
        urls: Candidate URLs.
        llm: Language-model client handed to ai_filter_batch().
        batch_size: Number of URLs per LLM request; must be at least 1.
        max_workers: Number of concurrent LLM requests.

    Returns:
        The selected URLs, de-duplicated and sorted.

    Raises:
        ValueError: If *batch_size* is smaller than 1. A negative value
            would otherwise produce no batches and silently return nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"üîç Filtering {len(urls)} URLs using AI...\n")

    batches = [urls[i:i + batch_size]
               for i in range(0, len(urls), batch_size)]

    ai_filtered = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(ai_filter_batch, batch, llm) for batch in batches]

        # No sleep while collecting: every request has already been submitted,
        # so pausing here would not limit the request rate, only delay results.
        for future in tqdm(as_completed(futures),
                          total=len(futures),
                          desc="   Processing batches"):
            try:
                ai_filtered.extend(future.result())
            except Exception as e:
                print(f"‚ö†Ô∏è  Batch processing error: {e}")

    # Deduplicate and sort
    final = sorted(set(ai_filtered))

    print(f"\n‚úÖ AI Filtering complete!")
    print(f"   Final relevant URLs: {len(final)}")
    return final

# Run the AI filter on the crawl results.
# Requires `discovered_urls` (URL-discovery cell), `llm` (LLM initialization
# cell) and AI_BATCH_SIZE / AI_MAX_WORKERS (configuration cell).
filtered_urls = filter_urls_with_ai(
    discovered_urls,
    llm,
    AI_BATCH_SIZE,
    AI_MAX_WORKERS
)

# Preview: the first 20 selected URLs, then a count of the remainder.
print("\nüìã Filtered Program URLs:")
for url in filtered_urls[:20]:
    print(f"   {url}")
if len(filtered_urls) > 20:
    print(f"   ... and {len(filtered_urls) - 20} more")

In [None]:
print("\nüìã Filtered Program URLs:")
for url in filtered_urls:
    print(f"   {url}")

## 📥 Step 6: Download Content

Download HTML content from all filtered URLs.

In [None]:
from typing import Dict, Optional

def download_url(url: str) -> Optional[str]:
    """Download the page at *url* and return its decoded text.

    Args:
        url: Address to fetch (15 second timeout).

    Returns:
        The response body as text, or None when the request fails or the
        server answers with an HTTP error status. Failures are printed.
    """
    try:
        response = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0 (Academic Crawler)"}
        )
        response.raise_for_status()
        # Without a charset in Content-Type, requests assumes ISO-8859-1 for
        # text responses, which garbles UTF-8 pages. Use the encoding
        # detected from the body in that case.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(f"‚ö†Ô∏è  Failed to download {url}: {e}")
        return None

def download_all_content(urls: List[str], max_workers: int) -> Dict[str, str]:
    """Fetch all *urls* concurrently and collect the pages that succeeded.

    Args:
        urls: Addresses to download with download_url().
        max_workers: Size of the thread pool.

    Returns:
        A dict mapping each URL that produced non-empty HTML to that HTML.
    """
    print(f"üì• Downloading content from {len(urls)} URLs...\n")

    pages = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which URL each future belongs to.
        pending = {}
        for target in urls:
            pending[pool.submit(download_url, target)] = target

        progress = tqdm(as_completed(pending),
                        total=len(pending),
                        desc="   Downloading")
        for finished in progress:
            source_url = pending[finished]
            try:
                body = finished.result()
            except Exception as e:
                print(f"‚ö†Ô∏è  Error processing {source_url}: {e}")
                continue
            if body:
                pages[source_url] = body

    print("\n‚úÖ Download complete!")
    print(f"   Successfully downloaded: {len(pages)}/{len(urls)} pages")
    return pages

# Download all content
downloaded_content = download_all_content(filtered_urls, MAX_DOWNLOAD_WORKERS)

# Sizes are measured in characters of decoded HTML (len of each string).
print(f"\nüìä Content statistics:")
total_size = sum(len(html) for html in downloaded_content.values())
print(f"   Total HTML size: {total_size / 1024 / 1024:.2f} MB")
if downloaded_content:
    print(f"   Average page size: {total_size / len(downloaded_content) / 1024:.2f} KB")
else:
    # No page was downloaded (e.g. the filter selected nothing): avoid
    # dividing by zero.
    print("   Average page size: n/a (no pages downloaded)")

## 📝 Step 7: Extract Text & Save Files

Extract clean text from HTML and save to .txt files.

In [None]:
import os
from pathlib import Path

def clean_html_to_text(html: str) -> str:
    """Extract readable text from an HTML document.

    Script, style, navigation, footer, aside, header, iframe and form elements
    are removed, as are HTML comments. The remaining text is returned with one
    non-empty, stripped line per text fragment.

    Args:
        html: Raw HTML markup.

    Returns:
        The cleaned text, lines joined with "\\n".
    """
    # Local import: the notebook imports only BeautifulSoup from bs4.
    from bs4 import Comment

    soup = BeautifulSoup(html, "lxml")

    # Remove unwanted elements
    for tag in soup(["script", "style", "nav", "footer",
                     "aside", "header", "iframe", "form"]):
        tag.decompose()

    # Remove comments. Parsed comments are Comment nodes whose text does not
    # include the "<!--" marker, so testing for that prefix alone never
    # matched them. The prefix test is kept for literal "<!--" text nodes.
    for comment in soup.find_all(
        string=lambda text: isinstance(text, Comment)
        or (isinstance(text, str) and text.strip().startswith("<!--"))
    ):
        comment.extract()

    # Extract text
    text = soup.get_text(separator="\n", strip=True)

    # Clean up whitespace
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    clean_text = "\n".join(lines)

    return clean_text

def safe_filename(url: str) -> str:
    """Build a filesystem-safe ".txt" filename from *url*.

    The name is the netloc followed by the path ("root" when the path is
    empty), with separators and characters unsafe in filenames replaced by
    "_". Names longer than 150 characters are shortened and given an
    8-character hash of the full URL, so that long URLs sharing a prefix do
    not collapse onto the same file.

    Args:
        url: Source URL; its query string and fragment are not used.

    Returns:
        A filename of at most 154 characters, including the ".txt" suffix.
    """
    import hashlib  # local import: stdlib, used only for the truncation suffix

    parsed = urlparse(url)
    path = parsed.path if parsed.path else "root"

    # Create readable filename: URL separators, characters commonly invalid in
    # filenames, and whitespace all become "_".
    unsafe_chars = '/?&:=.\\*"<>| \t\r\n'
    table = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))
    filename = f"{parsed.netloc}{path}".translate(table)

    # Limit length, keeping truncated names unique per URL
    max_stem = 150
    if len(filename) > max_stem:
        digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
        filename = f"{filename[:max_stem - len(digest) - 1]}_{digest}"

    return filename + ".txt"

def save_content_to_files(
    content_map: Dict[str, str],
    output_dir: str,
    min_length: int
) -> int:
    """Extract text from each downloaded page and write it to a .txt file.

    Args:
        content_map: Mapping of URL to raw HTML.
        output_dir: Directory for the text files; created if missing.
        min_length: Pages whose cleaned text is shorter than this many
            characters are skipped.

    Returns:
        The number of files written.

    Each file starts with a "URL: ..." header line and a separator. If two
    URLs map to the same filename, the later one gets a numeric suffix
    ("_2", "_3", ...) instead of overwriting the earlier file. Errors for
    individual pages are printed and do not stop the run.
    """
    print(f"üìù Extracting text and saving files...\n")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    saved_count = 0
    skipped_count = 0
    used_names = set()  # filenames already written during this call

    for url, html in tqdm(content_map.items(), desc="   Processing"):
        try:
            # Extract clean text
            clean_text = clean_html_to_text(html)

            # Skip if too short
            if len(clean_text) < min_length:
                skipped_count += 1
                continue

            # Generate filename, disambiguating collisions within this run
            filename = safe_filename(url)
            if filename in used_names:
                base = Path(filename)
                suffix_no = 2
                while f"{base.stem}_{suffix_no}{base.suffix}" in used_names:
                    suffix_no += 1
                filename = f"{base.stem}_{suffix_no}{base.suffix}"
            used_names.add(filename)
            filepath = output_path / filename

            # Write to file
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write("=" * 80 + "\n\n")
                f.write(clean_text)

            saved_count += 1

        except Exception as e:
            print(f"‚ö†Ô∏è  Error processing {url}: {e}")

    print(f"\n‚úÖ Text extraction complete!")
    print(f"   Files saved: {saved_count}")
    print(f"   Files skipped (too short): {skipped_count}")
    print(f"   Output directory: {output_dir}")

    return saved_count

# Extract and save all content
# Uses `downloaded_content` from the download cell and OUTPUT_DIR /
# MIN_TEXT_LENGTH from the configuration cell; `saved_files` receives the
# number of files written.
saved_files = save_content_to_files(
    downloaded_content,
    OUTPUT_DIR,
    MIN_TEXT_LENGTH
)