In [None]:
def crawl_links(start_url, max_pages=10000):
    """
    Web crawler that recursively visits URLs and extracts links from each page.
    
    Args:
        start_url (str): The starting URL to begin crawling
        max_pages (int): Maximum number of pages to crawl (safety limit)
        
    Returns:
        set: Set of all discovered links matching the base URL
    """
    from urllib.parse import urlparse, urljoin
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    import time
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Initialize the browser
    driver = webdriver.Chrome(options=chrome_options)
    
    # Extract the base URL for comparison
    parsed_url = urlparse(start_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    print(f"\n=== CRAWLER STARTING ===")
    print(f"Starting URL: {start_url}")
    print(f"Base URL for filtering: {base_url}")
    
    # Initialize queue and visited set
    queue = [start_url]
    visited = set()
    all_discovered_links = set()
    
    pages_visited = 0
    
    try:
        while queue and pages_visited < max_pages:
            # Get the next URL from the queue
            current_url = queue.pop(0)
            
            # Skip if already visited
            if current_url in visited:
                continue
            
            print(f"\n[{pages_visited + 1}/{max_pages}] Processing: {current_url}")
            
            # Visit the URL
            try:
                driver.get(current_url)
                time.sleep(0.1)  # Wait for page to load
                
                # Mark as visited
                visited.add(current_url)
                pages_visited += 1
                
                # Check if we're on the right site
                current_page_url = driver.current_url
                current_parsed = urlparse(current_page_url)
                current_base = f"{current_parsed.scheme}://{current_parsed.netloc}/"
                
                if current_base != base_url:
                    print(f"ERROR: We've navigated away from {base_url} to {current_base}")
                    continue
                
                # Scroll to make sure all content is loaded
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.5)
                driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(0.5)
                
                # Get ALL anchor elements with href attributes
                link_elements = driver.find_elements(By.CSS_SELECTOR, "a[href]")
                print(f"Found {len(link_elements)} total links on the page")
                
                # Extract and process links
                new_links = 0
                for element in link_elements:
                    try:
                        href = element.get_attribute('href')
                        
                        if not href or href in ['#', 'javascript:void(0)', 'javascript:;']:
                            continue
                        
                        # Make URL absolute
                        if not href.startswith('http'):
                            href = urljoin(current_page_url, href)
                        
                        # Parse the link to get its base URL
                        link_parsed = urlparse(href)
                        link_base = f"{link_parsed.scheme}://{link_parsed.netloc}/"
                        
                        # Only process links with matching base URL
                        if link_base == base_url:
                            # Add to all discovered links
                            all_discovered_links.add(href)
                            
                            # If not visited and not in queue, add to queue
                            if href not in visited and href not in queue:
                                queue.append(href)
                                new_links += 1
                    except Exception as e:
                        continue
                
                print(f"Added {new_links} new links to the queue")
                print(f"Queue size: {len(queue)}")
                print(f"Total discovered links: {len(all_discovered_links)}")
                
            except Exception as e:
                print(f"Error processing {current_url}: {str(e)}")
        
        if pages_visited >= max_pages:
            print(f"\n=== MAXIMUM PAGES LIMIT REACHED ({max_pages}) ===")
        else:
            print(f"\n=== CRAWL COMPLETE ===")
        
        print(f"Total pages visited: {pages_visited}")
        print(f"Total unique links discovered: {len(all_discovered_links)}")
        
        # Print the array of all discovered links
        print("\n=== ALL DISCOVERED LINKS ===")
        for link in all_discovered_links:
            print(link)
        
        return all_discovered_links
        
    except Exception as e:
        print(f"Crawler error: {str(e)}")
        return all_discovered_links
    finally:
        # Always close the browser
        driver.quit()

In [4]:
crawl_links("https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter")


=== CRAWLER STARTING ===
Starting URL: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Base URL for filtering: https://catalog.manoa.hawaii.edu/

[1/100] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Found 178 total links on the page
Added 155 new links to the queue
Queue size: 155
Total discovered links: 155

[2/100] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#select_catalog
Found 178 total links on the page
Added 0 new links to the queue
Queue size: 154
Total discovered links: 155

[3/100] Processing: https://catalog.manoa.hawaii.edu/search_advanced.php?catoid=2
Fo

KeyboardInterrupt: 

In [10]:
def ai_crawler(start_url, extraction_prompt, output_file="scraped_data.json", max_pages=50, google_api_key=None):
    """
    AI-powered web crawler that visits URLs, extracts structured data using Gemini-1.5-lite,
    and saves results to a JSON file.
    
    Args:
        start_url (str): The starting URL to begin crawling
        extraction_prompt (str): Prompt for the LLM describing what data to extract
        output_file (str): Path to save the extracted JSON data
        max_pages (int): Maximum number of pages to crawl (safety limit)
        google_api_key (str): Google API key for Gemini models
        
    Returns:
        list: List of all extracted data items
    """
    from urllib.parse import urlparse, urljoin
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    import time
    import json
    import os
    import google.generativeai as genai
    
    # Initialize Google Gemini client
    if not google_api_key:
        google_api_key = os.environ.get("GOOGLE_API_KEY")
    
    if not google_api_key:
        raise ValueError("Google API key must be provided either directly or via GOOGLE_API_KEY environment variable")
    
    genai.configure(api_key=google_api_key)
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Initialize the browser
    driver = webdriver.Chrome(options=chrome_options)
    
    # Extract the base URL for comparison
    parsed_url = urlparse(start_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    print(f"\n=== AI CRAWLER STARTING ===")
    print(f"Starting URL: {start_url}")
    print(f"Base URL for filtering: {base_url}")
    
    # Initialize queue and visited set
    queue = [start_url]
    visited = set()
    all_discovered_links = set()
    
    # Initialize data storage
    all_extracted_data = []
    
    pages_visited = 0
    
    # Function to extract data using Gemini
    def extract_with_gemini(page_content, page_url):
        try:
            # Create a combined prompt with page URL and content
            combined_prompt = f"""
URL: {page_url}

EXTRACTION INSTRUCTIONS:
{extraction_prompt}

PAGE CONTENT:
{page_content}

Please extract the requested information as a valid JSON array. Each item should be a JSON object.
If no relevant information is found, return an empty array [].
Return ONLY a valid JSON array without any explanations, markdown formatting, or additional text.
"""
            # Set up the model
            model = genai.GenerativeModel('gemini-1.5-flash')
            
            # Call the Gemini API
            response = model.generate_content(
                combined_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.1,  # Lower temperature for more consistent outputs
                    max_output_tokens=8192,
                    response_mime_type="application/json"
                )
            )
            
            # Get the model response
            llm_response = response.text.strip()
            
            # Extract just the JSON part (in case the model added explanations)
            json_str = llm_response
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0].strip()
            elif "```" in json_str:
                json_str = json_str.split("```")[1].split("```")[0].strip()
                
            # Parse the JSON response
            extracted_data = json.loads(json_str)
            
            # Ensure we have a list
            if not isinstance(extracted_data, list):
                extracted_data = [extracted_data]
                
            # Add source URL to each item
            for item in extracted_data:
                if isinstance(item, dict):
                    item['source_url'] = page_url
                    item['extraction_timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S")
                    
            return extracted_data
            
        except Exception as e:
            print(f"Error extracting data with Gemini: {str(e)}")
            return []
    
    # Function to save data to JSON file
    def save_to_json(data, filepath):
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Data saved to {filepath}")
        except Exception as e:
            print(f"Error saving data to {filepath}: {str(e)}")
            # Create backup file
            backup_file = f"{filepath}.backup_{int(time.time())}.json"
            try:
                with open(backup_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"Backup data saved to {backup_file}")
            except:
                pass
    
    try:
        while queue and pages_visited < max_pages:
            # Get the next URL from the queue
            current_url = queue.pop(0)
            
            # Skip if already visited
            if current_url in visited:
                continue
            
            print(f"\n[{pages_visited + 1}/{max_pages}] Processing: {current_url}")
            
            # Visit the URL
            try:
                driver.get(current_url)
                time.sleep(0.1)  # Wait for page to load
                
                # Mark as visited
                visited.add(current_url)
                pages_visited += 1
                
                # Check if we're on the right site
                current_page_url = driver.current_url
                current_parsed = urlparse(current_page_url)
                current_base = f"{current_parsed.scheme}://{current_parsed.netloc}/"
                
                if current_base != base_url:
                    print(f"ERROR: We've navigated away from {base_url} to {current_base}")
                    continue
                
                # Scroll to make sure all content is loaded
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.1)
                driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(0.1)
                
                # Get page content
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                
                # Clean up page content (remove scripts, styles, etc.)
                for tag in soup(["script", "style", "noscript", "iframe", "meta"]):
                    tag.decompose()
                
                # Get the cleaned text
                page_text = soup.get_text(separator="\n", strip=True)
                page_text = "\n".join(line.strip() for line in page_text.split("\n") if line.strip())
                
                print(f"Page content length: {len(page_text)} characters")
                
                # Process with Gemini and extract data
                print("Extracting data with Gemini-1.5-lite...")
                extracted_items = extract_with_gemini(page_text[:100000], current_page_url)  # Truncate if too long
                
                # Add to overall results
                if extracted_items:
                    print(f"Extracted {len(extracted_items)} items")
                    all_extracted_data.extend(extracted_items)
                else:
                    print("No data extracted from this page")
                
                # Save after each page (incremental saving to prevent data loss)
                print(f"Saving {len(all_extracted_data)} total items to {output_file}")
                save_to_json(all_extracted_data, output_file)
                
                # Get ALL anchor elements with href attributes
                link_elements = driver.find_elements(By.CSS_SELECTOR, "a[href]")
                print(f"Found {len(link_elements)} total links on the page")
                
                # Extract and process links
                new_links = 0
                for element in link_elements:
                    try:
                        href = element.get_attribute('href')
                        
                        if not href or href in ['#', 'javascript:void(0)', 'javascript:;']:
                            continue
                        
                        # Make URL absolute
                        if not href.startswith('http'):
                            href = urljoin(current_page_url, href)
                        
                        # Parse the link to get its base URL
                        link_parsed = urlparse(href)
                        link_base = f"{link_parsed.scheme}://{link_parsed.netloc}/"
                        
                        # Only process links with matching base URL
                        if link_base == base_url:
                            # Add to all discovered links
                            all_discovered_links.add(href)
                            
                            # If not visited and not in queue, add to queue
                            if href not in visited and href not in queue:
                                queue.append(href)
                                new_links += 1
                    except Exception as e:
                        continue
                
                print(f"Added {new_links} new links to the queue")
                print(f"Queue size: {len(queue)}")
                print(f"Total discovered links: {len(all_discovered_links)}")
                
            except Exception as e:
                print(f"Error processing {current_url}: {str(e)}")
        
        if pages_visited >= max_pages:
            print(f"\n=== MAXIMUM PAGES LIMIT REACHED ({max_pages}) ===")
        else:
            print(f"\n=== CRAWL COMPLETE ===")
        
        print(f"Total pages visited: {pages_visited}")
        print(f"Total unique links discovered: {len(all_discovered_links)}")
        print(f"Total data items extracted: {len(all_extracted_data)}")
        
        # Final save
        save_to_json(all_extracted_data, output_file)
        
        return all_extracted_data
        
    except Exception as e:
        print(f"Crawler error: {str(e)}")
        
        # Try to save any data collected so far
        if all_extracted_data:
            save_to_json(all_extracted_data, output_file)
            
        return all_extracted_data
    finally:
        # Always close the browser
        driver.quit()

In [11]:
description="Traverse the website and extract all course information. The JSON should contain the following information: course prefix (ACC, CINE, AS, BIOL), course number (101, 105, 475, 477), course title (General Biology, Contemporary Middle East: The Politics of Nationalismm Introduction to Women's Studies), course description (Introduction to biology: Cell structure and function, metabolism, molecular, and organismal genetics, and animal physiology., Economic, political and social forces behind current Middle East tensions: Historical origins of Middle East issues, key current conflicts in the Middle East; analysis of the role of oil, religious and national antagonisms and the geopolotical importance o fthe region in twentieth and twenty-first century superpower policy.), num units (4, V, 3), department name (Anthropology, Sociology, Anthropology), metadata (4 units; CSU GE 'B2/B3').. Return as a JSON array of course objects."
extraction_prompt = """
Extract the following information from course catalog pages:
- course_code: The course code (e.g. CS 101)
- title: The course title
- credits: Number of credit hours
- description: Full course description
- prerequisites: Any listed prerequisites

Return as a JSON array of course objects.
"""

results = ai_crawler(
    start_url="https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter", 
    extraction_prompt=description,
    output_file="course_catalog.json", 
    max_pages=10000,
    google_api_key="AIzaSyAdmHIhoIPCg9gdWCfjTBMVetVy4xgAGLw"  # Or set via GOOGLE_API_KEY env var
)


=== AI CRAWLER STARTING ===
Starting URL: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Base URL for filtering: https://catalog.manoa.hawaii.edu/

[1/10000] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Page content length: 9101 characters
Extracting data with Gemini-1.5-lite...
Extracted 100 items
Saving 100 total items to course_catalog.json
Data saved to course_catalog.json
Found 178 total links on the page
Added 155 new links to the queue
Queue size: 155
Total discovered links: 155

[2/10000] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#select_catalog
Page conte

KeyboardInterrupt: 

In [None]:
import time
import json
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def load_translations(translations_file_path):
    """Load course prefix to department name translations from JSON file."""
    try:
        with open(translations_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Warning: Could not load translations file {translations_file_path}: {e}")
        return {}

def parse_course_preview_html(html, url, translations=None):
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('td', class_='block_content', colspan="2")
    if not container:
        return None

    # Course prefix, number, title
    course_title_h1 = container.find('h1', id='course_preview_title')
    prefix = number = title = None
    if course_title_h1:
        text = course_title_h1.get_text(separator=' ', strip=True)
        match = re.match(r'([A-Z]+)\s+([\dA-Z]+)\s*-\s*(.*)', text)
        if match:
            prefix, number, title = match.groups()

    # Units (credits)
    credits_match = re.search(r'Credits:\s*(\d+(\.\d+)?)', container.get_text())
    credits_text = credits_match.group(1) if credits_match else None

    # Full text as lines
    full_text = container.get_text(separator='\n', strip=True)
    credits_pos = full_text.find('Credits:')

    # Extract description: from Credits: to before first <strong> label or end of content
    desc = None
    if credits_pos >= 0:
        # Find where the description ends: position of first <strong> tag's text after credits, or end of text if none found
        strong_tags = container.find_all('strong')
        end_pos = len(full_text)
        for tag in strong_tags:
            label = tag.get_text(strip=True)
            idx = full_text.find(label)
            if idx > credits_pos and idx < end_pos:
                end_pos = idx
        desc = full_text[credits_pos + len('Credits:'):end_pos].strip()

    # Now extract all metadata fields dynamically: all <strong> labels and their following text content
    metadata = {}
    strong_tags = container.find_all('strong')
    for tag in strong_tags:
        label = tag.get_text(strip=True).rstrip(':')
        content_parts = []
        sibling = tag.next_sibling
        while sibling:
            if getattr(sibling, 'name', None) == 'strong':
                break
            if isinstance(sibling, str):
                content_parts.append(sibling.strip())
            else:
                content_parts.append(sibling.get_text(strip=True))
            sibling = sibling.next_sibling
        value = ' '.join([part for part in content_parts if part])
        metadata[label] = value

    # Clean metadata of description-related keys
    metadata.pop('Credits', None)
    if '' in metadata:
        metadata.pop('')

    # Get department name from translations
    dept_name = None
    if translations and prefix:
        dept_name = translations.get(prefix, None)
    
    return {
        'course_prefix': prefix,
        'course_number': number,
        'course_title': title,
        'course_desc': desc,
        'num_units': credits_text,
        'dept_name': dept_name,
        'inst_ipeds': None,     # Placeholder if you want to add later
        'metadata': metadata,
        'source_url': url,
        'extraction_timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
    }

def save_to_json(data, filepath, lock=None):
    try:
        if lock:
            with lock:
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"Saved {len(data)} records to {filepath}")
        else:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(data)} records to {filepath}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

def extract_single_course(course_url, translations, delay=0.1):
    """Extract a single course using its own webdriver instance."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=options)
    
    try:
        driver.get(course_url)
        time.sleep(delay)
        
        page_source = driver.page_source
        parsed = parse_course_preview_html(page_source, course_url, translations)
        return parsed
    except Exception as e:
        print(f"Error extracting course {course_url}: {e}")
        return None
    finally:
        driver.quit()

def crawl_courses(base_catalog_url_template, base_course_url_prefix, output_file, start_page=0, end_page=92, 
                 max_workers=4, delay_between_requests=0.1, batch_size=50):
    """
    Parallelized course crawler with configurable parameters.
    
    Args:
        max_workers: Number of parallel browser instances (default: 4)
        delay_between_requests: Delay between requests in seconds (default: 0.1)
        batch_size: Number of courses to process before saving (default: 50)
    """
    print(f"Starting parallelized crawl with {max_workers} workers, {delay_between_requests}s delay, batch size {batch_size}")
    
    # Load translations for department names
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        script_dir = os.getcwd()
    
    translations_path = os.path.join(script_dir, 'translations.json')
    translations = load_translations(translations_path)
    print(f"Loaded {len(translations)} department translations from {translations_path}")

    # Setup for catalog page collection
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    visited_courses = set()
    all_extracted = []
    file_lock = Lock()
    total_course_count = 0

    # Process each catalog page one by one
    for page_num in range(start_page, end_page + 1):
        catalog_url = base_catalog_url_template.format(page_number=page_num)
        print(f"\nVisiting catalog page {page_num}: {catalog_url}")
        
        # Collect course links from current catalog page
        current_page_course_links = set()
        try:
            driver.get(catalog_url)
            time.sleep(0.1)

            links = driver.find_elements(By.CSS_SELECTOR, "a[href]")
            for link in links:
                href = link.get_attribute('href')
                if href and href.startswith(base_course_url_prefix):
                    if href not in visited_courses:
                        current_page_course_links.add(href)
            
            print(f"  Found {len(current_page_course_links)} new course links on this page.")
        except Exception as e:
            print(f"  Error visiting catalog page {catalog_url}: {e}")
            continue

        if not current_page_course_links:
            continue

        # Process courses from current catalog page in parallel
        course_urls = list(current_page_course_links)
        print(f"\nProcessing {len(course_urls)} courses from page {page_num} with {max_workers} workers...")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all course extraction tasks
            future_to_url = {
                executor.submit(extract_single_course, url, translations, delay_between_requests): url 
                for url in course_urls
            }
            
            # Process completed tasks
            completed_count = 0
            batch_results = []
            
            for future in as_completed(future_to_url):
                course_url = future_to_url[future]
                completed_count += 1
                total_course_count += 1
                
                try:
                    parsed = future.result()
                    if parsed:
                        print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Extracted: {parsed['course_prefix']} {parsed['course_number']} - {parsed['course_title']}")
                        batch_results.append(parsed)
                        all_extracted.append(parsed)
                    else:
                        print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Failed to parse: {course_url}")
                    
                    visited_courses.add(course_url)
                    
                    # Save in batches to prevent data loss while maintaining performance
                    if len(batch_results) >= batch_size or completed_count == len(course_urls):
                        save_to_json(all_extracted, output_file, file_lock)
                        batch_results = []
                        
                except Exception as e:
                    print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Error processing {course_url}: {e}")
        
        print(f"\nCompleted catalog page {page_num}. Total courses extracted so far: {len(all_extracted)}")

    driver.quit()
    print(f"\nParallelized crawling finished. Extracted {len(all_extracted)} courses total.")
    save_to_json(all_extracted, output_file, file_lock)

if __name__ == "__main__":
    BASE_CATALOG_URL_TEMPLATE = (
        "https://catalog.manoa.hawaii.edu/content.php?"
        "catoid=2&catoid=2&navoid=420&"
        "filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&"
        "filter%5B3%5D=1&filter%5Bcpage%5D={page_number}#acalog_template_course_filter"
    )
    BASE_COURSE_URL_PREFIX = "https://catalog.manoa.hawaii.edu/preview_course_nopop.php?catoid=2&coid="
    OUTPUT_FILE = "extracted_coursesV2.json"

    # Parallelization parameters - adjust these for speed vs. server load balance
    MAX_WORKERS = 50        # Number of parallel browser instances (higher = faster, but more resource intensive)
    DELAY = 0.05          # Delay between requests in seconds (lower = faster, but higher server load)
    BATCH_SIZE = 100       # Save frequency (lower = more frequent saves, higher = better performance)

    crawl_courses(
        BASE_CATALOG_URL_TEMPLATE, 
        BASE_COURSE_URL_PREFIX, 
        OUTPUT_FILE, 
        start_page=0, 
        end_page=92,
        max_workers=MAX_WORKERS,
        delay_between_requests=DELAY,
        batch_size=BATCH_SIZE
    )


Starting parallelized crawl with 50 workers, 0.05s delay, batch size 100
Loaded 0 department translations from /Users/rodericktabalba/Documents/GitHub/RAG-system/text_extraction_methods/ai_web_scraper/translations.json

Visiting catalog page 0: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=0#acalog_template_course_filter
  Found 0 new course links on this page.

Visiting catalog page 1: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
  Found 100 new course links on this page.

Processing 100 courses from page 1 with 50 workers...


In [None]:
import time
import json
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def load_translations(translations_file_path):
    """Load course prefix to department name translations from JSON file."""
    try:
        with open(translations_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Warning: Could not load translations file {translations_file_path}: {e}")
        return {}

def parse_course_preview_html(html, url, translations=None):
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('td', class_='block_content', colspan="2")
    if not container:
        return None

    # Course prefix, number, title
    course_title_h1 = container.find('h1', id='course_preview_title')
    prefix = number = title = None
    if course_title_h1:
        text = course_title_h1.get_text(separator=' ', strip=True)
        match = re.match(r'([A-Z]+)\s+([\dA-Z]+)\s*-\s*(.*)', text)
        if match:
            prefix, number, title = match.groups()

    # Units (credits)
    credits_match = re.search(r'Credits:\s*(\d+(\.\d+)?)', container.get_text())
    credits_text = credits_match.group(1) if credits_match else None

    # Full text as lines
    full_text = container.get_text(separator='\n', strip=True)
    credits_pos = full_text.find('Credits:')

    # Extract description: from Credits: to before first <strong> label or end of content
    desc = None
    if credits_pos >= 0:
        # Find where the description ends: position of first <strong> tag's text after credits, or end of text if none found
        strong_tags = container.find_all('strong')
        end_pos = len(full_text)
        for tag in strong_tags:
            label = tag.get_text(strip=True)
            idx = full_text.find(label)
            if idx > credits_pos and idx < end_pos:
                end_pos = idx
        desc = full_text[credits_pos + len('Credits:'):end_pos].strip()

    # Now extract all metadata fields dynamically: all <strong> labels and their following text content
    metadata = {}
    strong_tags = container.find_all('strong')
    for tag in strong_tags:
        label = tag.get_text(strip=True).rstrip(':')
        content_parts = []
        sibling = tag.next_sibling
        while sibling:
            if getattr(sibling, 'name', None) == 'strong':
                break
            if isinstance(sibling, str):
                content_parts.append(sibling.strip())
            else:
                content_parts.append(sibling.get_text(strip=True))
            sibling = sibling.next_sibling
        value = ' '.join([part for part in content_parts if part])
        metadata[label] = value

    # Clean metadata of description-related keys
    metadata.pop('Credits', None)
    if '' in metadata:
        metadata.pop('')

    # Get department name from translations
    dept_name = None
    if translations and prefix:
        dept_name = translations.get(prefix, None)
    
    return {
        'course_prefix': prefix,
        'course_number': number,
        'course_title': title,
        'course_desc': desc,
        'num_units': credits_text,
        'dept_name': dept_name,
        'inst_ipeds': None,     # Placeholder if you want to add later
        'metadata': metadata,
        'source_url': url,
        'extraction_timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
    }

def save_to_json(data, filepath, lock=None):
    try:
        if lock:
            with lock:
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"Saved {len(data)} records to {filepath}")
        else:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(data)} records to {filepath}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

def extract_single_course(course_url, translations, delay=0.1):
    """Extract a single course using its own webdriver instance."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=options)
    
    try:
        driver.get(course_url)
        time.sleep(delay)
        
        page_source = driver.page_source
        parsed = parse_course_preview_html(page_source, course_url, translations)
        return parsed
    except Exception as e:
        print(f"Error extracting course {course_url}: {e}")
        return None
    finally:
        driver.quit()

def crawl_courses(base_catalog_url_template, base_course_url_prefix, output_file, start_page=0, end_page=92, 
                 max_workers=4, delay_between_requests=0.1, batch_size=50):
    """
    Parallelized course crawler with configurable parameters.
    
    Args:
        max_workers: Number of parallel browser instances (default: 4)
        delay_between_requests: Delay between requests in seconds (default: 0.1)
        batch_size: Number of courses to process before saving (default: 50)
    """
    print(f"Starting parallelized crawl with {max_workers} workers, {delay_between_requests}s delay, batch size {batch_size}")
    
    # Load translations for department names
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        script_dir = os.getcwd()
    
    translations_path = os.path.join(script_dir, 'translations.json')
    translations = load_translations(translations_path)
    print(f"Loaded {len(translations)} department translations from {translations_path}")

    # Setup for catalog page collection
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    visited_courses = set()
    all_extracted = []
    file_lock = Lock()
    total_course_count = 0

    # Process each catalog page one by one
    for page_num in range(start_page, end_page + 1):
        catalog_url = base_catalog_url_template.format(page_number=page_num)
        print(f"\nVisiting catalog page {page_num}: {catalog_url}")
        
        # Collect course links from current catalog page
        current_page_course_links = set()
        try:
            driver.get(catalog_url)
            time.sleep(0.1)

            links = driver.find_elements(By.CSS_SELECTOR, "a[href]")
            for link in links:
                href = link.get_attribute('href')
                if href and href.startswith(base_course_url_prefix):
                    if href not in visited_courses:
                        current_page_course_links.add(href)
            
            print(f"  Found {len(current_page_course_links)} new course links on this page.")
        except Exception as e:
            print(f"  Error visiting catalog page {catalog_url}: {e}")
            continue

        if not current_page_course_links:
            continue

        # Process courses from current catalog page in parallel
        course_urls = list(current_page_course_links)
        print(f"\nProcessing {len(course_urls)} courses from page {page_num} with {max_workers} workers...")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all course extraction tasks
            future_to_url = {
                executor.submit(extract_single_course, url, translations, delay_between_requests): url 
                for url in course_urls
            }
            
            # Process completed tasks
            completed_count = 0
            batch_results = []
            
            for future in as_completed(future_to_url):
                course_url = future_to_url[future]
                completed_count += 1
                total_course_count += 1
                
                try:
                    parsed = future.result()
                    if parsed:
                        print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Extracted: {parsed['course_prefix']} {parsed['course_number']} - {parsed['course_title']}")
                        batch_results.append(parsed)
                        all_extracted.append(parsed)
                    else:
                        print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Failed to parse: {course_url}")
                    
                    visited_courses.add(course_url)
                    
                    # Save in batches to prevent data loss while maintaining performance
                    if len(batch_results) >= batch_size or completed_count == len(course_urls):
                        save_to_json(all_extracted, output_file, file_lock)
                        batch_results = []
                        
                except Exception as e:
                    print(f"[{completed_count}/{len(course_urls)}] [Total: {total_course_count}] Error processing {course_url}: {e}")
        
        print(f"\nCompleted catalog page {page_num}. Total courses extracted so far: {len(all_extracted)}")

    driver.quit()
    print(f"\nParallelized crawling finished. Extracted {len(all_extracted)} courses total.")
    save_to_json(all_extracted, output_file, file_lock)

if __name__ == "__main__":
    BASE_CATALOG_URL_TEMPLATE = (
        "https://hilo.hawaii.edu/catalog/undergraduate-courses"
    )
    BASE_COURSE_URL_PREFIX = "https://catalog.manoa.hawaii.edu/preview_course_nopop.php?catoid=2&coid="
    OUTPUT_FILE = "extracted_coursesV2.json"

    # Parallelization parameters - adjust these for speed vs. server load balance
    MAX_WORKERS = 50        # Number of parallel browser instances (higher = faster, but more resource intensive)
    DELAY = 0.05          # Delay between requests in seconds (lower = faster, but higher server load)
    BATCH_SIZE = 100       # Save frequency (lower = more frequent saves, higher = better performance)

    crawl_courses(
        BASE_CATALOG_URL_TEMPLATE, 
        BASE_COURSE_URL_PREFIX, 
        OUTPUT_FILE, 
        start_page=0, 
        end_page=92,
        max_workers=MAX_WORKERS,
        delay_between_requests=DELAY,
        batch_size=BATCH_SIZE
    )


# The following code is the start of the pipeline for retrieving information from the bill overview page

In [None]:
import os
import json
import time
import fitz  # PyMuPDF
import shutil
import tempfile
from urllib.parse import urljoin, urlparse, parse_qs
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc

# Setup output filename
# measure_url = "https://www.capitol.hawaii.gov/session/measure_indiv.aspx?billtype=HB&billnumber=727&year=2025"
measure_url = "https://www.capitol.hawaii.gov/session/measure_indiv.aspx?billtype=HB&billnumber=400&year=2025"
parsed = urlparse(measure_url)
params = parse_qs(parsed.query)
billtype = params.get("billtype", ["UNKNOWN"])[0]
billnumber = params.get("billnumber", ["UNKNOWN"])[0]
year = params.get("year", ["UNKNOWN"])[0]
output_filename = f"{billtype}_{billnumber}_{year}.json"

# Setup download directory
download_dir = tempfile.mkdtemp()

options = uc.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True
})
driver = uc.Chrome(options=options)

def clean_html_text(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

def extract_pdf_text_from_file(file_path):
    try:
        doc = fitz.open(file_path)
        text = "\n".join(page.get_text() for page in doc)
        doc.close()
        return text
    except Exception as e:
        return f"[ERROR extracting PDF text: {e}]"

try:
    driver.get(measure_url)
    time.sleep(0.1)

    # Collect all links inside main-content
    main = driver.find_element(By.ID, "main-content")
    a_tags = main.find_elements(By.XPATH, ".//a[@href]")
    base_url = measure_url
    raw_links = [urljoin(base_url, a.get_attribute("href")) for a in a_tags]
    filtered = [u for u in raw_links if u.lower().endswith((".htm", ".pdf"))]

    # Prefer .htm if both .htm and .pdf exist for same base
    unique_docs = {}
    for link in filtered:
        path = urlparse(link).path
        base = os.path.splitext(os.path.basename(path))[0]
        key = os.path.dirname(path) + "/" + base
        ext = os.path.splitext(path)[1].lower()
        if ext == ".htm":
            unique_docs[key] = link
        elif ext == ".pdf" and key not in unique_docs:
            unique_docs[key] = link

    results = []
    for doc_url in unique_docs.values():
        print(f"Processing: {doc_url}")
        if doc_url.lower().endswith(".htm"):
            driver.get(doc_url)
            time.sleep(3)
            html = driver.page_source
            text = clean_html_text(html)
            results.append({"url": doc_url, "text": text})
        elif doc_url.lower().endswith(".pdf"):
            # Remove old files first
            for f in os.listdir(download_dir):
                os.remove(os.path.join(download_dir, f))
            # Click the PDF link
            driver.get(measure_url)  # Reload base page to stay consistent
            time.sleep(3)
            link_el = driver.find_element(By.XPATH, f'//a[@href="{urlparse(doc_url).path}"]')
            link_el.click()
            time.sleep(5)  # Wait for download

            # Find the downloaded PDF file
            downloaded_pdf = next((os.path.join(download_dir, f)
                                   for f in os.listdir(download_dir)
                                   if f.lower().endswith(".pdf")), None)
            if downloaded_pdf:
                text = extract_pdf_text_from_file(downloaded_pdf)
                results.append({"url": doc_url, "text": text})
            else:
                results.append({"url": doc_url, "text": "[ERROR: PDF not downloaded]"})

    # Save results
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved to {output_filename}")

finally:
    driver.quit()
    shutil.rmtree(download_dir)


Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_CD1_.HTM
Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/GM1329_.PDF
Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_SD2_.HTM
Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_HD1_.HTM
Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_SD1_.HTM
Processing: https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_.HTM
Processing: https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HSCR286_.htm
Processing: https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HD1_HSCR1171_.htm
Processing: https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_SD1_SSCR1253_.htm
Processing: https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_SD2_SSCR1841_.htm
Processing: https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_CD1_CCR157_.htm
Proc

# The following code is for scraping the bills .html data from the website
https://www.capitol.hawaii.gov/sessions/session2025/bills/ 