In [None]:
def crawl_links(start_url, max_pages=10000):
    """
    Web crawler that recursively visits URLs and extracts links from each page.
    
    Args:
        start_url (str): The starting URL to begin crawling
        max_pages (int): Maximum number of pages to crawl (safety limit)
        
    Returns:
        set: Set of all discovered links matching the base URL
    """
    from urllib.parse import urlparse, urljoin
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    import time
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Initialize the browser
    driver = webdriver.Chrome(options=chrome_options)
    
    # Extract the base URL for comparison
    parsed_url = urlparse(start_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    print(f"\n=== CRAWLER STARTING ===")
    print(f"Starting URL: {start_url}")
    print(f"Base URL for filtering: {base_url}")
    
    # Initialize queue and visited set
    queue = [start_url]
    visited = set()
    all_discovered_links = set()
    
    pages_visited = 0
    
    try:
        while queue and pages_visited < max_pages:
            # Get the next URL from the queue
            current_url = queue.pop(0)
            
            # Skip if already visited
            if current_url in visited:
                continue
            
            print(f"\n[{pages_visited + 1}/{max_pages}] Processing: {current_url}")
            
            # Visit the URL
            try:
                driver.get(current_url)
                time.sleep(0.1)  # Wait for page to load
                
                # Mark as visited
                visited.add(current_url)
                pages_visited += 1
                
                # Check if we're on the right site
                current_page_url = driver.current_url
                current_parsed = urlparse(current_page_url)
                current_base = f"{current_parsed.scheme}://{current_parsed.netloc}/"
                
                if current_base != base_url:
                    print(f"ERROR: We've navigated away from {base_url} to {current_base}")
                    continue
                
                # Scroll to make sure all content is loaded
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.5)
                driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(0.5)
                
                # Get ALL anchor elements with href attributes
                link_elements = driver.find_elements(By.CSS_SELECTOR, "a[href]")
                print(f"Found {len(link_elements)} total links on the page")
                
                # Extract and process links
                new_links = 0
                for element in link_elements:
                    try:
                        href = element.get_attribute('href')
                        
                        if not href or href in ['#', 'javascript:void(0)', 'javascript:;']:
                            continue
                        
                        # Make URL absolute
                        if not href.startswith('http'):
                            href = urljoin(current_page_url, href)
                        
                        # Parse the link to get its base URL
                        link_parsed = urlparse(href)
                        link_base = f"{link_parsed.scheme}://{link_parsed.netloc}/"
                        
                        # Only process links with matching base URL
                        if link_base == base_url:
                            # Add to all discovered links
                            all_discovered_links.add(href)
                            
                            # If not visited and not in queue, add to queue
                            if href not in visited and href not in queue:
                                queue.append(href)
                                new_links += 1
                    except Exception as e:
                        continue
                
                print(f"Added {new_links} new links to the queue")
                print(f"Queue size: {len(queue)}")
                print(f"Total discovered links: {len(all_discovered_links)}")
                
            except Exception as e:
                print(f"Error processing {current_url}: {str(e)}")
        
        if pages_visited >= max_pages:
            print(f"\n=== MAXIMUM PAGES LIMIT REACHED ({max_pages}) ===")
        else:
            print(f"\n=== CRAWL COMPLETE ===")
        
        print(f"Total pages visited: {pages_visited}")
        print(f"Total unique links discovered: {len(all_discovered_links)}")
        
        # Print the array of all discovered links
        print("\n=== ALL DISCOVERED LINKS ===")
        for link in all_discovered_links:
            print(link)
        
        return all_discovered_links
        
    except Exception as e:
        print(f"Crawler error: {str(e)}")
        return all_discovered_links
    finally:
        # Always close the browser
        driver.quit()

In [4]:
crawl_links("https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter")


=== CRAWLER STARTING ===
Starting URL: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Base URL for filtering: https://catalog.manoa.hawaii.edu/

[1/100] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Found 178 total links on the page
Added 155 new links to the queue
Queue size: 155
Total discovered links: 155

[2/100] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#select_catalog
Found 178 total links on the page
Added 0 new links to the queue
Queue size: 154
Total discovered links: 155

[3/100] Processing: https://catalog.manoa.hawaii.edu/search_advanced.php?catoid=2
Fo

KeyboardInterrupt: 

In [10]:
def ai_crawler(start_url, extraction_prompt, output_file="scraped_data.json", max_pages=50, google_api_key=None):
    """
    AI-powered web crawler that visits URLs, extracts structured data using Gemini-1.5-lite,
    and saves results to a JSON file.
    
    Args:
        start_url (str): The starting URL to begin crawling
        extraction_prompt (str): Prompt for the LLM describing what data to extract
        output_file (str): Path to save the extracted JSON data
        max_pages (int): Maximum number of pages to crawl (safety limit)
        google_api_key (str): Google API key for Gemini models
        
    Returns:
        list: List of all extracted data items
    """
    from urllib.parse import urlparse, urljoin
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    import time
    import json
    import os
    import google.generativeai as genai
    
    # Initialize Google Gemini client
    if not google_api_key:
        google_api_key = os.environ.get("GOOGLE_API_KEY")
    
    if not google_api_key:
        raise ValueError("Google API key must be provided either directly or via GOOGLE_API_KEY environment variable")
    
    genai.configure(api_key=google_api_key)
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Initialize the browser
    driver = webdriver.Chrome(options=chrome_options)
    
    # Extract the base URL for comparison
    parsed_url = urlparse(start_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    print(f"\n=== AI CRAWLER STARTING ===")
    print(f"Starting URL: {start_url}")
    print(f"Base URL for filtering: {base_url}")
    
    # Initialize queue and visited set
    queue = [start_url]
    visited = set()
    all_discovered_links = set()
    
    # Initialize data storage
    all_extracted_data = []
    
    pages_visited = 0
    
    # Function to extract data using Gemini
    def extract_with_gemini(page_content, page_url):
        try:
            # Create a combined prompt with page URL and content
            combined_prompt = f"""
URL: {page_url}

EXTRACTION INSTRUCTIONS:
{extraction_prompt}

PAGE CONTENT:
{page_content}

Please extract the requested information as a valid JSON array. Each item should be a JSON object.
If no relevant information is found, return an empty array [].
Return ONLY a valid JSON array without any explanations, markdown formatting, or additional text.
"""
            # Set up the model
            model = genai.GenerativeModel('gemini-1.5-flash')
            
            # Call the Gemini API
            response = model.generate_content(
                combined_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.1,  # Lower temperature for more consistent outputs
                    max_output_tokens=8192,
                    response_mime_type="application/json"
                )
            )
            
            # Get the model response
            llm_response = response.text.strip()
            
            # Extract just the JSON part (in case the model added explanations)
            json_str = llm_response
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0].strip()
            elif "```" in json_str:
                json_str = json_str.split("```")[1].split("```")[0].strip()
                
            # Parse the JSON response
            extracted_data = json.loads(json_str)
            
            # Ensure we have a list
            if not isinstance(extracted_data, list):
                extracted_data = [extracted_data]
                
            # Add source URL to each item
            for item in extracted_data:
                if isinstance(item, dict):
                    item['source_url'] = page_url
                    item['extraction_timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S")
                    
            return extracted_data
            
        except Exception as e:
            print(f"Error extracting data with Gemini: {str(e)}")
            return []
    
    # Function to save data to JSON file
    def save_to_json(data, filepath):
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Data saved to {filepath}")
        except Exception as e:
            print(f"Error saving data to {filepath}: {str(e)}")
            # Create backup file
            backup_file = f"{filepath}.backup_{int(time.time())}.json"
            try:
                with open(backup_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"Backup data saved to {backup_file}")
            except:
                pass
    
    try:
        while queue and pages_visited < max_pages:
            # Get the next URL from the queue
            current_url = queue.pop(0)
            
            # Skip if already visited
            if current_url in visited:
                continue
            
            print(f"\n[{pages_visited + 1}/{max_pages}] Processing: {current_url}")
            
            # Visit the URL
            try:
                driver.get(current_url)
                time.sleep(0.1)  # Wait for page to load
                
                # Mark as visited
                visited.add(current_url)
                pages_visited += 1
                
                # Check if we're on the right site
                current_page_url = driver.current_url
                current_parsed = urlparse(current_page_url)
                current_base = f"{current_parsed.scheme}://{current_parsed.netloc}/"
                
                if current_base != base_url:
                    print(f"ERROR: We've navigated away from {base_url} to {current_base}")
                    continue
                
                # Scroll to make sure all content is loaded
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.1)
                driver.execute_script("window.scrollTo(0, 0);")
                time.sleep(0.1)
                
                # Get page content
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                
                # Clean up page content (remove scripts, styles, etc.)
                for tag in soup(["script", "style", "noscript", "iframe", "meta"]):
                    tag.decompose()
                
                # Get the cleaned text
                page_text = soup.get_text(separator="\n", strip=True)
                page_text = "\n".join(line.strip() for line in page_text.split("\n") if line.strip())
                
                print(f"Page content length: {len(page_text)} characters")
                
                # Process with Gemini and extract data
                print("Extracting data with Gemini-1.5-lite...")
                extracted_items = extract_with_gemini(page_text[:100000], current_page_url)  # Truncate if too long
                
                # Add to overall results
                if extracted_items:
                    print(f"Extracted {len(extracted_items)} items")
                    all_extracted_data.extend(extracted_items)
                else:
                    print("No data extracted from this page")
                
                # Save after each page (incremental saving to prevent data loss)
                print(f"Saving {len(all_extracted_data)} total items to {output_file}")
                save_to_json(all_extracted_data, output_file)
                
                # Get ALL anchor elements with href attributes
                link_elements = driver.find_elements(By.CSS_SELECTOR, "a[href]")
                print(f"Found {len(link_elements)} total links on the page")
                
                # Extract and process links
                new_links = 0
                for element in link_elements:
                    try:
                        href = element.get_attribute('href')
                        
                        if not href or href in ['#', 'javascript:void(0)', 'javascript:;']:
                            continue
                        
                        # Make URL absolute
                        if not href.startswith('http'):
                            href = urljoin(current_page_url, href)
                        
                        # Parse the link to get its base URL
                        link_parsed = urlparse(href)
                        link_base = f"{link_parsed.scheme}://{link_parsed.netloc}/"
                        
                        # Only process links with matching base URL
                        if link_base == base_url:
                            # Add to all discovered links
                            all_discovered_links.add(href)
                            
                            # If not visited and not in queue, add to queue
                            if href not in visited and href not in queue:
                                queue.append(href)
                                new_links += 1
                    except Exception as e:
                        continue
                
                print(f"Added {new_links} new links to the queue")
                print(f"Queue size: {len(queue)}")
                print(f"Total discovered links: {len(all_discovered_links)}")
                
            except Exception as e:
                print(f"Error processing {current_url}: {str(e)}")
        
        if pages_visited >= max_pages:
            print(f"\n=== MAXIMUM PAGES LIMIT REACHED ({max_pages}) ===")
        else:
            print(f"\n=== CRAWL COMPLETE ===")
        
        print(f"Total pages visited: {pages_visited}")
        print(f"Total unique links discovered: {len(all_discovered_links)}")
        print(f"Total data items extracted: {len(all_extracted_data)}")
        
        # Final save
        save_to_json(all_extracted_data, output_file)
        
        return all_extracted_data
        
    except Exception as e:
        print(f"Crawler error: {str(e)}")
        
        # Try to save any data collected so far
        if all_extracted_data:
            save_to_json(all_extracted_data, output_file)
            
        return all_extracted_data
    finally:
        # Always close the browser
        driver.quit()

In [None]:
description="Traverse the website and extract all course information. The JSON should contain the following information: course prefix (ACC, CINE, AS, BIOL), course number (101, 105, 475, 477), course title (General Biology, Contemporary Middle East: The Politics of Nationalismm Introduction to Women's Studies), course description (Introduction to biology: Cell structure and function, metabolism, molecular, and organismal genetics, and animal physiology., Economic, political and social forces behind current Middle East tensions: Historical origins of Middle East issues, key current conflicts in the Middle East; analysis of the role of oil, religious and national antagonisms and the geopolotical importance o fthe region in twentieth and twenty-first century superpower policy.), num units (4, V, 3), department name (Anthropology, Sociology, Anthropology), metadata (4 units; CSU GE 'B2/B3').. Return as a JSON array of course objects."
extraction_prompt = """
Extract the following information from course catalog pages:
- course_code: The course code (e.g. CS 101)
- title: The course title
- credits: Number of credit hours
- description: Full course description
- prerequisites: Any listed prerequisites

Return as a JSON array of course objects.
"""

results = ai_crawler(
    start_url="https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter", 
    extraction_prompt=description,
    output_file="course_catalog.json", 
    max_pages=10000,
    google_api_key="AIzaSyAdmHIhoIPCg9gdWCfjTBMVetVy4xgAGLw"  # Or set via GOOGLE_API_KEY env var
)


=== AI CRAWLER STARTING ===
Starting URL: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Base URL for filtering: https://catalog.manoa.hawaii.edu/

[1/10000] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#acalog_template_course_filter
Page content length: 9101 characters
Extracting data with Gemini-1.5-lite...
Extracted 100 items
Saving 100 total items to course_catalog.json
Data saved to course_catalog.json
Found 178 total links on the page
Added 155 new links to the queue
Queue size: 155
Total discovered links: 155

[2/10000] Processing: https://catalog.manoa.hawaii.edu/content.php?catoid=2&catoid=2&navoid=420&filter%5Bitem_type%5D=3&filter%5Bonly_active%5D=1&filter%5B3%5D=1&filter%5Bcpage%5D=1#select_catalog
Page conte