In [None]:
print("HEllo World")

In [9]:
import time
import json
import re
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import logging
from urllib.parse import urljoin, urlparse

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HawaiiVolumeScraper:
    def __init__(self, use_delays=False):
        self.driver = None
        self.setup_driver()
        self.processed_urls = set()
        self.max_depth = 3  # Limit depth to avoid excessive recursion
        self.use_delays = use_delays  # Whether to use delays between requests
        
    def setup_driver(self):
        """Setup Chrome driver with anti-detection measures"""
        options = Options()
        
        # Anti-detection settings
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--disable-web-security')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        
        # Make it look like a regular user
        options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        
        # Uncomment to run headless
        # options.add_argument('--headless')
        
        try:
            self.driver = webdriver.Chrome(options=options)
            
            # Additional anti-detection
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            })
            
            logger.info("✅ Chrome driver initialized successfully")
            
        except Exception as e:
            logger.error(f"❌ Error setting up Chrome driver: {e}")
            raise
            
    def maybe_delay(self, min_sec=0.5, max_sec=1):
        """Add optional delay if enabled"""
        if self.use_delays:
            import random
            delay = random.uniform(min_sec, max_sec)
            time.sleep(delay)
        
    def load_page(self, url, timeout=30):
        """Load a page and return BeautifulSoup object"""
        if url in self.processed_urls:
            logger.debug(f"⏭️ Already processed: {url}")
            return None
            
        try:
            logger.info(f"🌐 Loading: {url}")
            self.driver.get(url)
            
            # Wait for page to load
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            
            # Small wait for dynamic content
            time.sleep(0.5)  # Minimal delay that's still needed
            
            # Mark as processed
            self.processed_urls.add(url)
            
            # Get page source
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Check if blocked
            if "403" in self.driver.title or "forbidden" in self.driver.title.lower():
                logger.warning("⚠️ Page might be blocked")
                return None
                
            logger.info("✅ Page loaded successfully")
            return soup
            
        except Exception as e:
            logger.error(f"❌ Error loading {url}: {e}")
            return None
    
    def extract_clean_text(self, soup):
        """Extract clean text from BeautifulSoup object"""
        if not soup:
            return ""
            
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript']):
            element.decompose()
            
        # Get text content
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up text
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n\n', text)
        
        return text.strip()
    
    def is_content_file(self, href):
        """Determine if this is a content file (not a directory)"""
        # Check if it has a file extension
        if '.' in href and not href.endswith('/'):
            return True
        
        # Check for specific patterns that indicate content files
        content_patterns = ['.htm', '.html', '.txt', '.pdf']
        return any(pattern in href.lower() for pattern in content_patterns)
    
    def is_directory(self, href):
        """Check if this is a directory"""
        return href.endswith('/') and href not in ['../', '../']
    
    def is_within_volume(self, url, volume_url):
        """Check if URL is within the target volume"""
        # Ensure we're not navigating outside our target volume
        return url.startswith(volume_url) and url != volume_url
    
    def get_links_in_volume(self, soup, current_url, volume_url):
        """Extract links that are within the target volume"""
        if not soup:
            return []
            
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            
            # Skip unwanted links
            if href in ['../', '../', '#', ''] or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
                
            # Build full URL
            if href.startswith('http'):
                full_url = href
            elif href.startswith('/'):
                base_domain = '/'.join(current_url.split('/')[:3])
                full_url = base_domain + href
            else:
                full_url = urljoin(current_url, href)
                
            # Only include links within our target volume
            if not self.is_within_volume(full_url, volume_url):
                continue
                
            link_info = {
                'url': full_url,
                'href': href,
                'text': link.get_text(strip=True),
                'is_file': self.is_content_file(href),
                'is_directory': self.is_directory(href)
            }
            
            links.append(link_info)
            
        return links
    
    def crawl_volume(self, volume_url, max_files=100, save_callback=None):
        """
        Crawl a specific volume and extract files
        
        Args:
            volume_url: URL of the volume to crawl
            max_files: Maximum number of files to extract
            save_callback: Function to call after each file is processed
        """
        logger.info(f"🔍 Starting to crawl volume: {volume_url}")
        
        # Reset processed URLs for this volume
        self.processed_urls = set()
        
        # Queue of directories to process
        queue = [(volume_url, 0)]  # (url, depth)
        extracted_files = []
        
        while queue and len(extracted_files) < max_files:
            current_url, depth = queue.pop(0)
            
            # Skip if we've already processed this URL
            if current_url in self.processed_urls:
                continue
                
            # Skip if we're too deep
            if depth > self.max_depth:
                logger.info(f"⏭️ Skipping {current_url} - max depth reached")
                continue
                
            # Skip if not in our volume
            if not self.is_within_volume(current_url, volume_url) and current_url != volume_url:
                logger.info(f"⏭️ Skipping {current_url} - outside target volume")
                continue
            
            # Load the page
            soup = self.load_page(current_url)
            if not soup:
                continue
                
            # Get links in this page
            links = self.get_links_in_volume(soup, current_url, volume_url)
            
            # Separate files and directories
            files = [link for link in links if link['is_file']]
            directories = [link for link in links if link['is_directory']]
            
            logger.info(f"📂 Found {len(files)} files and {len(directories)} directories in {current_url}")
            
            # Process files
            for file_link in files:
                if len(extracted_files) >= max_files:
                    logger.info(f"🛑 Reached file limit ({max_files})")
                    break
                    
                try:
                    file_url = file_link['url']
                    
                    # Skip if already processed
                    if file_url in self.processed_urls:
                        continue
                        
                    logger.info(f"📄 Processing file: {file_link['href']}")
                    
                    file_soup = self.load_page(file_url)
                    if file_soup:
                        extracted_text = self.extract_clean_text(file_soup)
                        
                        file_data = {
                            "name": file_link['href'],
                            "url": file_url,
                            "type": "file",
                            "parent_directory": current_url,
                            "link_text": file_link['text'],
                            "text": extracted_text,
                            "text_length": len(extracted_text),
                            "depth": depth,
                            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                        }
                        
                        extracted_files.append(file_data)
                        logger.info(f"✅ Extracted {len(extracted_text)} characters from {file_link['href']}")
                        
                        # Save after each file if callback provided
                        if save_callback:
                            save_callback(extracted_files, file_data)
                        
                    # Optional delay between files
                    self.maybe_delay(0.5, 1)
                    
                except Exception as e:
                    logger.error(f"❌ Error processing file {file_link['href']}: {e}")
                    continue
            
            # Add directories to the queue
            for dir_link in directories:
                dir_url = dir_link['url']
                
                # Only add if within our volume and not already processed
                if self.is_within_volume(dir_url, volume_url) and dir_url not in self.processed_urls:
                    queue.append((dir_url, depth + 1))
            
            # Optional delay between directories
            self.maybe_delay(0.5, 1)
        
        logger.info(f"✅ Volume crawling completed: {len(extracted_files)} files extracted")
        return extracted_files
    
    def close(self):
        """Close the browser"""
        if self.driver:
            self.driver.quit()
            logger.info("🔒 Browser closed")

def fix_empty_volumes():
    """Fix volumes that have no files in them"""
    print("🔧 Hawaii Empty Volume Fixer")
    print("=" * 50)
    
    # Load existing data
    input_file = "hawaii_complete_content.json"
    if not os.path.exists(input_file):
        print(f"❌ File not found: {input_file}")
        return
    
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        print(f"📋 Loaded {len(data)} volumes from {input_file}")
        
        # Check for empty volumes
        empty_volumes = []
        for i, volume in enumerate(data):
            files = volume.get('files', [])
            if not files:
                empty_volumes.append((i, volume))
                print(f"🔍 Found empty volume {i}: {volume.get('volume_name', 'Unknown')}")
        
        if not empty_volumes:
            print("✅ No empty volumes found!")
            return
        
        print(f"🔍 Found {len(empty_volumes)} empty volumes")
        
        # Ask which volumes to process
        print("\nVolumes to process:")
        for i, (idx, vol) in enumerate(empty_volumes):
            print(f"{i+1}. Volume {idx}: {vol.get('volume_name', 'Unknown')}")
        
        selection = input("\nEnter volume numbers to process (comma-separated, or 'all'): ").strip()
        
        if selection.lower() == 'all':
            selected_indices = list(range(len(empty_volumes)))
        else:
            selected_indices = []
            for part in selection.split(','):
                try:
                    idx = int(part.strip()) - 1
                    if 0 <= idx < len(empty_volumes):
                        selected_indices.append(idx)
                except ValueError:
                    continue
        
        if not selected_indices:
            print("❌ No valid volumes selected")
            return
            
        # Ask about using delays
        use_delays = input("Use delays between requests? (y/n, default: n): ").strip().lower() == 'y'
        if use_delays:
            print("Using delays between requests (slower but safer)")
        else:
            print("Not using delays (faster but may trigger rate limits)")
        
        # Process selected volumes
        scraper = HawaiiVolumeScraper(use_delays=use_delays)
        
        try:
            for sel_idx in selected_indices:
                data_idx, volume = empty_volumes[sel_idx]
                volume_url = volume.get('volume_url')
                volume_name = volume.get('volume_name', 'Unknown')
                
                if not volume_url:
                    print(f"❌ No URL found for volume {data_idx}")
                    continue
                
                print(f"\n📂 Processing volume {data_idx}: {volume_name}")
                print(f"🌐 URL: {volume_url}")
                
                # Get max files to process
                max_files = input(f"Enter max files for volume {volume_name} (default 100): ").strip()
                max_files = int(max_files) if max_files.isdigit() else 100
                
                # Create a save callback function
                def save_after_each_file(files_so_far, current_file):
                    # Update the volume in the data
                    data[data_idx]['files'] = files_so_far
                    data[data_idx]['total_files'] = len(files_so_far)
                    data[data_idx]['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S")
                    
                    # Save the entire data
                    with open(input_file, 'w', encoding='utf-8') as f:
                        json.dump(data, f, indent=2, ensure_ascii=False)
                    
                    # Log the save
                    logger.info(f"💾 Saved progress: {len(files_so_far)} files in volume {volume_name}")
                
                # Crawl the volume with save callback
                volume_files = scraper.crawl_volume(
                    volume_url, 
                    max_files=max_files,
                    save_callback=save_after_each_file
                )
                
                if volume_files:
                    print(f"✅ Extracted {len(volume_files)} files from volume {volume_name}")
                    
                    # Final update for this volume
                    data[data_idx]['files'] = volume_files
                    data[data_idx]['total_files'] = len(volume_files)
                    data[data_idx]['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S")
                    
                    # Save after completing the volume
                    with open(input_file, 'w', encoding='utf-8') as f:
                        json.dump(data, f, indent=2, ensure_ascii=False)
                    
                    print(f"💾 Final volume data saved to {input_file}")
                else:
                    print(f"⚠️ No files extracted from volume {volume_name}")
            
            # Final save
            with open(input_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            
            # Print summary
            total_files = sum(len(volume.get('files', [])) for volume in data)
            print(f"\n🎉 PROCESSING COMPLETED!")
            print(f"📚 Volumes in data: {len(data)}")
            print(f"📄 Total files: {total_files:,}")
            print(f"💾 Data saved to: {input_file}")
            
        finally:
            scraper.close()
            
    except Exception as e:
        print(f"❌ Error processing volumes: {e}")

# Run the script when executed directly
fix_empty_volumes()

🔧 Hawaii Empty Volume Fixer
📋 Loaded 14 volumes from hawaii_complete_content.json
🔍 Found empty volume 2: hrscurrent/Vol03_Ch0121-0200D
🔍 Found empty volume 3: hrscurrent/Vol04_Ch0201-0257
🔍 Found empty volume 4: hrscurrent/Vol05_Ch0261-0319
🔍 Found empty volume 5: hrscurrent/Vol06_Ch0321-0344
🔍 Found empty volume 6: hrscurrent/Vol07_Ch0346-0398
🔍 Found empty volume 7: hrscurrent/Vol08_Ch0401-0429
🔍 Found empty volume 8: hrscurrent/Vol09_Ch0431-0435H
🔍 Found empty volume 9: hrscurrent/Vol10_Ch0436-0474
🔍 Found empty volume 10: hrscurrent/Vol11_Ch0476-0490
🔍 Found empty volume 11: hrscurrent/Vol12_Ch0501-0588
🔍 Found empty volume 12: hrscurrent/Vol13_Ch0601-0676
🔍 Found empty volume 13: hrscurrent/Vol14_Ch0701-0853
🔍 Found 12 empty volumes

Volumes to process:
1. Volume 2: hrscurrent/Vol03_Ch0121-0200D
2. Volume 3: hrscurrent/Vol04_Ch0201-0257
3. Volume 4: hrscurrent/Vol05_Ch0261-0319
4. Volume 5: hrscurrent/Vol06_Ch0321-0344
5. Volume 6: hrscurrent/Vol07_Ch0346-0398
6. Volume 7: hrscur

2025-07-17 11:08:24,881 - INFO - ✅ Chrome driver initialized successfully



📂 Processing volume 2: hrscurrent/Vol03_Ch0121-0200D
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/


2025-07-17 11:08:34,111 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/
2025-07-17 11:08:34,111 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/
2025-07-17 11:08:34,833 - INFO - ✅ Page loaded successfully
2025-07-17 11:08:34,834 - INFO - 📂 Found 0 files and 121 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/
2025-07-17 11:08:34,834 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/HRS0121/
2025-07-17 11:08:35,396 - INFO - ✅ Page loaded successfully
2025-07-17 11:08:35,397 - INFO - 📂 Found 48 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/HRS0121/
2025-07-17 11:08:35,397 - INFO - 📄 Processing file: /hrscurrent/Vol03_Ch0121-0200D/HRS0121/HRS_0121-.htm
2025-07-17 11:08:35,397 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/HRS0121/HRS_0121-.htm
2025-07-17 11:08:36,005 - INFO

✅ Extracted 100 files from volume hrscurrent/Vol03_Ch0121-0200D
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 3: hrscurrent/Vol04_Ch0201-0257
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/


2025-07-17 11:22:26,834 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/
2025-07-17 11:22:26,834 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/
2025-07-17 11:22:27,644 - INFO - ✅ Page loaded successfully
2025-07-17 11:22:27,645 - INFO - 📂 Found 0 files and 88 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/
2025-07-17 11:22:27,645 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/HRS0201/
2025-07-17 11:22:28,223 - INFO - ✅ Page loaded successfully
2025-07-17 11:22:28,223 - INFO - 📂 Found 58 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/HRS0201/
2025-07-17 11:22:28,223 - INFO - 📄 Processing file: /hrscurrent/Vol04_Ch0201-0257/HRS0201/HRS_0201-.htm
2025-07-17 11:22:28,224 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol04_Ch0201-0257/HRS0201/HRS_0201-.htm
2025-07-17 11:22:28,794 - INFO - ✅ Pag

✅ Extracted 100 files from volume hrscurrent/Vol04_Ch0201-0257
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 4: hrscurrent/Vol05_Ch0261-0319
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/


2025-07-17 11:25:04,867 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/
2025-07-17 11:25:04,867 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/
2025-07-17 11:25:05,435 - INFO - ✅ Page loaded successfully
2025-07-17 11:25:05,436 - INFO - 📂 Found 0 files and 93 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/
2025-07-17 11:25:05,436 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/HRS0261/
2025-07-17 11:25:06,026 - INFO - ✅ Page loaded successfully
2025-07-17 11:25:06,026 - INFO - 📂 Found 59 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/HRS0261/
2025-07-17 11:25:06,026 - INFO - 📄 Processing file: /hrscurrent/Vol05_Ch0261-0319/HRS0261/HRS_0261-.htm
2025-07-17 11:25:06,027 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/HRS0261/HRS_0261-.htm
2025-07-17 11:25:06,595 - INFO - ✅ Pag

✅ Extracted 100 files from volume hrscurrent/Vol05_Ch0261-0319
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 5: hrscurrent/Vol06_Ch0321-0344
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/


2025-07-17 11:28:43,688 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/
2025-07-17 11:28:43,688 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/
2025-07-17 11:28:44,267 - INFO - ✅ Page loaded successfully
2025-07-17 11:28:44,268 - INFO - 📂 Found 0 files and 85 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/
2025-07-17 11:28:44,268 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/HRS0321/
2025-07-17 11:28:44,847 - INFO - ✅ Page loaded successfully
2025-07-17 11:28:44,848 - INFO - 📂 Found 306 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/HRS0321/
2025-07-17 11:28:44,849 - INFO - 📄 Processing file: /hrscurrent/Vol06_Ch0321-0344/HRS0321/HRS_0321-.htm
2025-07-17 11:28:44,849 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/HRS0321/HRS_0321-.htm
2025-07-17 11:28:45,499 - INFO - ✅ Pa

✅ Extracted 100 files from volume hrscurrent/Vol06_Ch0321-0344
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 6: hrscurrent/Vol07_Ch0346-0398
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/


2025-07-17 11:29:55,812 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/
2025-07-17 11:29:55,812 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/
2025-07-17 11:29:56,377 - INFO - ✅ Page loaded successfully
2025-07-17 11:29:56,377 - INFO - 📂 Found 0 files and 91 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/
2025-07-17 11:29:56,378 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/HRS0346/
2025-07-17 11:29:56,953 - INFO - ✅ Page loaded successfully
2025-07-17 11:29:56,955 - INFO - 📂 Found 263 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/HRS0346/
2025-07-17 11:29:56,955 - INFO - 📄 Processing file: /hrscurrent/Vol07_Ch0346-0398/HRS0346/HRS_0346-.htm
2025-07-17 11:29:56,955 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol07_Ch0346-0398/HRS0346/HRS_0346-.htm
2025-07-17 11:29:57,550 - INFO - ✅ Pa

✅ Extracted 100 files from volume hrscurrent/Vol07_Ch0346-0398
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 7: hrscurrent/Vol08_Ch0401-0429
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/


2025-07-17 11:31:20,284 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/
2025-07-17 11:31:20,284 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/
2025-07-17 11:31:20,841 - INFO - ✅ Page loaded successfully
2025-07-17 11:31:20,841 - INFO - 📂 Found 0 files and 43 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/
2025-07-17 11:31:20,841 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/HRS0401/
2025-07-17 11:31:21,494 - INFO - ✅ Page loaded successfully
2025-07-17 11:31:21,495 - INFO - 📂 Found 1 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/HRS0401/
2025-07-17 11:31:21,495 - INFO - 📄 Processing file: /hrscurrent/Vol08_Ch0401-0429/HRS0401/HRS_0401-.htm
2025-07-17 11:31:21,495 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol08_Ch0401-0429/HRS0401/HRS_0401-.htm
2025-07-17 11:31:22,070 - INFO - ✅ Page

✅ Extracted 100 files from volume hrscurrent/Vol08_Ch0401-0429
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 8: hrscurrent/Vol09_Ch0431-0435H
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/


2025-07-17 11:33:07,376 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/
2025-07-17 11:33:07,376 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/
2025-07-17 11:33:07,938 - INFO - ✅ Page loaded successfully
2025-07-17 11:33:07,938 - INFO - 📂 Found 0 files and 27 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/
2025-07-17 11:33:07,939 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/HRS0431/
2025-07-17 11:33:08,669 - INFO - ✅ Page loaded successfully
2025-07-17 11:33:08,675 - INFO - 📂 Found 1435 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/HRS0431/
2025-07-17 11:33:08,675 - INFO - 📄 Processing file: /hrscurrent/Vol09_Ch0431-0435H/HRS0431/HRS_0431-.htm
2025-07-17 11:33:08,675 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol09_Ch0431-0435H/HRS0431/HRS_0431-.htm
2025-07-17 11:33:09,522 - INF

✅ Extracted 100 files from volume hrscurrent/Vol09_Ch0431-0435H
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 9: hrscurrent/Vol10_Ch0436-0474
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/


2025-07-17 11:35:06,463 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/
2025-07-17 11:35:06,463 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/
2025-07-17 11:35:07,025 - INFO - ✅ Page loaded successfully
2025-07-17 11:35:07,026 - INFO - 📂 Found 0 files and 95 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/
2025-07-17 11:35:07,027 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/HRS0436/
2025-07-17 11:35:07,581 - INFO - ✅ Page loaded successfully
2025-07-17 11:35:07,582 - INFO - 📂 Found 1 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/HRS0436/
2025-07-17 11:35:07,582 - INFO - 📄 Processing file: /hrscurrent/Vol10_Ch0436-0474/HRS0436/HRS_0436-.htm
2025-07-17 11:35:07,583 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol10_Ch0436-0474/HRS0436/HRS_0436-.htm
2025-07-17 11:35:08,143 - INFO - ✅ Page

✅ Extracted 100 files from volume hrscurrent/Vol10_Ch0436-0474
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 10: hrscurrent/Vol11_Ch0476-0490
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/


2025-07-17 11:36:16,504 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/
2025-07-17 11:36:16,504 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/
2025-07-17 11:36:17,099 - INFO - ✅ Page loaded successfully
2025-07-17 11:36:17,099 - INFO - 📂 Found 0 files and 69 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/
2025-07-17 11:36:17,100 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/HRS0476/
2025-07-17 11:36:17,655 - INFO - ✅ Page loaded successfully
2025-07-17 11:36:17,655 - INFO - 📂 Found 33 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/HRS0476/
2025-07-17 11:36:17,656 - INFO - 📄 Processing file: /hrscurrent/Vol11_Ch0476-0490/HRS0476/HRS_0476-.htm
2025-07-17 11:36:17,656 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol11_Ch0476-0490/HRS0476/HRS_0476-.htm
2025-07-17 11:36:18,217 - INFO - ✅ Pag

✅ Extracted 100 files from volume hrscurrent/Vol11_Ch0476-0490
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 11: hrscurrent/Vol12_Ch0501-0588
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/


2025-07-17 11:37:27,813 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/
2025-07-17 11:37:27,814 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/
2025-07-17 11:37:28,378 - INFO - ✅ Page loaded successfully
2025-07-17 11:37:28,379 - INFO - 📂 Found 0 files and 106 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/
2025-07-17 11:37:28,379 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/HRS0501/
2025-07-17 11:37:28,994 - INFO - ✅ Page loaded successfully
2025-07-17 11:37:28,995 - INFO - 📂 Found 136 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/HRS0501/
2025-07-17 11:37:28,996 - INFO - 📄 Processing file: /hrscurrent/Vol12_Ch0501-0588/HRS0501/HRS_0501-.htm
2025-07-17 11:37:28,996 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/HRS0501/HRS_0501-.htm
2025-07-17 11:37:29,610 - INFO - ✅ P

✅ Extracted 100 files from volume hrscurrent/Vol12_Ch0501-0588
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 12: hrscurrent/Vol13_Ch0601-0676
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/


2025-07-17 11:40:51,807 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/
2025-07-17 11:40:51,807 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/
2025-07-17 11:40:52,376 - INFO - ✅ Page loaded successfully
2025-07-17 11:40:52,376 - INFO - 📂 Found 0 files and 87 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/
2025-07-17 11:40:52,377 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/HRS0601/
2025-07-17 11:40:52,937 - INFO - ✅ Page loaded successfully
2025-07-17 11:40:52,939 - INFO - 📂 Found 27 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/HRS0601/
2025-07-17 11:40:52,939 - INFO - 📄 Processing file: /hrscurrent/Vol13_Ch0601-0676/HRS0601/HRS_0601-.htm
2025-07-17 11:40:52,939 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol13_Ch0601-0676/HRS0601/HRS_0601-.htm
2025-07-17 11:40:53,514 - INFO - ✅ Pag

✅ Extracted 100 files from volume hrscurrent/Vol13_Ch0601-0676
💾 Final volume data saved to hawaii_complete_content.json

📂 Processing volume 13: hrscurrent/Vol14_Ch0701-0853
🌐 URL: https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/


2025-07-17 11:47:04,032 - INFO - 🔍 Starting to crawl volume: https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/
2025-07-17 11:47:04,032 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/
2025-07-17 11:47:04,598 - INFO - ✅ Page loaded successfully
2025-07-17 11:47:04,599 - INFO - 📂 Found 0 files and 50 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/
2025-07-17 11:47:04,599 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/HRS0701/
2025-07-17 11:47:05,155 - INFO - ✅ Page loaded successfully
2025-07-17 11:47:05,155 - INFO - 📂 Found 21 files and 0 directories in https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/HRS0701/
2025-07-17 11:47:05,155 - INFO - 📄 Processing file: /hrscurrent/Vol14_Ch0701-0853/HRS0701/HRS_0701-.htm
2025-07-17 11:47:05,156 - INFO - 🌐 Loading: https://www.capitol.hawaii.gov/hrscurrent/Vol14_Ch0701-0853/HRS0701/HRS_0701-.htm
2025-07-17 11:47:05,715 - INFO - ✅ Pag

✅ Extracted 100 files from volume hrscurrent/Vol14_Ch0701-0853
💾 Final volume data saved to hawaii_complete_content.json

🎉 PROCESSING COMPLETED!
📚 Volumes in data: 14
📄 Total files: 3,982
💾 Data saved to: hawaii_complete_content.json


2025-07-17 11:48:22,800 - INFO - 🔒 Browser closed


In [4]:
import requests

ddg_url = f"https://api.duckduckgo.com/?q=Honolulu Rail system&format=json&no_html=1&skip_disambig=1"
response = requests.get(ddg_url, timeout=10, headers={'User-Agent': 'RAG-System/1.0'})
data = response.json()
print(data.get("AbstractText"))





In [4]:
import requests
from bs4 import BeautifulSoup
import re
import phpserialize
import time
import json

BASE_URL = "https://catalog.manoa.hawaii.edu/content.php"
PARAMS = {
    "catoid": "2",
    "navoid": "420",
    "filter[item_type]": "3",
    "filter[only_active]": "1",
    "filter[3]": "1",
}

def parse_php_serialized(data):
    try:
        # PHP uses "~" instead of '"' in the JS string, so fix it first
        fixed = data.replace("~", '"')
        return phpserialize.loads(fixed.encode('utf-8'), decode_strings=True)
    except Exception as e:
        return {"error": str(e), "raw": data}

def extract_course_info(course_div):
    title_el = course_div.select_one('.courseleaf-accordion-title')
    desc_el = course_div.select_one('.courseleaf-accordion-content')

    if not title_el:
        return None

    onclick = title_el.get("onclick", "")
    match = re.search(r"showCatalogData\((.*?)\)", onclick)
    course = {}

    if match:
        args = match.group(1).split(',')
        args = [a.strip().strip("'") for a in args]
        course['catalog_id'] = args[0]
        course['section_id'] = args[1]
        course['course_id'] = args[2]
        raw_meta = args[5] if len(args) > 5 else ''
        course['metadata_raw'] = raw_meta
        course['metadata'] = parse_php_serialized(raw_meta)
    else:
        course['catalog_id'] = course['section_id'] = course['course_id'] = None
        course['metadata'] = {}

    full_title = title_el.get_text(strip=True)
    match = re.match(r"([A-Z]+)\s+(\d+[A-Z]*)\s+-\s+(.+)", full_title)
    if match:
        course['prefix'], course['number'], course['title'] = match.groups()
    else:
        course['prefix'] = course['number'] = None
        course['title'] = full_title

    if desc_el:
        full_text = desc_el.get_text(separator=' ', strip=True)
        unit_match = re.search(r"(\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)?)\s+Credits?", full_text, re.IGNORECASE)
        course['units'] = unit_match.group(1) if unit_match else None
        course['description'] = full_text
    else:
        course['description'] = None
        course['units'] = None

    return course

def scrape_catalog():
    all_courses = []
    page = 1

    while True:
        print(f"Scraping page {page}...")
        PARAMS["filter[cpage]"] = str(page)
        response = requests.get(BASE_URL, params=PARAMS)
        print(response, 'response')
        soup = BeautifulSoup(response.text, "html.parser")
        print(soup, 'soup')
        # course_divs = soup.select('.courseleaf-accordion')
        # print(course_divs, 'course_divs')
        # if not course_divs:
        #     print("No more courses found. Stopping.")
        #     break

        # for div in course_divs:
        #     course = extract_course_info(div)
        #     if course:
        #         all_courses.append(course)

        page += 1
        time.sleep(1)  # Be polite

    return all_courses

if __name__ == "__main__":
    data = scrape_catalog()
    with open("uh_manoa_courses.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Scraped {len(data)} courses.")


Scraping page 1...
<Response [200]> response
<!DOCTYPE html>

<html lang="en">
<head>
<title>Course Descriptions - University of Hawaiʻi Manoa - Modern Campus Catalog™</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="//acalog-clients.s3.amazonaws.com/production/manoa_hawaii/img/favicon/favicon.ico" rel="shortcut icon"/>
<link href="//acalog-clients.s3.amazonaws.com/production/manoa_hawaii/css/gateway/user-styles.css" rel="stylesheet" type="text/css"/>
<link href="css/public_custom.php" rel="stylesheet" type="text/css"/>
<link href="global_styles.css?v=01252018" rel="stylesheet" type="text/css"/>
<!--[if IE]>
<link rel="stylesheet" type="text/css" href="ie.css" />
<![endif]-->
<!-- Cache-busting string (deploy date) added to asset URLS -->
<script src="js/jquery.js?v=01252018" type="text/javascript"></script>
<script src="js/Tooltip.js?v=01252018" type="text/javascript"></script>
<script src="javascripts.js?v=02232018" type="text/javascript"></scr

KeyboardInterrupt: 