In [None]:
! pip install selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def is_valid(url, base_netloc):
    """
    Check whether a URL is crawlable and belongs to the same domain.

    Args:
        url: Absolute or relative URL string.
        base_netloc: Network location of the site being crawled, as returned
            by ``urlparse(start_url).netloc``.

    Returns:
        True when the URL's scheme is empty, ``http`` or ``https`` AND its
        netloc is either empty (relative link) or equal to ``base_netloc``;
        False otherwise.
    """
    parsed = urlparse(url)
    # Scheme-only URIs such as ``tel:`` or ``data:`` parse with an empty
    # netloc, so the netloc comparison alone would wrongly accept them as
    # internal links. Only relative and http(s) URLs can be crawled.
    if parsed.scheme not in ('', 'http', 'https'):
        return False
    return parsed.netloc == base_netloc or parsed.netloc == ''

def crawl(url, base_netloc, visited, driver, *, render_wait=4, link_delay=1):
    """
    Crawl ``url`` and every same-domain page reachable from it, depth-first.

    Pages are visited in the same pre-order as a recursive traversal (a page,
    then each of its links in document order), but an explicit stack is used
    so that a long chain of pages cannot exhaust Python's recursion limit.

    Args:
        url: URL to start crawling from.
        base_netloc: Netloc that discovered links must match (see ``is_valid``).
        visited: Set of URLs already crawled; updated in place.
        driver: Selenium WebDriver used to load and render each page.
        render_wait: Seconds to sleep after ``driver.get`` before reading the
            page source.
        link_delay: Seconds to sleep between consecutive page loads.

    Returns:
        The visible text of every page crawled by this call, each page
        followed by a blank line. Empty string if ``url`` was already visited.
    """
    # Local import so this function does not depend on the cell's import line.
    from urllib.parse import urldefrag

    collected = []
    pending = [url]  # LIFO stack of URLs still to be crawled

    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        print(f"Crawling: {current}")

        try:
            driver.get(current)
            time.sleep(render_wait)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for element in soup(["script", "style"]):
                element.decompose()

            page_text = soup.get_text(separator=' ', strip=True) + "\n\n"
            collected.append(page_text)
            print(page_text)

            # Collect this page's internal links in document order.
            children = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Skip invalid or unwanted links
                if href.startswith(('javascript:', 'mailto:', '#')):
                    continue
                try:
                    # Drop the fragment: "page#a" and "page#b" are one page.
                    next_url = urldefrag(urljoin(current, href)).url
                    internal = is_valid(next_url, base_netloc)
                except ValueError:
                    # A single malformed href must not abort the whole page.
                    continue
                if internal and next_url not in visited:
                    children.append(next_url)
            # Reversed so the first link on the page is popped (crawled) first.
            pending.extend(reversed(children))
        except Exception as e:
            print(f"Error crawling {current}: {e}")

        if pending:
            time.sleep(link_delay)

    return "".join(collected)

if __name__ == '__main__':
    # Single-threaded entry point: crawl the site with one headless Chrome
    # instance and write all extracted text to website_text.txt.
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    service = Service("chromedriver.exe")  # <-- Update with your chromedriver path
    driver = webdriver.Chrome(service=service, options=chrome_options)

    start_url = "https://help.deakin.edu.au/ithelp?id=it_kb_view"
    base_netloc = urlparse(start_url).netloc
    visited_urls = set()

    try:
        all_text = crawl(start_url, base_netloc, visited_urls, driver)
    finally:
        # Always shut the browser down, even if the crawl raises or is
        # interrupted; otherwise the Chrome process is left running.
        driver.quit()

    with open("website_text.txt", "w", encoding="utf-8") as file:
        file.write(all_text)

    print("Crawling complete. Text saved to website_text.txt")


In [None]:
import threading
import time
from queue import Queue
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

visited_lock = threading.Lock()
visited = set()

text_data_lock = threading.Lock()
all_text = ""

url_queue = Queue()

def is_valid(url, base_netloc):
    """
    Check whether a URL is crawlable and belongs to the same domain.

    Args:
        url: Absolute or relative URL string.
        base_netloc: Network location of the site being crawled, as returned
            by ``urlparse(start_url).netloc``.

    Returns:
        True when the URL's scheme is empty, ``http`` or ``https`` AND its
        netloc is either empty (relative link) or equal to ``base_netloc``;
        False otherwise.
    """
    parsed = urlparse(url)
    # Scheme-only URIs such as ``tel:`` or ``data:`` parse with an empty
    # netloc, so the netloc comparison alone would wrongly accept them as
    # internal links. Only relative and http(s) URLs can be crawled.
    if parsed.scheme not in ('', 'http', 'https'):
        return False
    return parsed.netloc == base_netloc or parsed.netloc == ''

def worker(base_netloc, driver_path, delay):
    """
    Crawl URLs from the shared queue using a private Selenium driver.

    Each worker creates its own headless Chrome instance, repeatedly takes a
    URL from ``url_queue``, appends the page's visible text to the global
    ``all_text`` and enqueues the same-domain links it discovers.

    Args:
        base_netloc: Netloc that discovered links must match (see ``is_valid``).
        driver_path: Path to the chromedriver executable.
        delay: Seconds to sleep after loading a page and again after
            processing it.

    The worker returns once the queue is empty and no queued URL is still
    being processed by any worker.
    """
    global all_text
    # Local imports so this function does not depend on the cell's import lines.
    from queue import Empty
    from urllib.parse import urldefrag

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        while True:
            try:
                current_url = url_queue.get(timeout=5)
            except Empty:
                # An empty queue does not mean the crawl is over: another
                # worker may still be loading a page that will yield links.
                # ``unfinished_tasks`` is the counter ``Queue.join()`` waits
                # on; it reaches zero only when every queued URL has been
                # marked done. NOTE(review): the attribute is not part of the
                # documented Queue API, although CPython has long exposed it.
                if url_queue.unfinished_tasks:
                    continue
                break

            with visited_lock:
                already_seen = current_url in visited
                if not already_seen:
                    visited.add(current_url)
            if already_seen:
                url_queue.task_done()
                continue

            print(f"Crawling: {current_url}")
            try:
                driver.get(current_url)
                time.sleep(delay)
                page_source = driver.page_source

                soup = BeautifulSoup(page_source, 'html.parser')
                for element in soup(["script", "style"]):
                    element.decompose()
                page_block = soup.get_text(separator=' ', strip=True) + "\n\n"
                with text_data_lock:
                    all_text += page_block
                # Print only this page, and outside the lock: dumping the
                # whole accumulated text on every page grows quadratically
                # and stalls the other workers.
                print(page_block)

                # Find and queue all valid internal links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith(('javascript:', 'mailto:', '#')):
                        continue
                    try:
                        # Drop the fragment: "page#a" and "page#b" are one page.
                        next_url = urldefrag(urljoin(current_url, href)).url
                        internal = is_valid(next_url, base_netloc)
                    except ValueError:
                        # A single malformed href must not abort the whole page.
                        continue
                    if internal:
                        with visited_lock:
                            if next_url not in visited:
                                url_queue.put(next_url)
                time.sleep(delay)
            except Exception as e:
                print(f"Error crawling {current_url}: {e}")
            finally:
                url_queue.task_done()
    finally:
        # Always shut the browser down so no Chrome process is leaked.
        driver.quit()

if __name__ == '__main__':
    # Multi-threaded entry point. Update these parameters:
    start_url = "https://help.deakin.edu.au/ithelp?id=it_kb_view"
    driver_path = "chromedriver.exe"      # Replace with your chromedriver executable path
    num_threads = 4   # number of worker threads, each with its own browser
    delay = 2         # seconds each worker sleeps around a page load

    base_netloc = urlparse(start_url).netloc

    # Seed the shared queue before any worker starts polling it.
    url_queue.put(start_url)

    threads = [
        threading.Thread(target=worker, args=(base_netloc, driver_path, delay))
        for _ in range(num_threads)
    ]
    for t in threads:
        t.start()

    # Wait until every queued URL is marked done, then for the workers to exit.
    url_queue.join()
    for t in threads:
        t.join()

    with open("website_text.txt", "w", encoding="utf-8") as out_file:
        out_file.write(all_text)

    print("Crawling complete. Text saved to website_text.txt")
