In [1]:
# Site root: its host name scopes the crawl and absolute links must start with it.
domain = "https://www.grainger.com"

# Product page used as the crawl's entry point (path and query split for readability).
start_url = (
    "https://www.grainger.com/product/DAYTON-Standard-Duty-Industrial-Fan-1VCE8"
    "?cpnuser=undefined&searchBar=true&searchQuery=1VCE8&suggestConfigId=6"
)

In [2]:
import time
import os
from urllib.parse import urlparse, urljoin
from collections import deque
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import re

# Selenium setup: one headless Chrome session shared by every fetch below.
options = Options()
for _chrome_flag in ("--headless", "--disable-gpu"):
    options.add_argument(_chrome_flag)

# webdriver_manager resolves a chromedriver binary and returns its path.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)

# Function to get the hyperlinks from a URL using Selenium
def get_hyperlinks(url):
    """Load ``url`` in the shared Selenium driver and collect anchor targets.

    Args:
        url: Address of the page to load.

    Returns:
        The ``href`` value of every ``<a>`` element that has one, in document
        order and exactly as written in the page (no resolution or filtering).
        An empty list is returned if anything goes wrong while loading or
        parsing; the error is printed rather than raised.
    """
    print(f"Getting hyperlinks for URL: {url}")
    try:
        driver.get(url)
        time.sleep(1)  # Fixed pause so the page can finish rendering; tune as needed
        page = BeautifulSoup(driver.page_source, 'html.parser')
        found = []
        for anchor in page.find_all('a', href=True):
            found.append(anchor['href'])
    except Exception as e:
        # Best-effort: a page that fails to load simply contributes no links.
        print(f"Error getting hyperlinks: {e}")
        return []
    return found

# Function to crawl the website
def crawl(url):
    """Breadth-first crawl from ``url``, saving each visited page's text to disk.

    Every visited page is rendered with the shared Selenium ``driver`` and its
    visible text is written to ``text/<host>/<url without scheme, '/' -> '_'>.txt``.
    Links reported by ``get_domain_hyperlinks`` are resolved against the current
    page and queued only when they point at the same host as ``domain``, so
    relative paths become fetchable URLs and external sites are never visited.

    Args:
        url: Absolute URL at which the crawl starts.

    Returns:
        The list of "clean" links accumulated by ``get_domain_hyperlinks``.
        The crawl stops when the queue is empty or this list reaches 1000 entries.

    Errors while fetching/saving a page or while extracting its links are
    printed and the crawl continues with the next queued URL.
    """
    local_domain = urlparse(domain).netloc
    queue = deque([url])
    seen = {url}
    clean_links = []
    # Must stay in step with the cap enforced inside get_domain_hyperlinks.
    max_clean_links = 1000

    # exist_ok avoids the check-then-create race and creates parents in one call.
    os.makedirs(os.path.join("text", local_domain), exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    while queue and len(clean_links) < max_clean_links:
        url = queue.popleft()  # popleft keeps the traversal breadth-first
        print(f"Crawling URL: {url}")
        try:
            # Fetch and parse first so a failed load does not leave an empty file behind.
            driver.get(url)
            time.sleep(3)  # Adjust wait time as needed based on page load speed
            soup = BeautifulSoup(driver.page_source, "html.parser")
            text = soup.get_text()
            if "You need to enable JavaScript to run this app." in text:
                print(f"Unable to parse page {url} due to JavaScript being required")

            # Drop the scheme whatever its length (http:// or https://).
            # NOTE(review): query characters such as '?' stay in the file name;
            # those are not valid on Windows - confirm the target platform.
            page_name = url.split("://", 1)[-1].replace("/", "_")
            file_path = f"text/{local_domain}/{page_name}.txt"
            # Explicit UTF-8 so non-ASCII page text never depends on the locale.
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"Error crawling URL: {e}")

        try:
            new_links = get_domain_hyperlinks(local_domain, url, clean_links)
            for link in new_links:
                # Resolve relative hrefs and skip anything off-site (this also
                # drops mailto:/javascript: targets, whose netloc is empty).
                absolute_link = urljoin(url, link)
                if urlparse(absolute_link).netloc != local_domain:
                    continue
                if absolute_link not in seen:
                    queue.append(absolute_link)
                    seen.add(absolute_link)
        except Exception as e:
            print(f"Error processing links: {e}")

    print(f"Collected {len(clean_links)} clean links")
    return clean_links

# Function to get the hyperlinks from a URL that are within the same domain and base URL
def get_domain_hyperlinks(local_domain, url, clean_links):
    """Return the same-domain links found on ``url`` and record product-style links.

    Each href returned by ``get_hyperlinks(url)`` is turned into an absolute
    URL (relative hrefs are joined onto ``url``), stripped of any ``#fragment``
    and kept only if its host equals ``local_domain``; absolute hrefs must also
    start with the module-level ``domain`` prefix.

    Args:
        local_domain: Host name (netloc) the crawl is restricted to.
        url: Page to load and scan for links.
        clean_links: List mutated in place. Kept links that contain no ``@``
            and match the product pattern are appended (trailing ``/`` removed,
            no duplicates) until the list holds 1000 entries.

    Returns:
        De-duplicated absolute same-domain URLs in the order they appear on
        the page, so repeated runs enqueue pages in the same order.
    """
    # regex_pattern = re.compile(r'-[a-zA-Z0-9]+\?.*')
    # A hyphen, a 5-7 character upper-case/digit code, then '?'. The lookahead
    # is not limited to the code, so the three required digits may come from
    # anywhere after the hyphen, including the query string.
    regex_pattern = re.compile(r'-((?=.*\d.*\d.*\d)[A-Z0-9]{5,7})\?.*')

    hyperlinks = get_hyperlinks(url)
    print(f"Found {len(hyperlinks)} hyperlinks on {url}")

    domain_links = []
    already_kept = set()
    # dict.fromkeys drops duplicate hrefs while preserving page order
    # (iterating a set would make the crawl order vary between runs).
    for link in dict.fromkeys(hyperlinks):
        print(f"Checking link: {link}")

        if link.startswith("http"):
            # Handle absolute URLs
            if not link.startswith(domain):
                continue
            candidate = link
        else:
            # Handle relative URLs
            candidate = urljoin(url, link)

        # A fragment never changes which document is fetched.
        candidate = candidate.split("#", 1)[0]

        # Checked after joining too: protocol-relative hrefs ("//other.host/x")
        # and mailto:/javascript: targets resolve to a different or empty host.
        if urlparse(candidate).netloc != local_domain:
            continue

        if candidate not in already_kept:
            already_kept.add(candidate)
            domain_links.append(candidate)

        # Stop collecting clean links once we reach 1000, but keep gathering
        # same-domain links so the caller still receives the full list.
        if len(clean_links) >= 1000:
            continue

        if '@' not in candidate and regex_pattern.search(candidate):
            clean_link = candidate[:-1] if candidate.endswith("/") else candidate
            if clean_link not in clean_links:
                print(f"Adding clean link: {clean_link}")
                clean_links.append(clean_link)

    print(f"Clean links: {clean_links}")
    return domain_links

# Always shut the browser down, even when the crawl fails or is interrupted
# (e.g. KeyboardInterrupt), so no headless Chrome process is left running.
try:
    crawl(start_url)
finally:
    driver.quit()


Crawling URL: https://www.grainger.com/product/DAYTON-Standard-Duty-Industrial-Fan-1VCE8?cpnuser=undefined&searchBar=true&searchQuery=1VCE8&suggestConfigId=6
Getting hyperlinks for URL: https://www.grainger.com/product/DAYTON-Standard-Duty-Industrial-Fan-1VCE8?cpnuser=undefined&searchBar=true&searchQuery=1VCE8&suggestConfigId=6
Found 146 hyperlinks on https://www.grainger.com/product/DAYTON-Standard-Duty-Industrial-Fan-1VCE8?cpnuser=undefined&searchBar=true&searchQuery=1VCE8&suggestConfigId=6
Checking link: /content/mc/policies/terms-of-access
Checking link: https://www.facebook.com/grainger
Checking link: /product/SCHAEFER-Standard-Duty-Industrial-Fan-6ALD6?opr=APPD&analytics=altItems_1VCE8&position=2
Adding clean link: https://www.grainger.com/product/SCHAEFER-Standard-Duty-Industrial-Fan-6ALD6?opr=APPD&analytics=altItems_1VCE8&position=2
Checking link: /category/security?analytics=nav
Checking link: /category/power-transmission?analytics=nav
Checking link: /category/test-instruments

KeyboardInterrupt: 