## First trial

In [3]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
import time
import sys

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

# Async function to fetch links using Selenium
async def fetch_links(driver, county, naics_code):
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        time.sleep(2)

        # Step 2: Switch to "Advanced Search" Tab
        advanced_search_tab = driver.find_element(By.CSS_SELECTOR, "a[href='#Advanced_Search']")
        advanced_search_tab.click()

        # Wait for the Advanced Search section to load
        time.sleep(2)

        # Step 3: Select "Area Type" (e.g., "County")
        area_type_dropdown = Select(driver.find_element(By.ID, "regionCategory"))
        area_type_dropdown.select_by_visible_text("County")

        # Step 4: Select the Area (e.g., "Buncombe")
        area_dropdown = Select(driver.find_element(By.ID, "region"))
        area_dropdown.select_by_visible_text(county)

        # Step 5: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "IndustryGroup2"))
        industry_dropdown.select_by_value(naics_code)

        # Step 6: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.col-sm-10 > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)

        # Step 7: Wait for Results to Load and Process the New Tab's HTML
        time.sleep(3)  # Adjust as needed for loading time
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

        # Parse the HTML using BeautifulSoup
        result_html = driver.page_source
        soup = BeautifulSoup(result_html, 'html.parser')

        # Extract all links where the text is "Detail"
        detail_links = [link.get('href') for link in soup.find_all('a', string="Detail")]
        print(f"{county}-{naics_code}: Extracted {len(detail_links)} links.")

        # Write links to a file
        with open("all_links.txt", "a") as file:
            for link in detail_links:
                file.write(link + "\n")

        # Close the result tab
        driver.close()

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])

        return detail_links
    except Exception as e:
        print(f"Error processing {county}-{naics_code}: {e}")
        return []

# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("qnaics_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Define counties
    counties = [
        "Alexander", "Alleghany", "Ashe", "Avery", "Buncombe", "Burke",
        "Caldwell", "Catawba", "Clay", "Cleveland", "Gaston", "Haywood",
        "Henderson", "Jackson", "Lincoln", "Macon", "Madison", "McDowell",
        "Mecklenburg", "Mitchell", "Polk", "Rutherford", "Swain",
        "Transylvania", "Watauga", "Wilkes", "Yancey"
    ]

    # Combine counties and NAICS codes into pairs
    tasks_data = [(county, code) for county in counties for code in codes]

    # Split the tasks into 20 batches
    batches = split_into_batches(tasks_data, 20)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Create tasks for the current batch
            tasks = [
                fetch_links(driver, county, code)
                for county, code in batches[batch_idx]
            ]

            # Run all tasks in the current batch
            await asyncio.gather(*tasks)

            # Update progress bar
            for _ in tasks:
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to all_links.txt.")
    finally:
        driver.quit()


In [6]:
await main(start_batch=1, end_batch=5)

Processing batch 1/5...


Batch 1/5:   0%|          | 0/420 [00:00<?, ?it/s]

Alexander-1132: Extracted 0 links.
Alexander-3115: Extracted 0 links.
Alexander-3362: Extracted 1 links.
Alexander-5232: Extracted 0 links.
Alexander-8112: Extracted 0 links.
Alexander-3132: Extracted 1 links.
Alexander-4481: Extracted 0 links.
Alexander-6117: Extracted 1 links.
Alexander-4243: Extracted 0 links.
Alexander-5222: Extracted 6 links.
Alexander-2131: Extracted 0 links.
Alexander-4811: Extracted 0 links.
Alexander-3313: Extracted 0 links.
Alexander-4512: Extracted 0 links.
Alexander-3361: Extracted 1 links.
Alexander-4541: Extracted 0 links.
Alexander-2371: Extracted 1 links.
Alexander-4872: Extracted 0 links.
Alexander-4832: Extracted 0 links.
Alexander-5324: Extracted 0 links.
Alexander-5182: Extracted 1 links.
Alexander-3353: Extracted 0 links.
Alexander-4431: Extracted 0 links.
Alexander-3371: Extracted 11 links.
Alexander-4921: Extracted 0 links.
Alexander-3333: Extracted 0 links.
Alexander-3379: Extracted 3 links.
Alexander-4869: Extracted 0 links.
Alexander-3274: Ext

Batch 2/5:   0%|          | 0/420 [00:00<?, ?it/s]

Alleghany-7131: Extracted 0 links.
Alleghany-2362: Extracted 6 links.
Alleghany-8113: Extracted 0 links.
Alleghany-2373: Extracted 4 links.
Alleghany-4239: Extracted 1 links.
Alleghany-4862: Extracted 0 links.
Alleghany-4533: Extracted 0 links.
Alleghany-1125: Extracted 0 links.
Alleghany-3323: Extracted 0 links.
Alleghany-3118: Extracted 0 links.
Alleghany-4532: Extracted 0 links.
Alleghany-3212: Extracted 0 links.
Alleghany-3222: Extracted 0 links.
Alleghany-3372: Extracted 0 links.
Alleghany-5613: Extracted 0 links.
Alleghany-8141: Extracted 0 links.
Alleghany-3364: Extracted 0 links.
Alleghany-5415: Extracted 1 links.
Alleghany-3261: Extracted 0 links.
Alleghany-1129: Extracted 0 links.
Alleghany-5122: Extracted 0 links.
Alleghany-4531: Extracted 0 links.
Alleghany-6213: Extracted 10 links.
Alleghany-3241: Extracted 0 links.
Alleghany-7115: Extracted 1 links.
Alleghany-3262: Extracted 1 links.
Alleghany-5111: Extracted 0 links.
Alleghany-2111: Extracted 0 links.
Alleghany-3321: Ext

Batch 3/5:   0%|          | 0/420 [00:00<?, ?it/s]

Ashe-7139: Extracted 4 links.
Ashe-5417: Extracted 2 links.
Ashe-6222: Extracted 0 links.
Ashe-2383: Extracted 9 links.
Ashe-5239: Extracted 8 links.
Ashe-4482: Extracted 0 links.
Ashe-6241: Extracted 15 links.
Ashe-8129: Extracted 6 links.
Ashe-8131: Extracted 67 links.
Ashe-5614: Extracted 1 links.
Ashe-1119: Extracted 16 links.
Ashe-4483: Extracted 0 links.
Ashe-5617: Extracted 26 links.
Ashe-3159: Extracted 0 links.
Ashe-2382: Extracted 27 links.
Ashe-4235: Extracted 0 links.
Ashe-3219: Extracted 3 links.
Ashe-4249: Extracted 4 links.
Ashe-8133: Extracted 17 links.
Ashe-6239: Extracted 1 links.
Ashe-9241: Extracted 2 links.
Ashe-5174: Extracted 0 links.
Ashe-5611: Extracted 0 links.
Ashe-3366: Extracted 0 links.
Ashe-5511: Extracted 4 links.
Ashe-4442: Extracted 12 links.
Ashe-3221: Extracted 0 links.
Ashe-3363: Extracted 0 links.
Ashe-3256: Extracted 0 links.
Ashe-9281: Extracted 0 links.
Ashe-4841: Extracted 0 links.
Ashe-3114: Extracted 0 links.
Ashe-6114: Extracted 0 links.
Ash

Batch 4/5:   0%|          | 0/420 [00:00<?, ?it/s]

Buncombe-2371: Extracted 24 links.
Buncombe-4872: Extracted 1 links.
Buncombe-4832: Extracted 0 links.
Buncombe-5324: Extracted 6 links.
Buncombe-5182: Extracted 23 links.
Buncombe-3353: Extracted 2 links.
Buncombe-4431: Extracted 0 links.
Buncombe-3371: Extracted 33 links.
Buncombe-4921: Extracted 3 links.
Buncombe-3333: Extracted 0 links.
Buncombe-3379: Extracted 1 links.
Buncombe-4869: Extracted 0 links.
Buncombe-3274: Extracted 0 links.
Buncombe-4232: Extracted 4 links.
Buncombe-3322: Extracted 0 links.
Buncombe-5312: Extracted 100 links.
Buncombe-2123: Extracted 4 links.
Buncombe-3253: Extracted 1 links.
Buncombe-4511: Extracted 0 links.
Buncombe-3344: Extracted 0 links.
Buncombe-4247: Extracted 14 links.
Buncombe-3326: Extracted 0 links.
Buncombe-3342: Extracted 2 links.
Buncombe-3332: Extracted 5 links.
Buncombe-5231: Extracted 0 links.
Buncombe-6115: Extracted 9 links.
Buncombe-1141: Extracted 0 links.
Buncombe-3272: Extracted 3 links.
Buncombe-6112: Extracted 4 links.
Buncombe

Batch 5/5:   0%|          | 0/420 [00:00<?, ?it/s]

Burke-3364: Extracted 3 links.
Burke-5415: Extracted 6 links.
Burke-3261: Extracted 0 links.
Burke-1129: Extracted 0 links.
Burke-5122: Extracted 0 links.
Burke-4531: Extracted 0 links.
Burke-6213: Extracted 100 links.
Burke-3241: Extracted 0 links.
Burke-7115: Extracted 8 links.
Burke-3262: Extracted 0 links.
Burke-5111: Extracted 0 links.
Burke-2111: Extracted 1 links.
Burke-3321: Extracted 1 links.
Burke-5313: Extracted 6 links.
Burke-1131: Extracted 0 links.
Burke-9221: Extracted 36 links.
Burke-3399: Extracted 5 links.
Burke-4931: Extracted 4 links.
Burke-7112: Extracted 1 links.
Burke-4542: Extracted 0 links.
Burke-4543: Extracted 0 links.
Burke-4244: Extracted 2 links.
Burke-3169: Extracted 0 links.
Burke-4412: Extracted 6 links.
Burke-5612: Extracted 0 links.
Burke-4453: Extracted 2 links.
Burke-4451: Extracted 48 links.
Burke-3336: Extracted 1 links.
Burke-5179: Extracted 0 links.
Burke-3131: Extracted 0 links.
Burke-2212: Extracted 0 links.
Burke-1114: Extracted 3 links.
Burk

In [3]:
await main(start_batch=6, end_batch=7)

Processing batch 6/7...


Batch 6/7:   0%|          | 0/420 [00:00<?, ?it/s]

Caldwell-9251: Extracted 0 links.
Caldwell-3274: Extracted 0 links.
Caldwell-4453: Extracted 3 links.
Caldwell-2131: Extracted 0 links.
Caldwell-5414: Extracted 8 links.
Caldwell-1129: Extracted 3 links.
Caldwell-7132: Extracted 1 links.
Caldwell-3219: Extracted 11 links.
Caldwell-6233: Extracted 4 links.
Caldwell-6214: Extracted 7 links.
Caldwell-3118: Extracted 2 links.
Caldwell-4511: Extracted 0 links.
Caldwell-3371: Extracted 14 links.
Caldwell-3254: Extracted 1 links.
Caldwell-4237: Extracted 0 links.
Caldwell-3221: Extracted 0 links.
Caldwell-3116: Extracted 0 links.
Caldwell-4512: Extracted 0 links.
Caldwell-6239: Extracted 2 links.
Caldwell-5411: Extracted 27 links.
Caldwell-5617: Extracted 42 links.
Caldwell-3113: Extracted 0 links.
Caldwell-2382: Extracted 37 links.
Caldwell-5231: Extracted 10 links.
Caldwell-5312: Extracted 43 links.
Caldwell-7112: Extracted 2 links.
Caldwell-2361: Extracted 34 links.
Caldwell-4884: Extracted 8 links.
Caldwell-8131: Extracted 100 links.
Cald

Batch 7/7:   0%|          | 0/420 [00:00<?, ?it/s]

Clay-4881: Extracted 0 links.
Clay-1152: Extracted 2 links.
Clay-5242: Extracted 7 links.
Clay-5191: Extracted 0 links.
Clay-5415: Extracted 1 links.
Clay-3366: Extracted 0 links.
Clay-1122: Extracted 0 links.
Clay-3114: Extracted 0 links.
Clay-4523: Extracted 0 links.
Clay-7223: Extracted 3 links.
Clay-4871: Extracted 0 links.
Clay-5223: Extracted 0 links.
Clay-3322: Extracted 0 links.
Clay-6241: Extracted 10 links.
Clay-4883: Extracted 0 links.
Clay-4921: Extracted 0 links.
Clay-5619: Extracted 4 links.
Clay-3315: Extracted 0 links.
Clay-1123: Extracted 0 links.
Clay-1141: Extracted 0 links.
Clay-3333: Extracted 0 links.
Clay-5621: Extracted 1 links.
Clay-3169: Extracted 0 links.
Clay-6232: Extracted 0 links.
Clay-9231: Extracted 2 links.
Clay-1119: Extracted 3 links.
Clay-2122: Extracted 0 links.
Clay-3115: Extracted 0 links.
Clay-5173: Extracted 0 links.
Clay-4241: Extracted 0 links.
Clay-3325: Extracted 0 links.
Clay-8134: Extracted 4 links.
Clay-3121: Extracted 4 links.
Clay-4862

In [4]:
await main(start_batch=9, end_batch=10)

Processing batch 9/10...


Batch 9/10:   0%|          | 0/420 [00:00<?, ?it/s]

Gaston-3116: Extracted 0 links.
Gaston-4512: Extracted 0 links.
Gaston-6239: Extracted 1 links.
Gaston-5411: Extracted 100 links.
Gaston-5617: Extracted 100 links.
Gaston-3113: Extracted 0 links.
Gaston-2382: Extracted 100 links.
Gaston-5231: Extracted 35 links.
Gaston-5312: Extracted 100 links.
Gaston-7112: Extracted 3 links.
Gaston-2361: Extracted 100 links.
Gaston-4884: Extracted 21 links.
Gaston-8131: Extracted 100 links.
Gaston-7139: Extracted 59 links.
Gaston-3313: Extracted 0 links.
Gaston-3271: Extracted 0 links.
Gaston-8122: Extracted 26 links.
Gaston-4236: Extracted 24 links.
Gaston-3339: Extracted 12 links.
Gaston-5251: Extracted 0 links.
Gaston-3363: Extracted 2 links.
Gaston-4421: Extracted 0 links.
Gaston-4248: Extracted 0 links.
Gaston-4441: Extracted 59 links.
Gaston-3259: Extracted 6 links.
Gaston-6215: Extracted 0 links.
Gaston-3253: Extracted 1 links.
Gaston-3324: Extracted 1 links.
Gaston-4249: Extracted 6 links.
Gaston-6114: Extracted 6 links.
Gaston-4821: Extracte

Batch 10/10:   0%|          | 0/420 [00:00<?, ?it/s]

Henderson-5619: Extracted 16 links.
Henderson-3315: Extracted 0 links.
Henderson-1123: Extracted 0 links.
Henderson-1141: Extracted 0 links.
Henderson-3333: Extracted 4 links.
Henderson-5621: Extracted 11 links.
Henderson-3169: Extracted 0 links.
Henderson-6232: Extracted 0 links.
Henderson-9231: Extracted 6 links.
Henderson-1119: Extracted 0 links.
Henderson-2122: Extracted 0 links.
Henderson-3115: Extracted 0 links.
Henderson-5173: Extracted 0 links.
Henderson-4241: Extracted 1 links.
Henderson-3325: Extracted 0 links.
Henderson-8134: Extracted 0 links.
Henderson-3121: Extracted 19 links.
Henderson-4862: Extracted 0 links.
Henderson-3241: Extracted 0 links.
Henderson-7115: Extracted 0 links.
Henderson-3399: Extracted 21 links.
Henderson-3133: Extracted 0 links.
Henderson-4413: Extracted 25 links.
Henderson-8141: Extracted 0 links.
Henderson-4841: Extracted 11 links.
Henderson-3343: Extracted 0 links.
Henderson-7211: Extracted 0 links.
Henderson-4832: Extracted 1 links.
Henderson-5179

In [8]:
with open("all_links.txt",'r') as f:
    all_links = f.readlines()
len(set(all_links))

13401

In [11]:
with open("qnaics_codes.txt", 'r') as f:
    codes = [line.strip() for line in f.readlines()]

    # Define counties
counties = [
        "Alexander", "Alleghany", "Ashe", "Avery", "Buncombe", "Burke",
        "Caldwell", "Catawba", "Clay", "Cleveland", "Gaston", "Haywood",
        "Henderson", "Jackson", "Lincoln", "Macon", "Madison", "McDowell",
        "Mecklenburg", "Mitchell", "Polk", "Rutherford", "Swain",
        "Transylvania", "Watauga", "Wilkes", "Yancey"
    ]

    # Combine counties and NAICS codes into pairs
tasks_data = [(county, code) for county in counties for code in codes]
batches = split_into_batches(tasks_data,20)
batches[18]

[('Watauga', '1125'),
 ('Watauga', '5621'),
 ('Watauga', '3345'),
 ('Watauga', '4872'),
 ('Watauga', '3379'),
 ('Watauga', '3118'),
 ('Watauga', '2361'),
 ('Watauga', '9231'),
 ('Watauga', '5416'),
 ('Watauga', '3314'),
 ('Watauga', '3122'),
 ('Watauga', '3369'),
 ('Watauga', '5322'),
 ('Watauga', '3131'),
 ('Watauga', '3336'),
 ('Watauga', '2121'),
 ('Watauga', '2372'),
 ('Watauga', '6215'),
 ('Watauga', '3353'),
 ('Watauga', '3149'),
 ('Watauga', '3399'),
 ('Watauga', '3121'),
 ('Watauga', '3256'),
 ('Watauga', '3279'),
 ('Watauga', '4821'),
 ('Watauga', '4859'),
 ('Watauga', '4241'),
 ('Watauga', '4452'),
 ('Watauga', '3259'),
 ('Watauga', '9261'),
 ('Watauga', '6213'),
 ('Watauga', '4853'),
 ('Watauga', '5231'),
 ('Watauga', '4841'),
 ('Watauga', '3115'),
 ('Watauga', '5112'),
 ('Watauga', '3152'),
 ('Watauga', '6233'),
 ('Watauga', '1122'),
 ('Watauga', '8121'),
 ('Watauga', '7212'),
 ('Watauga', '4812'),
 ('Watauga', '3151'),
 ('Watauga', '3325'),
 ('Watauga', '5173'),
 ('Watauga

In [15]:
with open("tasks_data.txt",'w') as f:
    for i in tasks_data:
        f.write(str(i))
        f.write("\n")

In [14]:
problematic_counties = ["Caldwell", "Cleveland","Gaston",'Rutherford','Lincoln',"Jackson",
                        "Madison",'Mecklenburg',"McDowell","Rutherford",'Watauga',"Wilkes","Polk"] 
problematic_tasks =  [(county, code) for county in problematic_counties for code in codes]

In [16]:
len(problematic_tasks)

4043

In [18]:
problematic_tasks

[('Caldwell', '3321'),
 ('Caldwell', '4884'),
 ('Caldwell', '2122'),
 ('Caldwell', '3313'),
 ('Caldwell', '5232'),
 ('Caldwell', '3132'),
 ('Caldwell', '3391'),
 ('Caldwell', '3365'),
 ('Caldwell', '1151'),
 ('Caldwell', '5324'),
 ('Caldwell', '4541'),
 ('Caldwell', '3344'),
 ('Caldwell', '4442'),
 ('Caldwell', '3111'),
 ('Caldwell', '3112'),
 ('Caldwell', '4512'),
 ('Caldwell', '5613'),
 ('Caldwell', '3333'),
 ('Caldwell', '6231'),
 ('Caldwell', '8132'),
 ('Caldwell', '4231'),
 ('Caldwell', '3326'),
 ('Caldwell', '3329'),
 ('Caldwell', '6223'),
 ('Caldwell', '5511'),
 ('Caldwell', '5239'),
 ('Caldwell', '5615'),
 ('Caldwell', '8139'),
 ('Caldwell', '6244'),
 ('Caldwell', '6221'),
 ('Caldwell', '5418'),
 ('Caldwell', '3211'),
 ('Caldwell', '4235'),
 ('Caldwell', '5412'),
 ('Caldwell', '5622'),
 ('Caldwell', '5612'),
 ('Caldwell', '4451'),
 ('Caldwell', '5191'),
 ('Caldwell', '3254'),
 ('Caldwell', '3212'),
 ('Caldwell', '4411'),
 ('Caldwell', '4233'),
 ('Caldwell', '6232'),
 ('Caldwell

In [19]:
with open("connecting.txt",'r') as f:
    finished_raw = f.readlines()
finished = []
for i in finished_raw:
    county, code = i.split(":")[0].split("-")
    finished.append((county, code))

In [21]:
res_tasks = []
for i in problematic_tasks:
    if i in finished:
        continue
    res_tasks.append(i)
res_tasks

[('Caldwell', '3391'),
 ('Caldwell', '3112'),
 ('Caldwell', '6231'),
 ('Caldwell', '3329'),
 ('Caldwell', '5511'),
 ('Caldwell', '5615'),
 ('Caldwell', '8139'),
 ('Caldwell', '6244'),
 ('Caldwell', '5412'),
 ('Caldwell', '5191'),
 ('Caldwell', '4233'),
 ('Caldwell', '3352'),
 ('Caldwell', '7224'),
 ('Caldwell', '4851'),
 ('Caldwell', '5629'),
 ('Caldwell', '7111'),
 ('Caldwell', '7114'),
 ('Caldwell', '7223'),
 ('Caldwell', '4422'),
 ('Caldwell', '4831'),
 ('Caldwell', '9241'),
 ('Caldwell', '6216'),
 ('Caldwell', '3345'),
 ('Caldwell', '9231'),
 ('Caldwell', '5416'),
 ('Caldwell', '3122'),
 ('Caldwell', '3256'),
 ('Caldwell', '4241'),
 ('Caldwell', '4452'),
 ('Caldwell', '4853'),
 ('Caldwell', '4841'),
 ('Caldwell', '3152'),
 ('Caldwell', '4812'),
 ('Caldwell', '1111'),
 ('Caldwell', '3341'),
 ('Caldwell', '9281'),
 ('Caldwell', '8123'),
 ('Caldwell', '5121'),
 ('Caldwell', '3251'),
 ('Caldwell', '5174'),
 ('Caldwell', '4539'),
 ('Caldwell', '5223'),
 ('Caldwell', '1113'),
 ('Caldwell

In [25]:
with open("res_tasks.txt", "w") as f:
    for i in res_tasks:
        f.write(i[0]+"-"+i[1])
        f.write("\n")

## Second trial

In [5]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
from datetime import datetime
import nest_asyncio

# Allow nested event loops in Jupyter Notebook
nest_asyncio.apply()

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

# Async function to fetch links for a single NAICS code
async def fetch_links(driver, naics_code):
    """
    Fetch all 'Detail' links for a given NAICS code.
    Handles pagination and logs progress to a worklog file.
    """
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        await asyncio.sleep(1)

        # Step 2: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "qnaics"))
        industry_dropdown.select_by_value(naics_code)

        # Step 3: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.form-group > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        await asyncio.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)
        print(f"Clicked the Submit button for NAICS: {naics_code}.")

        # Step 4: Wait for Results to Load and Process the New Tab's HTML
        await asyncio.sleep(3)  # Wait to ensure the response page is fully loaded
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
        print(f"Switched to the new tab for NAICS: {naics_code}.")

        # Initialize an empty list to store all links
        all_detail_links = []
        page_num = 1  # Initialize page counter

        while True:
            # Capture the HTML content of the current page
            result_html = driver.page_source
            soup = BeautifulSoup(result_html, 'html.parser')

            # Extract all links where the text is "Detail"
            detail_links = [link.get('href') for link in soup.find_all('a', string="Detail")]
            all_detail_links.extend(detail_links)
            print(f"NAICS: {naics_code}, Page {page_num}: Extracted {len(detail_links)} links.")

            # Write links to a temporary file
            timestamp = datetime.now()
            tmp_filename = f"tmp_{naics_code}_{timestamp.hour}_{timestamp.minute}.txt"
            with open(tmp_filename, "a") as file:
                for link in detail_links:
                    file.write(link + "\n")
            print(f"NAICS: {naics_code}, Page {page_num}: Wrote {len(detail_links)} links to {tmp_filename}")

            # Log progress to worklog
            with open("worklog.txt", "a") as log_file:
                log_file.write(f"[{timestamp}] NAICS: {naics_code}, Page: {page_num}, Links Extracted: {len(detail_links)}\n")

            # Find and click the "Next" button
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "a[rel='next']")
                if "disabled" in next_button.get_attribute("class"):
                    print(f"NAICS: {naics_code}, Page {page_num}: No more pages.")
                    break
                next_button.click()
                print(f"NAICS: {naics_code}, Clicked the 'Next' button to go to page {page_num + 1}.")
                page_num += 1
                await asyncio.sleep(2)  # Allow time for the next page to load
            except Exception:
                print(f"NAICS: {naics_code}, No 'Next' button found or an error occurred.")
                break

        print(f"NAICS: {naics_code}: Total links extracted: {len(all_detail_links)}")

        # Log final results
        with open("worklog.txt", "a") as log_file:
            log_file.write(f"[{datetime.now()}] Completed NAICS: {naics_code}. Total Links: {len(all_detail_links)}\n")

        # Close the result tab before returning to the main tab
        driver.close()
        await asyncio.sleep(1)  # Ensure tab closure completes
        driver.switch_to.window(driver.window_handles[0])
        print(f"Closed the result tab for NAICS: {naics_code}.")
    except Exception as e:
        print(f"Error processing NAICS: {naics_code}: {e}")

# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("qnaics_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Split the tasks into batches
    batches = split_into_batches(codes, 20)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Run tasks sequentially in the current batch to ensure proper tab handling
            for naics_code in batches[batch_idx]:
                await fetch_links(driver, naics_code)
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to tmp files.")
    finally:
        driver.quit()
        print("Closed the WebDriver.")



In [6]:
# Run the main function in Jupyter Notebook
await main(start_batch=1, end_batch=20)


Processing batch 1/20...


Batch 1/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 3321.
Switched to the new tab for NAICS: 3321.
NAICS: 3321, Page 1: Extracted 59 links.
NAICS: 3321, Page 1: Wrote 59 links to tmp_3321_4_2.txt
NAICS: 3321, No 'Next' button found or an error occurred.
NAICS: 3321: Total links extracted: 59
Closed the result tab for NAICS: 3321.
Clicked the Submit button for NAICS: 4884.
Switched to the new tab for NAICS: 4884.
NAICS: 4884, Page 1: Extracted 100 links.
NAICS: 4884, Page 1: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 2.
NAICS: 4884, Page 2: Extracted 100 links.
NAICS: 4884, Page 2: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 3.


  tag = self.element_classes.get(Tag, Tag)(


NAICS: 4884, Page 3: Extracted 100 links.
NAICS: 4884, Page 3: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 4.
NAICS: 4884, Page 4: Extracted 100 links.
NAICS: 4884, Page 4: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 5.
NAICS: 4884, Page 5: Extracted 100 links.
NAICS: 4884, Page 5: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 6.
NAICS: 4884, Page 6: Extracted 100 links.
NAICS: 4884, Page 6: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 7.
NAICS: 4884, Page 7: Extracted 100 links.
NAICS: 4884, Page 7: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 8.
NAICS: 4884, Page 8: Extracted 100 links.
NAICS: 4884, Page 8: Wrote 100 links to tmp_4884_4_2.txt
NAICS: 4884, Clicked the 'Next' button to go to page 9.
NAICS: 4884, Page 9: Extracted 54 links.
NAICS: 4884, Page 9: Wrote 54

Batch 2/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5613.
Switched to the new tab for NAICS: 5613.
NAICS: 5613, Page 1: Extracted 100 links.
NAICS: 5613, Page 1: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to go to page 2.
NAICS: 5613, Page 2: Extracted 100 links.
NAICS: 5613, Page 2: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to go to page 3.
NAICS: 5613, Page 3: Extracted 100 links.
NAICS: 5613, Page 3: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to go to page 4.
NAICS: 5613, Page 4: Extracted 100 links.
NAICS: 5613, Page 4: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to go to page 5.
NAICS: 5613, Page 5: Extracted 100 links.
NAICS: 5613, Page 5: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to go to page 6.
NAICS: 5613, Page 6: Extracted 100 links.
NAICS: 5613, Page 6: Wrote 100 links to tmp_5613_4_5.txt
NAICS: 5613, Clicked the 'Next' button to 

Batch 3/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 4235.
Switched to the new tab for NAICS: 4235.
NAICS: 4235, Page 1: Extracted 0 links.
NAICS: 4235, Page 1: Wrote 0 links to tmp_4235_4_20.txt
NAICS: 4235, Clicked the 'Next' button to go to page 2.
NAICS: 4235, Page 2: Extracted 93 links.
NAICS: 4235, Page 2: Wrote 93 links to tmp_4235_4_21.txt
NAICS: 4235, No 'Next' button found or an error occurred.
NAICS: 4235: Total links extracted: 93
Closed the result tab for NAICS: 4235.
Clicked the Submit button for NAICS: 5412.
Switched to the new tab for NAICS: 5412.
NAICS: 5412, Page 1: Extracted 100 links.
NAICS: 5412, Page 1: Wrote 100 links to tmp_5412_4_21.txt
NAICS: 5412, Clicked the 'Next' button to go to page 2.
NAICS: 5412, Page 2: Extracted 100 links.
NAICS: 5412, Page 2: Wrote 100 links to tmp_5412_4_21.txt
NAICS: 5412, Clicked the 'Next' button to go to page 3.
NAICS: 5412, Page 3: Extracted 100 links.
NAICS: 5412, Page 3: Wrote 100 links to tmp_5412_4_21.txt
NAICS: 5412, Clicked the 'Next' bu

Batch 4/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5242.
Switched to the new tab for NAICS: 5242.
NAICS: 5242, Page 1: Extracted 100 links.
NAICS: 5242, Page 1: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' button to go to page 2.
NAICS: 5242, Page 2: Extracted 100 links.
NAICS: 5242, Page 2: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' button to go to page 3.
NAICS: 5242, Page 3: Extracted 100 links.
NAICS: 5242, Page 3: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' button to go to page 4.
NAICS: 5242, Page 4: Extracted 100 links.
NAICS: 5242, Page 4: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' button to go to page 5.
NAICS: 5242, Page 5: Extracted 100 links.
NAICS: 5242, Page 5: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' button to go to page 6.
NAICS: 5242, Page 6: Extracted 100 links.
NAICS: 5242, Page 6: Wrote 100 links to tmp_5242_4_33.txt
NAICS: 5242, Clicked the 'Next' butt

Batch 5/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 8133.
Switched to the new tab for NAICS: 8133.
NAICS: 8133, Page 1: Extracted 0 links.
NAICS: 8133, Page 1: Wrote 0 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button to go to page 2.
NAICS: 8133, Page 2: Extracted 100 links.
NAICS: 8133, Page 2: Wrote 100 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button to go to page 3.
NAICS: 8133, Page 3: Extracted 100 links.
NAICS: 8133, Page 3: Wrote 100 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button to go to page 4.
NAICS: 8133, Page 4: Extracted 100 links.
NAICS: 8133, Page 4: Wrote 100 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button to go to page 5.
NAICS: 8133, Page 5: Extracted 100 links.
NAICS: 8133, Page 5: Wrote 100 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button to go to page 6.
NAICS: 8133, Page 6: Extracted 100 links.
NAICS: 8133, Page 6: Wrote 100 links to tmp_8133_4_52.txt
NAICS: 8133, Clicked the 'Next' button t

Batch 6/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 2212.
Switched to the new tab for NAICS: 2212.
NAICS: 2212, Page 1: Extracted 100 links.
NAICS: 2212, Page 1: Wrote 100 links to tmp_2212_5_20.txt
NAICS: 2212, Clicked the 'Next' button to go to page 2.
NAICS: 2212, Page 2: Extracted 49 links.
NAICS: 2212, Page 2: Wrote 49 links to tmp_2212_5_20.txt
NAICS: 2212, No 'Next' button found or an error occurred.
NAICS: 2212: Total links extracted: 149
Closed the result tab for NAICS: 2212.
Clicked the Submit button for NAICS: 4232.
Switched to the new tab for NAICS: 4232.
NAICS: 4232, Page 1: Extracted 100 links.
NAICS: 4232, Page 1: Wrote 100 links to tmp_4232_5_20.txt
NAICS: 4232, Clicked the 'Next' button to go to page 2.
NAICS: 4232, Page 2: Extracted 100 links.
NAICS: 4232, Page 2: Wrote 100 links to tmp_4232_5_21.txt
NAICS: 4232, Clicked the 'Next' button to go to page 3.
NAICS: 4232, Page 3: Extracted 66 links.
NAICS: 4232, Page 3: Wrote 66 links to tmp_4232_5_21.txt
NAICS: 4232, No 'Next' button f

Batch 7/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 1125.
Switched to the new tab for NAICS: 1125.
NAICS: 1125, Page 1: Extracted 0 links.
NAICS: 1125, Page 1: Wrote 0 links to tmp_1125_5_30.txt
NAICS: 1125, No 'Next' button found or an error occurred.
NAICS: 1125: Total links extracted: 0
Closed the result tab for NAICS: 1125.
Clicked the Submit button for NAICS: 5621.
Switched to the new tab for NAICS: 5621.
NAICS: 5621, Page 1: Extracted 100 links.
NAICS: 5621, Page 1: Wrote 100 links to tmp_5621_5_30.txt
NAICS: 5621, Clicked the 'Next' button to go to page 2.
NAICS: 5621, Page 2: Extracted 100 links.
NAICS: 5621, Page 2: Wrote 100 links to tmp_5621_5_31.txt
NAICS: 5621, Clicked the 'Next' button to go to page 3.
NAICS: 5621, Page 3: Extracted 100 links.
NAICS: 5621, Page 3: Wrote 100 links to tmp_5621_5_31.txt
NAICS: 5621, Clicked the 'Next' button to go to page 4.
NAICS: 5621, Page 4: Extracted 79 links.
NAICS: 5621, Page 4: Wrote 79 links to tmp_5621_5_31.txt
NAICS: 5621, No 'Next' button found

Batch 8/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 2372: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 9/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 5231: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 10/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 2373: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 11/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 7121: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 12/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 9221: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 13/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 1141: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 14/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 5222: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 15/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 5414: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 16/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 1153: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 17/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 3315: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 18/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 5617: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 19/20:   0%|          | 0/16 [00:00<?, ?it/s]

Error processing NAICS: 5152: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


Batch 20/20:   0%|          | 0/7 [00:00<?, ?it/s]

Error processing NAICS: 5611: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.86)
Stacktrace:
0   chromedriver                        0x0000000104817ac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000104810314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001042784b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000104253994 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x00000001042e2890 cxxbridge1$string$len + 524388
5   chromedriver                        0x00000001042f576c cxxbridge1$string$len + 601920
6   chromedriver                        0x00000001042b10b0 cxxbridge1$string$len + 321668
7   chromedriver                        0x00000001042b1d00 cxxbridge1$string$len + 324820
8   chromedriver                        0x00000001047e2e08 cxxbridge1$str$ptr + 3435328


In [11]:
with open("qnaics_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Split the tasks into batches
batches = split_into_batches(codes, 20)
batches[6]

['1125',
 '5621',
 '3345',
 '4872',
 '3379',
 '3118',
 '2361',
 '9231',
 '5416',
 '3314',
 '3122',
 '3369',
 '5322',
 '3131',
 '3336',
 '2121']

In [12]:
await main(start_batch=7,end_batch=20)

Processing batch 7/20...


Batch 7/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 1125.
Switched to the new tab for NAICS: 1125.
NAICS: 1125, Page 1: Extracted 0 links.
NAICS: 1125, Page 1: Wrote 0 links to tmp_1125_5_45.txt
NAICS: 1125, No 'Next' button found or an error occurred.
NAICS: 1125: Total links extracted: 0
Closed the result tab for NAICS: 1125.
Clicked the Submit button for NAICS: 5621.
Switched to the new tab for NAICS: 5621.
NAICS: 5621, Page 1: Extracted 100 links.
NAICS: 5621, Page 1: Wrote 100 links to tmp_5621_5_45.txt
NAICS: 5621, Clicked the 'Next' button to go to page 2.
NAICS: 5621, Page 2: Extracted 100 links.
NAICS: 5621, Page 2: Wrote 100 links to tmp_5621_5_45.txt
NAICS: 5621, Clicked the 'Next' button to go to page 3.
NAICS: 5621, Page 3: Extracted 100 links.
NAICS: 5621, Page 3: Wrote 100 links to tmp_5621_5_45.txt
NAICS: 5621, Clicked the 'Next' button to go to page 4.
NAICS: 5621, Page 4: Extracted 79 links.
NAICS: 5621, Page 4: Wrote 79 links to tmp_5621_5_45.txt
NAICS: 5621, No 'Next' button found

Batch 8/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 2372.
Switched to the new tab for NAICS: 2372.
NAICS: 2372, Page 1: Extracted 100 links.
NAICS: 2372, Page 1: Wrote 100 links to tmp_2372_6_0.txt
NAICS: 2372, Clicked the 'Next' button to go to page 2.
NAICS: 2372, Page 2: Extracted 100 links.
NAICS: 2372, Page 2: Wrote 100 links to tmp_2372_6_0.txt
NAICS: 2372, Clicked the 'Next' button to go to page 3.
NAICS: 2372, Page 3: Extracted 100 links.
NAICS: 2372, Page 3: Wrote 100 links to tmp_2372_6_0.txt
NAICS: 2372, Clicked the 'Next' button to go to page 4.
NAICS: 2372, Page 4: Extracted 83 links.
NAICS: 2372, Page 4: Wrote 83 links to tmp_2372_6_0.txt
NAICS: 2372, No 'Next' button found or an error occurred.
NAICS: 2372: Total links extracted: 383
Closed the result tab for NAICS: 2372.
Clicked the Submit button for NAICS: 6215.
Switched to the new tab for NAICS: 6215.
NAICS: 6215, Page 1: Extracted 100 links.
NAICS: 6215, Page 1: Wrote 100 links to tmp_6215_6_0.txt
NAICS: 6215, Clicked the 'Next' bu

Batch 9/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5231.
Switched to the new tab for NAICS: 5231.
NAICS: 5231, Page 1: Extracted 100 links.
NAICS: 5231, Page 1: Wrote 100 links to tmp_5231_6_26.txt
NAICS: 5231, Clicked the 'Next' button to go to page 2.
NAICS: 5231, Page 2: Extracted 100 links.
NAICS: 5231, Page 2: Wrote 100 links to tmp_5231_6_26.txt
NAICS: 5231, Clicked the 'Next' button to go to page 3.
NAICS: 5231, Page 3: Extracted 100 links.
NAICS: 5231, Page 3: Wrote 100 links to tmp_5231_6_26.txt
NAICS: 5231, Clicked the 'Next' button to go to page 4.
NAICS: 5231, Page 4: Extracted 100 links.
NAICS: 5231, Page 4: Wrote 100 links to tmp_5231_6_26.txt
NAICS: 5231, Clicked the 'Next' button to go to page 5.
NAICS: 5231, Page 5: Extracted 100 links.
NAICS: 5231, Page 5: Wrote 100 links to tmp_5231_6_26.txt
NAICS: 5231, Clicked the 'Next' button to go to page 6.
NAICS: 5231, Page 6: Extracted 100 links.
NAICS: 5231, Page 6: Wrote 100 links to tmp_5231_6_27.txt
NAICS: 5231, Clicked the 'Next' butt

Batch 10/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 2373.
Switched to the new tab for NAICS: 2373.
NAICS: 2373, Page 1: Extracted 100 links.
NAICS: 2373, Page 1: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' button to go to page 2.
NAICS: 2373, Page 2: Extracted 100 links.
NAICS: 2373, Page 2: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' button to go to page 3.
NAICS: 2373, Page 3: Extracted 100 links.
NAICS: 2373, Page 3: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' button to go to page 4.
NAICS: 2373, Page 4: Extracted 100 links.
NAICS: 2373, Page 4: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' button to go to page 5.
NAICS: 2373, Page 5: Extracted 100 links.
NAICS: 2373, Page 5: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' button to go to page 6.
NAICS: 2373, Page 6: Extracted 100 links.
NAICS: 2373, Page 6: Wrote 100 links to tmp_2373_6_38.txt
NAICS: 2373, Clicked the 'Next' butt

Batch 11/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 7121.
Switched to the new tab for NAICS: 7121.
NAICS: 7121, Page 1: Extracted 100 links.
NAICS: 7121, Page 1: Wrote 100 links to tmp_7121_6_41.txt
NAICS: 7121, Clicked the 'Next' button to go to page 2.
NAICS: 7121, Page 2: Extracted 100 links.
NAICS: 7121, Page 2: Wrote 100 links to tmp_7121_6_42.txt
NAICS: 7121, Clicked the 'Next' button to go to page 3.
NAICS: 7121, Page 3: Extracted 100 links.
NAICS: 7121, Page 3: Wrote 100 links to tmp_7121_6_42.txt
NAICS: 7121, Clicked the 'Next' button to go to page 4.
NAICS: 7121, Page 4: Extracted 100 links.
NAICS: 7121, Page 4: Wrote 100 links to tmp_7121_6_42.txt
NAICS: 7121, Clicked the 'Next' button to go to page 5.
NAICS: 7121, Page 5: Extracted 100 links.
NAICS: 7121, Page 5: Wrote 100 links to tmp_7121_6_42.txt
NAICS: 7121, Clicked the 'Next' button to go to page 6.
NAICS: 7121, Page 6: Extracted 100 links.
NAICS: 7121, Page 6: Wrote 100 links to tmp_7121_6_42.txt
NAICS: 7121, Clicked the 'Next' butt

Batch 12/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 9221.
Switched to the new tab for NAICS: 9221.
NAICS: 9221, Page 1: Extracted 0 links.
NAICS: 9221, Page 1: Wrote 0 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button to go to page 2.
NAICS: 9221, Page 2: Extracted 100 links.
NAICS: 9221, Page 2: Wrote 100 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button to go to page 3.
NAICS: 9221, Page 3: Extracted 100 links.
NAICS: 9221, Page 3: Wrote 100 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button to go to page 4.
NAICS: 9221, Page 4: Extracted 100 links.
NAICS: 9221, Page 4: Wrote 100 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button to go to page 5.
NAICS: 9221, Page 5: Extracted 100 links.
NAICS: 9221, Page 5: Wrote 100 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button to go to page 6.
NAICS: 9221, Page 6: Extracted 100 links.
NAICS: 9221, Page 6: Wrote 100 links to tmp_9221_6_48.txt
NAICS: 9221, Clicked the 'Next' button t

Batch 13/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 1141.
Switched to the new tab for NAICS: 1141.
NAICS: 1141, Page 1: Extracted 13 links.
NAICS: 1141, Page 1: Wrote 13 links to tmp_1141_6_56.txt
NAICS: 1141, No 'Next' button found or an error occurred.
NAICS: 1141: Total links extracted: 13
Closed the result tab for NAICS: 1141.
Clicked the Submit button for NAICS: 3322.
Switched to the new tab for NAICS: 3322.
NAICS: 3322, Page 1: Extracted 41 links.
NAICS: 3322, Page 1: Wrote 41 links to tmp_3322_6_56.txt
NAICS: 3322, No 'Next' button found or an error occurred.
NAICS: 3322: Total links extracted: 41
Closed the result tab for NAICS: 3322.
Clicked the Submit button for NAICS: 5251.
Switched to the new tab for NAICS: 5251.
NAICS: 5251, Page 1: Extracted 32 links.
NAICS: 5251, Page 1: Wrote 32 links to tmp_5251_6_57.txt
NAICS: 5251, No 'Next' button found or an error occurred.
NAICS: 5251: Total links extracted: 32
Closed the result tab for NAICS: 5251.
Clicked the Submit button for NAICS: 5211.
Swi

Batch 14/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5222.
Switched to the new tab for NAICS: 5222.
NAICS: 5222, Page 1: Extracted 100 links.
NAICS: 5222, Page 1: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' button to go to page 2.
NAICS: 5222, Page 2: Extracted 100 links.
NAICS: 5222, Page 2: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' button to go to page 3.
NAICS: 5222, Page 3: Extracted 100 links.
NAICS: 5222, Page 3: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' button to go to page 4.
NAICS: 5222, Page 4: Extracted 100 links.
NAICS: 5222, Page 4: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' button to go to page 5.
NAICS: 5222, Page 5: Extracted 100 links.
NAICS: 5222, Page 5: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' button to go to page 6.
NAICS: 5222, Page 6: Extracted 100 links.
NAICS: 5222, Page 6: Wrote 100 links to tmp_5222_7_26.txt
NAICS: 5222, Clicked the 'Next' butt

Batch 15/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5414.
Switched to the new tab for NAICS: 5414.
NAICS: 5414, Page 1: Extracted 100 links.
NAICS: 5414, Page 1: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' button to go to page 2.
NAICS: 5414, Page 2: Extracted 100 links.
NAICS: 5414, Page 2: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' button to go to page 3.
NAICS: 5414, Page 3: Extracted 100 links.
NAICS: 5414, Page 3: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' button to go to page 4.
NAICS: 5414, Page 4: Extracted 100 links.
NAICS: 5414, Page 4: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' button to go to page 5.
NAICS: 5414, Page 5: Extracted 100 links.
NAICS: 5414, Page 5: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' button to go to page 6.
NAICS: 5414, Page 6: Extracted 100 links.
NAICS: 5414, Page 6: Wrote 100 links to tmp_5414_7_33.txt
NAICS: 5414, Clicked the 'Next' butt

Batch 16/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 1153.
Switched to the new tab for NAICS: 1153.
NAICS: 1153, Page 1: Extracted 100 links.
NAICS: 1153, Page 1: Wrote 100 links to tmp_1153_7_44.txt
NAICS: 1153, Clicked the 'Next' button to go to page 2.
NAICS: 1153, Page 2: Extracted 25 links.
NAICS: 1153, Page 2: Wrote 25 links to tmp_1153_7_44.txt
NAICS: 1153, No 'Next' button found or an error occurred.
NAICS: 1153: Total links extracted: 125
Closed the result tab for NAICS: 1153.
Clicked the Submit button for NAICS: 4542.
Switched to the new tab for NAICS: 4542.
NAICS: 4542, Page 1: Extracted 0 links.
NAICS: 4542, Page 1: Wrote 0 links to tmp_4542_7_44.txt
NAICS: 4542, No 'Next' button found or an error occurred.
NAICS: 4542: Total links extracted: 0
Closed the result tab for NAICS: 4542.
Clicked the Submit button for NAICS: 4532.
Switched to the new tab for NAICS: 4532.
NAICS: 4532, Page 1: Extracted 0 links.
NAICS: 4532, Page 1: Wrote 0 links to tmp_4532_7_44.txt
NAICS: 4532, No 'Next' button 

Batch 17/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 3315.
Switched to the new tab for NAICS: 3315.
NAICS: 3315, Page 1: Extracted 34 links.
NAICS: 3315, Page 1: Wrote 34 links to tmp_3315_7_52.txt
NAICS: 3315, No 'Next' button found or an error occurred.
NAICS: 3315: Total links extracted: 34
Closed the result tab for NAICS: 3315.
Clicked the Submit button for NAICS: 4481.
Switched to the new tab for NAICS: 4481.
NAICS: 4481, Page 1: Extracted 0 links.
NAICS: 4481, Page 1: Wrote 0 links to tmp_4481_7_52.txt
NAICS: 4481, No 'Next' button found or an error occurred.
NAICS: 4481: Total links extracted: 0
Closed the result tab for NAICS: 4481.
Clicked the Submit button for NAICS: 1114.
Switched to the new tab for NAICS: 1114.
NAICS: 1114, Page 1: Extracted 100 links.
NAICS: 1114, Page 1: Wrote 100 links to tmp_1114_7_52.txt
NAICS: 1114, Clicked the 'Next' button to go to page 2.
NAICS: 1114, Page 2: Extracted 71 links.
NAICS: 1114, Page 2: Wrote 71 links to tmp_1114_7_52.txt
NAICS: 1114, No 'Next' button

Batch 18/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5617.
Switched to the new tab for NAICS: 5617.
NAICS: 5617, Page 1: Extracted 100 links.
NAICS: 5617, Page 1: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' button to go to page 2.
NAICS: 5617, Page 2: Extracted 100 links.
NAICS: 5617, Page 2: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' button to go to page 3.
NAICS: 5617, Page 3: Extracted 100 links.
NAICS: 5617, Page 3: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' button to go to page 4.
NAICS: 5617, Page 4: Extracted 100 links.
NAICS: 5617, Page 4: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' button to go to page 5.
NAICS: 5617, Page 5: Extracted 100 links.
NAICS: 5617, Page 5: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' button to go to page 6.
NAICS: 5617, Page 6: Extracted 100 links.
NAICS: 5617, Page 6: Wrote 100 links to tmp_5617_7_54.txt
NAICS: 5617, Clicked the 'Next' butt

Batch 19/20:   0%|          | 0/16 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5152.
Switched to the new tab for NAICS: 5152.
NAICS: 5152, Page 1: Extracted 0 links.
NAICS: 5152, Page 1: Wrote 0 links to tmp_5152_8_10.txt
NAICS: 5152, No 'Next' button found or an error occurred.
NAICS: 5152: Total links extracted: 0
Closed the result tab for NAICS: 5152.
Clicked the Submit button for NAICS: 3372.
Switched to the new tab for NAICS: 3372.
NAICS: 3372, Page 1: Extracted 93 links.
NAICS: 3372, Page 1: Wrote 93 links to tmp_3372_8_10.txt
NAICS: 3372, No 'Next' button found or an error occurred.
NAICS: 3372: Total links extracted: 93
Closed the result tab for NAICS: 3372.
Clicked the Submit button for NAICS: 4245.
Switched to the new tab for NAICS: 4245.
NAICS: 4245, Page 1: Extracted 100 links.
NAICS: 4245, Page 1: Wrote 100 links to tmp_4245_8_10.txt
NAICS: 4245, Clicked the 'Next' button to go to page 2.
NAICS: 4245, Page 2: Extracted 12 links.
NAICS: 4245, Page 2: Wrote 12 links to tmp_4245_8_11.txt
NAICS: 4245, No 'Next' button

Batch 20/20:   0%|          | 0/7 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 5611.
Switched to the new tab for NAICS: 5611.
NAICS: 5611, Page 1: Extracted 100 links.
NAICS: 5611, Page 1: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' button to go to page 2.
NAICS: 5611, Page 2: Extracted 100 links.
NAICS: 5611, Page 2: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' button to go to page 3.
NAICS: 5611, Page 3: Extracted 100 links.
NAICS: 5611, Page 3: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' button to go to page 4.
NAICS: 5611, Page 4: Extracted 100 links.
NAICS: 5611, Page 4: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' button to go to page 5.
NAICS: 5611, Page 5: Extracted 100 links.
NAICS: 5611, Page 5: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' button to go to page 6.
NAICS: 5611, Page 6: Extracted 100 links.
NAICS: 5611, Page 6: Wrote 100 links to tmp_5611_8_14.txt
NAICS: 5611, Clicked the 'Next' butt

## Task Eval

In [23]:
problematic_codes = []
problematic_start_codes = []

In [18]:
worklist = []
complete_log = []
with open("worklog.txt",'r') as f:
    for line in f.readlines():
        vals = line.strip().split("]")[1]
        vallst = vals.split(',')
        if len(vallst)==1:
            code, linknum = vals.split(".")
            complete_log.append((code,linknum))
            continue
        if len(vallst)!=3:
            print(vallst)
            continue
        naics, page, link_extracted = vals.split(",")
        worklist.append((naics, page, link_extracted))



In [24]:
for ele in complete_log:
    code, linknum = ele
    code = int(code.split(":")[1])
    linknum = int(linknum.split(":")[1])
    if linknum == 0:
        problematic_codes.append(code)

In [27]:
for ele in worklist:
    code, page, linknum = ele
    code = int(code.split(":")[1])
    page = int(page.split(":")[1])
    linknum = int(linknum.split(":")[1])
    # exclude already problematic
    if code in problematic_codes:
        continue
    if linknum == 0:
        if page != 1:
            problematic_codes.append(code)
        else:
            problematic_start_codes.append(code)

In [28]:
problematic_codes

[4541,
 4512,
 3211,
 5612,
 5191,
 6232,
 3352,
 4483,
 4533,
 2111,
 4422,
 4421,
 7213,
 4511,
 3351,
 1125,
 1125,
 3131,
 2121,
 4821,
 5112,
 5173,
 4543,
 8141,
 4862,
 5151,
 4539,
 3255,
 3117,
 5211,
 5179,
 4431,
 4542,
 4532,
 3274,
 4481,
 4471,
 4482,
 4461,
 4523,
 5111,
 4522,
 5152,
 4531,
 6241]

In [40]:
with open ("problematic_codes.txt",'w') as f:
    for c in problematic_codes:
        f.write(str(c))
        f.write("\n")

## Problematic_tasks

In [43]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
from datetime import datetime
import nest_asyncio

# Allow nested event loops in Jupyter Notebook
nest_asyncio.apply()

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

# Async function to fetch links for a single NAICS code
async def fetch_links(driver, naics_code):
    """
    Fetch all 'Detail' links for a given NAICS code.
    Handles pagination and logs progress to a worklog file.
    """
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        await asyncio.sleep(1)

        # Step 2: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "qnaics"))
        industry_dropdown.select_by_value(naics_code)

        # Step 3: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.form-group > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        await asyncio.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)
        print(f"Clicked the Submit button for NAICS: {naics_code}.")

        # Step 4: Wait for Results to Load and Process the New Tab's HTML
        await asyncio.sleep(5)  # Wait to ensure the response page is fully loaded
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
        print(f"Switched to the new tab for NAICS: {naics_code}.")

        # Initialize an empty list to store all links
        all_detail_links = []
        page_num = 1  # Initialize page counter

        while True:
            # Capture the HTML content of the current page
            result_html = driver.page_source
            soup = BeautifulSoup(result_html, 'html.parser')

            # Extract all links where the text is "Detail"
            detail_links = [link.get('href') for link in soup.find_all('a', string="Detail")]
            all_detail_links.extend(detail_links)
            print(f"NAICS: {naics_code}, Page {page_num}: Extracted {len(detail_links)} links.")

            # Write links to a temporary file
            timestamp = datetime.now()
            tmp_filename = f"problematic_{naics_code}_{timestamp.hour}_{timestamp.minute}.txt"
            with open(tmp_filename, "a") as file:
                for link in detail_links:
                    file.write(link + "\n")
            print(f"NAICS: {naics_code}, Page {page_num}: Wrote {len(detail_links)} links to {tmp_filename}")

            # Log progress to worklog
            with open("worklog.txt", "a") as log_file:
                log_file.write(f"[{timestamp}] NAICS: {naics_code}, Page: {page_num}, Links Extracted: {len(detail_links)}\n")

            # Find and click the "Next" button
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "a[rel='next']")
                if "disabled" in next_button.get_attribute("class"):
                    print(f"NAICS: {naics_code}, Page {page_num}: No more pages.")
                    break
                next_button.click()
                print(f"NAICS: {naics_code}, Clicked the 'Next' button to go to page {page_num + 1}.")
                page_num += 1
                await asyncio.sleep(2)  # Allow time for the next page to load
            except Exception:
                print(f"NAICS: {naics_code}, No 'Next' button found or an error occurred.")
                break

        print(f"NAICS: {naics_code}: Total links extracted: {len(all_detail_links)}")

        # Log final results
        with open("worklog.txt", "a") as log_file:
            log_file.write(f"[{datetime.now()}] Completed NAICS: {naics_code}. Total Links: {len(all_detail_links)}\n")

        # Close the result tab before returning to the main tab
        driver.close()
        await asyncio.sleep(1)  # Ensure tab closure completes
        driver.switch_to.window(driver.window_handles[0])
        print(f"Closed the result tab for NAICS: {naics_code}.")
    except Exception as e:
        print(f"Error processing NAICS: {naics_code}: {e}")

# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("problematic_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Split the tasks into batches
    batches = split_into_batches(codes, 4)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Run tasks sequentially in the current batch to ensure proper tab handling
            for naics_code in batches[batch_idx]:
                await fetch_links(driver, naics_code)
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to tmp files.")
    finally:
        driver.quit()
        print("Closed the WebDriver.")



In [34]:
split_into_batches(problematic_codes, 4)

[[4541, 4512, 3211, 5612, 5191, 6232, 3352, 4483, 4533, 2111, 4422, 4421],
 [7213, 4511, 3351, 1125, 1125, 3131, 2121, 4821, 5112, 5173, 4543, 8141],
 [4862, 5151, 4539, 3255, 3117, 5211, 5179, 4431, 4542, 4532, 3274, 4481],
 [4471, 4482, 4461, 4523, 5111, 4522, 5152, 4531, 6241]]

In [44]:
await main(start_batch=1, end_batch=4)

Processing batch 1/4...


Batch 1/4:   0%|          | 0/12 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 4541.
Switched to the new tab for NAICS: 4541.
NAICS: 4541, Page 1: Extracted 0 links.
NAICS: 4541, Page 1: Wrote 0 links to problematic_4541_11_51.txt
NAICS: 4541, No 'Next' button found or an error occurred.
NAICS: 4541: Total links extracted: 0
Closed the result tab for NAICS: 4541.
Clicked the Submit button for NAICS: 4512.
Switched to the new tab for NAICS: 4512.
NAICS: 4512, Page 1: Extracted 0 links.
NAICS: 4512, Page 1: Wrote 0 links to problematic_4512_11_51.txt
NAICS: 4512, No 'Next' button found or an error occurred.
NAICS: 4512: Total links extracted: 0
Closed the result tab for NAICS: 4512.
Clicked the Submit button for NAICS: 3211.
Switched to the new tab for NAICS: 3211.
NAICS: 3211, Page 1: Extracted 62 links.
NAICS: 3211, Page 1: Wrote 62 links to problematic_3211_11_51.txt
NAICS: 3211, No 'Next' button found or an error occurred.
NAICS: 3211: Total links extracted: 62
Closed the result tab for NAICS: 3211.
Clicked the Submit button

Batch 2/4:   0%|          | 0/12 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 7213.
Switched to the new tab for NAICS: 7213.
NAICS: 7213, Page 1: Extracted 95 links.
NAICS: 7213, Page 1: Wrote 95 links to problematic_7213_11_53.txt
NAICS: 7213, No 'Next' button found or an error occurred.
NAICS: 7213: Total links extracted: 95
Closed the result tab for NAICS: 7213.
Clicked the Submit button for NAICS: 4511.
Switched to the new tab for NAICS: 4511.
NAICS: 4511, Page 1: Extracted 0 links.
NAICS: 4511, Page 1: Wrote 0 links to problematic_4511_11_53.txt
NAICS: 4511, No 'Next' button found or an error occurred.
NAICS: 4511: Total links extracted: 0
Closed the result tab for NAICS: 4511.
Clicked the Submit button for NAICS: 3351.
Switched to the new tab for NAICS: 3351.
NAICS: 3351, Page 1: Extracted 33 links.
NAICS: 3351, Page 1: Wrote 33 links to problematic_3351_11_53.txt
NAICS: 3351, No 'Next' button found or an error occurred.
NAICS: 3351: Total links extracted: 33
Closed the result tab for NAICS: 3351.
Clicked the Submit but

Batch 3/4:   0%|          | 0/12 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 4862.
Switched to the new tab for NAICS: 4862.
NAICS: 4862, Page 1: Extracted 0 links.
NAICS: 4862, Page 1: Wrote 0 links to problematic_4862_11_54.txt
NAICS: 4862, No 'Next' button found or an error occurred.
NAICS: 4862: Total links extracted: 0
Closed the result tab for NAICS: 4862.
Clicked the Submit button for NAICS: 5151.
Switched to the new tab for NAICS: 5151.
NAICS: 5151, Page 1: Extracted 0 links.
NAICS: 5151, Page 1: Wrote 0 links to problematic_5151_11_54.txt
NAICS: 5151, No 'Next' button found or an error occurred.
NAICS: 5151: Total links extracted: 0
Closed the result tab for NAICS: 5151.
Clicked the Submit button for NAICS: 4539.
Switched to the new tab for NAICS: 4539.
NAICS: 4539, Page 1: Extracted 0 links.
NAICS: 4539, Page 1: Wrote 0 links to problematic_4539_11_55.txt
NAICS: 4539, No 'Next' button found or an error occurred.
NAICS: 4539: Total links extracted: 0
Closed the result tab for NAICS: 4539.
Clicked the Submit button fo

Batch 4/4:   0%|          | 0/9 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 4471.
Switched to the new tab for NAICS: 4471.
NAICS: 4471, Page 1: Extracted 0 links.
NAICS: 4471, Page 1: Wrote 0 links to problematic_4471_11_56.txt
NAICS: 4471, No 'Next' button found or an error occurred.
NAICS: 4471: Total links extracted: 0
Closed the result tab for NAICS: 4471.
Clicked the Submit button for NAICS: 4482.
Switched to the new tab for NAICS: 4482.
NAICS: 4482, Page 1: Extracted 0 links.
NAICS: 4482, Page 1: Wrote 0 links to problematic_4482_11_56.txt
NAICS: 4482, No 'Next' button found or an error occurred.
NAICS: 4482: Total links extracted: 0
Closed the result tab for NAICS: 4482.
Clicked the Submit button for NAICS: 4461.
Switched to the new tab for NAICS: 4461.
NAICS: 4461, Page 1: Extracted 0 links.
NAICS: 4461, Page 1: Wrote 0 links to problematic_4461_11_56.txt
NAICS: 4461, No 'Next' button found or an error occurred.
NAICS: 4461: Total links extracted: 0
Closed the result tab for NAICS: 4461.
Clicked the Submit button fo

## Problematic_start_codes

In [45]:
with open ("problematic_start_codes.txt",'w') as f:
    for c in problematic_start_codes:
        f.write(str(c))
        f.write("\n")

In [46]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
from datetime import datetime
import nest_asyncio

# Allow nested event loops in Jupyter Notebook
nest_asyncio.apply()

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

# Async function to fetch links for a single NAICS code
async def fetch_links(driver, naics_code):
    """
    Fetch all 'Detail' links for a given NAICS code.
    Handles pagination and logs progress to a worklog file.
    """
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        await asyncio.sleep(1)

        # Step 2: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "qnaics"))
        industry_dropdown.select_by_value(naics_code)

        # Step 3: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.form-group > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        await asyncio.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)
        print(f"Clicked the Submit button for NAICS: {naics_code}.")

        # Step 4: Wait for Results to Load and Process the New Tab's HTML
        await asyncio.sleep(5)  # Wait to ensure the response page is fully loaded
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
        print(f"Switched to the new tab for NAICS: {naics_code}.")

        # Initialize an empty list to store all links
        all_detail_links = []
        page_num = 1  # Initialize page counter

        while True:
            # Capture the HTML content of the current page
            result_html = driver.page_source
            soup = BeautifulSoup(result_html, 'html.parser')

            # Extract all links where the text is "Detail"
            detail_links = [link.get('href') for link in soup.find_all('a', string="Detail")]
            all_detail_links.extend(detail_links)
            print(f"NAICS: {naics_code}, Page {page_num}: Extracted {len(detail_links)} links.")

            # Write links to a temporary file
            timestamp = datetime.now()
            tmp_filename = f"problematic_start_{naics_code}_{timestamp.hour}_{timestamp.minute}.txt"
            with open(tmp_filename, "a") as file:
                for link in detail_links:
                    file.write(link + "\n")
            print(f"NAICS: {naics_code}, Page {page_num}: Wrote {len(detail_links)} links to {tmp_filename}")

            # Log progress to worklog
            with open("worklog.txt", "a") as log_file:
                log_file.write(f"[{timestamp}] NAICS: {naics_code}, Page: {page_num}, Links Extracted: {len(detail_links)}\n")

            break

            # Find and click the "Next" button
            # try:
            #     next_button = driver.find_element(By.CSS_SELECTOR, "a[rel='next']")
            #     if "disabled" in next_button.get_attribute("class"):
            #         print(f"NAICS: {naics_code}, Page {page_num}: No more pages.")
            #         break
            #     next_button.click()
            #     print(f"NAICS: {naics_code}, Clicked the 'Next' button to go to page {page_num + 1}.")
            #     page_num += 1
            #     await asyncio.sleep(2)  # Allow time for the next page to load
            # except Exception:
            #     print(f"NAICS: {naics_code}, No 'Next' button found or an error occurred.")
            #     break

        print(f"NAICS: {naics_code}: Total links extracted: {len(all_detail_links)}")

        # Log final results
        with open("worklog.txt", "a") as log_file:
            log_file.write(f"[{datetime.now()}] Completed NAICS: {naics_code}. Total Links: {len(all_detail_links)}\n")

        # Close the result tab before returning to the main tab
        driver.close()
        await asyncio.sleep(1)  # Ensure tab closure completes
        driver.switch_to.window(driver.window_handles[0])
        print(f"Closed the result tab for NAICS: {naics_code}.")
    except Exception as e:
        print(f"Error processing NAICS: {naics_code}: {e}")

# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("problematic_start_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Split the tasks into batches
    batches = split_into_batches(codes, 4)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Run tasks sequentially in the current batch to ensure proper tab handling
            for naics_code in batches[batch_idx]:
                await fetch_links(driver, naics_code)
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to tmp files.")
    finally:
        driver.quit()
        print("Closed the WebDriver.")



In [47]:
await main(start_batch=1, end_batch=4)

Processing batch 1/4...


Batch 1/4:   0%|          | 0/10 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 8132.
Switched to the new tab for NAICS: 8132.
NAICS: 8132, Page 1: Extracted 100 links.
NAICS: 8132, Page 1: Wrote 100 links to problematic_start_8132_12_10.txt
NAICS: 8132: Total links extracted: 100
Closed the result tab for NAICS: 8132.
Clicked the Submit button for NAICS: 5615.
Switched to the new tab for NAICS: 5615.
NAICS: 5615, Page 1: Extracted 100 links.
NAICS: 5615, Page 1: Wrote 100 links to problematic_start_5615_12_10.txt
NAICS: 5615: Total links extracted: 100
Closed the result tab for NAICS: 5615.
Clicked the Submit button for NAICS: 8139.
Switched to the new tab for NAICS: 8139.
NAICS: 8139, Page 1: Extracted 100 links.
NAICS: 8139, Page 1: Wrote 100 links to problematic_start_8139_12_10.txt
NAICS: 8139: Total links extracted: 100
Closed the result tab for NAICS: 8139.
Clicked the Submit button for NAICS: 6244.
Switched to the new tab for NAICS: 6244.
NAICS: 6244, Page 1: Extracted 100 links.
NAICS: 6244, Page 1: Wrote 100 links to 

Batch 2/4:   0%|          | 0/10 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 7111.
Switched to the new tab for NAICS: 7111.
NAICS: 7111, Page 1: Extracted 100 links.
NAICS: 7111, Page 1: Wrote 100 links to problematic_start_7111_12_11.txt
NAICS: 7111: Total links extracted: 100
Closed the result tab for NAICS: 7111.
Clicked the Submit button for NAICS: 7131.
Switched to the new tab for NAICS: 7131.
NAICS: 7131, Page 1: Extracted 100 links.
NAICS: 7131, Page 1: Wrote 100 links to problematic_start_7131_12_11.txt
NAICS: 7131: Total links extracted: 100
Closed the result tab for NAICS: 7131.
Clicked the Submit button for NAICS: 4441.
Switched to the new tab for NAICS: 4441.
NAICS: 4441, Page 1: Extracted 100 links.
NAICS: 4441, Page 1: Wrote 100 links to problematic_start_4441_12_11.txt
NAICS: 4441: Total links extracted: 100
Closed the result tab for NAICS: 4441.
Clicked the Submit button for NAICS: 9211.
Switched to the new tab for NAICS: 9211.
NAICS: 9211, Page 1: Extracted 100 links.
NAICS: 9211, Page 1: Wrote 100 links to 

Batch 3/4:   0%|          | 0/10 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 2123.
Switched to the new tab for NAICS: 2123.
NAICS: 2123, Page 1: Extracted 100 links.
NAICS: 2123, Page 1: Wrote 100 links to problematic_start_2123_12_13.txt
NAICS: 2123: Total links extracted: 100
Closed the result tab for NAICS: 2123.
Clicked the Submit button for NAICS: 4412.
Switched to the new tab for NAICS: 4412.
NAICS: 4412, Page 1: Extracted 100 links.
NAICS: 4412, Page 1: Wrote 100 links to problematic_start_4412_12_13.txt
NAICS: 4412: Total links extracted: 100
Closed the result tab for NAICS: 4412.
Clicked the Submit button for NAICS: 1121.
Switched to the new tab for NAICS: 1121.
NAICS: 1121, Page 1: Extracted 100 links.
NAICS: 1121, Page 1: Wrote 100 links to problematic_start_1121_12_13.txt
NAICS: 1121: Total links extracted: 100
Closed the result tab for NAICS: 1121.
Clicked the Submit button for NAICS: 6216.
Switched to the new tab for NAICS: 6216.
NAICS: 6216, Page 1: Extracted 100 links.
NAICS: 6216, Page 1: Wrote 100 links to 

Batch 4/4:   0%|          | 0/10 [00:00<?, ?it/s]

Clicked the Submit button for NAICS: 3149.
Switched to the new tab for NAICS: 3149.
NAICS: 3149, Page 1: Extracted 100 links.
NAICS: 3149, Page 1: Wrote 100 links to problematic_start_3149_12_14.txt
NAICS: 3149: Total links extracted: 100
Closed the result tab for NAICS: 3149.
Clicked the Submit button for NAICS: 3399.
Switched to the new tab for NAICS: 3399.
NAICS: 3399, Page 1: Extracted 100 links.
NAICS: 3399, Page 1: Wrote 100 links to problematic_start_3399_12_14.txt
NAICS: 3399: Total links extracted: 100
Closed the result tab for NAICS: 3399.
Clicked the Submit button for NAICS: 3279.
Switched to the new tab for NAICS: 3279.
NAICS: 3279, Page 1: Extracted 100 links.
NAICS: 3279, Page 1: Wrote 100 links to problematic_start_3279_12_14.txt
NAICS: 3279: Total links extracted: 100
Closed the result tab for NAICS: 3279.
Clicked the Submit button for NAICS: 4859.
Switched to the new tab for NAICS: 4859.
NAICS: 4859, Page 1: Extracted 100 links.
NAICS: 4859, Page 1: Wrote 100 links to 