## First trial

In [3]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
import time
import sys

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

# Async function to fetch links using Selenium
async def fetch_links(driver, county, naics_code):
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        time.sleep(2)

        # Step 2: Switch to "Advanced Search" Tab
        advanced_search_tab = driver.find_element(By.CSS_SELECTOR, "a[href='#Advanced_Search']")
        advanced_search_tab.click()

        # Wait for the Advanced Search section to load
        time.sleep(2)

        # Step 3: Select "Area Type" (e.g., "County")
        area_type_dropdown = Select(driver.find_element(By.ID, "regionCategory"))
        area_type_dropdown.select_by_visible_text("County")

        # Step 4: Select the Area (e.g., "Buncombe")
        area_dropdown = Select(driver.find_element(By.ID, "region"))
        area_dropdown.select_by_visible_text(county)

        # Step 5: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "IndustryGroup2"))
        industry_dropdown.select_by_value(naics_code)

        # Step 6: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.col-sm-10 > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)

        # Step 7: Wait for Results to Load and Process the New Tab's HTML
        time.sleep(3)  # Adjust as needed for loading time
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

        # Parse the HTML using BeautifulSoup
        result_html = driver.page_source
        soup = BeautifulSoup(result_html, 'html.parser')

        # Extract all links where the text is "Detail"
        detail_links = [link.get('href') for link in soup.find_all('a', string="Detail")]
        print(f"{county}-{naics_code}: Extracted {len(detail_links)} links.")

        # Write links to a file
        with open("all_links.txt", "a") as file:
            for link in detail_links:
                file.write(link + "\n")

        # Close the result tab
        driver.close()

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])

        return detail_links
    except Exception as e:
        print(f"Error processing {county}-{naics_code}: {e}")
        return []

# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("qnaics_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Define counties
    counties = [
        "Alexander", "Alleghany", "Ashe", "Avery", "Buncombe", "Burke",
        "Caldwell", "Catawba", "Clay", "Cleveland", "Gaston", "Haywood",
        "Henderson", "Jackson", "Lincoln", "Macon", "Madison", "McDowell",
        "Mecklenburg", "Mitchell", "Polk", "Rutherford", "Swain",
        "Transylvania", "Watauga", "Wilkes", "Yancey"
    ]

    # Combine counties and NAICS codes into pairs
    tasks_data = [(county, code) for county in counties for code in codes]

    # Split the tasks into 20 batches
    batches = split_into_batches(tasks_data, 20)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Create tasks for the current batch
            tasks = [
                fetch_links(driver, county, code)
                for county, code in batches[batch_idx]
            ]

            # Run all tasks in the current batch
            await asyncio.gather(*tasks)

            # Update progress bar
            for _ in tasks:
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to all_links.txt.")
    finally:
        driver.quit()


In [6]:
await main(start_batch=1, end_batch=5)

Processing batch 1/5...


Batch 1/5:   0%|          | 0/420 [00:00<?, ?it/s]

Alexander-1132: Extracted 0 links.
Alexander-3115: Extracted 0 links.
Alexander-3362: Extracted 1 links.
Alexander-5232: Extracted 0 links.
Alexander-8112: Extracted 0 links.
Alexander-3132: Extracted 1 links.
Alexander-4481: Extracted 0 links.
Alexander-6117: Extracted 1 links.
Alexander-4243: Extracted 0 links.
Alexander-5222: Extracted 6 links.
Alexander-2131: Extracted 0 links.
Alexander-4811: Extracted 0 links.
Alexander-3313: Extracted 0 links.
Alexander-4512: Extracted 0 links.
Alexander-3361: Extracted 1 links.
Alexander-4541: Extracted 0 links.
Alexander-2371: Extracted 1 links.
Alexander-4872: Extracted 0 links.
Alexander-4832: Extracted 0 links.
Alexander-5324: Extracted 0 links.
Alexander-5182: Extracted 1 links.
Alexander-3353: Extracted 0 links.
Alexander-4431: Extracted 0 links.
Alexander-3371: Extracted 11 links.
Alexander-4921: Extracted 0 links.
Alexander-3333: Extracted 0 links.
Alexander-3379: Extracted 3 links.
Alexander-4869: Extracted 0 links.
Alexander-3274: Ext

Batch 2/5:   0%|          | 0/420 [00:00<?, ?it/s]

Alleghany-7131: Extracted 0 links.
Alleghany-2362: Extracted 6 links.
Alleghany-8113: Extracted 0 links.
Alleghany-2373: Extracted 4 links.
Alleghany-4239: Extracted 1 links.
Alleghany-4862: Extracted 0 links.
Alleghany-4533: Extracted 0 links.
Alleghany-1125: Extracted 0 links.
Alleghany-3323: Extracted 0 links.
Alleghany-3118: Extracted 0 links.
Alleghany-4532: Extracted 0 links.
Alleghany-3212: Extracted 0 links.
Alleghany-3222: Extracted 0 links.
Alleghany-3372: Extracted 0 links.
Alleghany-5613: Extracted 0 links.
Alleghany-8141: Extracted 0 links.
Alleghany-3364: Extracted 0 links.
Alleghany-5415: Extracted 1 links.
Alleghany-3261: Extracted 0 links.
Alleghany-1129: Extracted 0 links.
Alleghany-5122: Extracted 0 links.
Alleghany-4531: Extracted 0 links.
Alleghany-6213: Extracted 10 links.
Alleghany-3241: Extracted 0 links.
Alleghany-7115: Extracted 1 links.
Alleghany-3262: Extracted 1 links.
Alleghany-5111: Extracted 0 links.
Alleghany-2111: Extracted 0 links.
Alleghany-3321: Ext

Batch 3/5:   0%|          | 0/420 [00:00<?, ?it/s]

Ashe-7139: Extracted 4 links.
Ashe-5417: Extracted 2 links.
Ashe-6222: Extracted 0 links.
Ashe-2383: Extracted 9 links.
Ashe-5239: Extracted 8 links.
Ashe-4482: Extracted 0 links.
Ashe-6241: Extracted 15 links.
Ashe-8129: Extracted 6 links.
Ashe-8131: Extracted 67 links.
Ashe-5614: Extracted 1 links.
Ashe-1119: Extracted 16 links.
Ashe-4483: Extracted 0 links.
Ashe-5617: Extracted 26 links.
Ashe-3159: Extracted 0 links.
Ashe-2382: Extracted 27 links.
Ashe-4235: Extracted 0 links.
Ashe-3219: Extracted 3 links.
Ashe-4249: Extracted 4 links.
Ashe-8133: Extracted 17 links.
Ashe-6239: Extracted 1 links.
Ashe-9241: Extracted 2 links.
Ashe-5174: Extracted 0 links.
Ashe-5611: Extracted 0 links.
Ashe-3366: Extracted 0 links.
Ashe-5511: Extracted 4 links.
Ashe-4442: Extracted 12 links.
Ashe-3221: Extracted 0 links.
Ashe-3363: Extracted 0 links.
Ashe-3256: Extracted 0 links.
Ashe-9281: Extracted 0 links.
Ashe-4841: Extracted 0 links.
Ashe-3114: Extracted 0 links.
Ashe-6114: Extracted 0 links.
Ash

Batch 4/5:   0%|          | 0/420 [00:00<?, ?it/s]

Buncombe-2371: Extracted 24 links.
Buncombe-4872: Extracted 1 links.
Buncombe-4832: Extracted 0 links.
Buncombe-5324: Extracted 6 links.
Buncombe-5182: Extracted 23 links.
Buncombe-3353: Extracted 2 links.
Buncombe-4431: Extracted 0 links.
Buncombe-3371: Extracted 33 links.
Buncombe-4921: Extracted 3 links.
Buncombe-3333: Extracted 0 links.
Buncombe-3379: Extracted 1 links.
Buncombe-4869: Extracted 0 links.
Buncombe-3274: Extracted 0 links.
Buncombe-4232: Extracted 4 links.
Buncombe-3322: Extracted 0 links.
Buncombe-5312: Extracted 100 links.
Buncombe-2123: Extracted 4 links.
Buncombe-3253: Extracted 1 links.
Buncombe-4511: Extracted 0 links.
Buncombe-3344: Extracted 0 links.
Buncombe-4247: Extracted 14 links.
Buncombe-3326: Extracted 0 links.
Buncombe-3342: Extracted 2 links.
Buncombe-3332: Extracted 5 links.
Buncombe-5231: Extracted 0 links.
Buncombe-6115: Extracted 9 links.
Buncombe-1141: Extracted 0 links.
Buncombe-3272: Extracted 3 links.
Buncombe-6112: Extracted 4 links.
Buncombe

Batch 5/5:   0%|          | 0/420 [00:00<?, ?it/s]

Burke-3364: Extracted 3 links.
Burke-5415: Extracted 6 links.
Burke-3261: Extracted 0 links.
Burke-1129: Extracted 0 links.
Burke-5122: Extracted 0 links.
Burke-4531: Extracted 0 links.
Burke-6213: Extracted 100 links.
Burke-3241: Extracted 0 links.
Burke-7115: Extracted 8 links.
Burke-3262: Extracted 0 links.
Burke-5111: Extracted 0 links.
Burke-2111: Extracted 1 links.
Burke-3321: Extracted 1 links.
Burke-5313: Extracted 6 links.
Burke-1131: Extracted 0 links.
Burke-9221: Extracted 36 links.
Burke-3399: Extracted 5 links.
Burke-4931: Extracted 4 links.
Burke-7112: Extracted 1 links.
Burke-4542: Extracted 0 links.
Burke-4543: Extracted 0 links.
Burke-4244: Extracted 2 links.
Burke-3169: Extracted 0 links.
Burke-4412: Extracted 6 links.
Burke-5612: Extracted 0 links.
Burke-4453: Extracted 2 links.
Burke-4451: Extracted 48 links.
Burke-3336: Extracted 1 links.
Burke-5179: Extracted 0 links.
Burke-3131: Extracted 0 links.
Burke-2212: Extracted 0 links.
Burke-1114: Extracted 3 links.
Burk

In [3]:
await main(start_batch=6, end_batch=7)

Processing batch 6/7...


Batch 6/7:   0%|          | 0/420 [00:00<?, ?it/s]

Caldwell-9251: Extracted 0 links.
Caldwell-3274: Extracted 0 links.
Caldwell-4453: Extracted 3 links.
Caldwell-2131: Extracted 0 links.
Caldwell-5414: Extracted 8 links.
Caldwell-1129: Extracted 3 links.
Caldwell-7132: Extracted 1 links.
Caldwell-3219: Extracted 11 links.
Caldwell-6233: Extracted 4 links.
Caldwell-6214: Extracted 7 links.
Caldwell-3118: Extracted 2 links.
Caldwell-4511: Extracted 0 links.
Caldwell-3371: Extracted 14 links.
Caldwell-3254: Extracted 1 links.
Caldwell-4237: Extracted 0 links.
Caldwell-3221: Extracted 0 links.
Caldwell-3116: Extracted 0 links.
Caldwell-4512: Extracted 0 links.
Caldwell-6239: Extracted 2 links.
Caldwell-5411: Extracted 27 links.
Caldwell-5617: Extracted 42 links.
Caldwell-3113: Extracted 0 links.
Caldwell-2382: Extracted 37 links.
Caldwell-5231: Extracted 10 links.
Caldwell-5312: Extracted 43 links.
Caldwell-7112: Extracted 2 links.
Caldwell-2361: Extracted 34 links.
Caldwell-4884: Extracted 8 links.
Caldwell-8131: Extracted 100 links.
Cald

Batch 7/7:   0%|          | 0/420 [00:00<?, ?it/s]

Clay-4881: Extracted 0 links.
Clay-1152: Extracted 2 links.
Clay-5242: Extracted 7 links.
Clay-5191: Extracted 0 links.
Clay-5415: Extracted 1 links.
Clay-3366: Extracted 0 links.
Clay-1122: Extracted 0 links.
Clay-3114: Extracted 0 links.
Clay-4523: Extracted 0 links.
Clay-7223: Extracted 3 links.
Clay-4871: Extracted 0 links.
Clay-5223: Extracted 0 links.
Clay-3322: Extracted 0 links.
Clay-6241: Extracted 10 links.
Clay-4883: Extracted 0 links.
Clay-4921: Extracted 0 links.
Clay-5619: Extracted 4 links.
Clay-3315: Extracted 0 links.
Clay-1123: Extracted 0 links.
Clay-1141: Extracted 0 links.
Clay-3333: Extracted 0 links.
Clay-5621: Extracted 1 links.
Clay-3169: Extracted 0 links.
Clay-6232: Extracted 0 links.
Clay-9231: Extracted 2 links.
Clay-1119: Extracted 3 links.
Clay-2122: Extracted 0 links.
Clay-3115: Extracted 0 links.
Clay-5173: Extracted 0 links.
Clay-4241: Extracted 0 links.
Clay-3325: Extracted 0 links.
Clay-8134: Extracted 4 links.
Clay-3121: Extracted 4 links.
Clay-4862

In [4]:
await main(start_batch=9, end_batch=10)

Processing batch 9/10...


Batch 9/10:   0%|          | 0/420 [00:00<?, ?it/s]

Gaston-3116: Extracted 0 links.
Gaston-4512: Extracted 0 links.
Gaston-6239: Extracted 1 links.
Gaston-5411: Extracted 100 links.
Gaston-5617: Extracted 100 links.
Gaston-3113: Extracted 0 links.
Gaston-2382: Extracted 100 links.
Gaston-5231: Extracted 35 links.
Gaston-5312: Extracted 100 links.
Gaston-7112: Extracted 3 links.
Gaston-2361: Extracted 100 links.
Gaston-4884: Extracted 21 links.
Gaston-8131: Extracted 100 links.
Gaston-7139: Extracted 59 links.
Gaston-3313: Extracted 0 links.
Gaston-3271: Extracted 0 links.
Gaston-8122: Extracted 26 links.
Gaston-4236: Extracted 24 links.
Gaston-3339: Extracted 12 links.
Gaston-5251: Extracted 0 links.
Gaston-3363: Extracted 2 links.
Gaston-4421: Extracted 0 links.
Gaston-4248: Extracted 0 links.
Gaston-4441: Extracted 59 links.
Gaston-3259: Extracted 6 links.
Gaston-6215: Extracted 0 links.
Gaston-3253: Extracted 1 links.
Gaston-3324: Extracted 1 links.
Gaston-4249: Extracted 6 links.
Gaston-6114: Extracted 6 links.
Gaston-4821: Extracte

Batch 10/10:   0%|          | 0/420 [00:00<?, ?it/s]

Henderson-5619: Extracted 16 links.
Henderson-3315: Extracted 0 links.
Henderson-1123: Extracted 0 links.
Henderson-1141: Extracted 0 links.
Henderson-3333: Extracted 4 links.
Henderson-5621: Extracted 11 links.
Henderson-3169: Extracted 0 links.
Henderson-6232: Extracted 0 links.
Henderson-9231: Extracted 6 links.
Henderson-1119: Extracted 0 links.
Henderson-2122: Extracted 0 links.
Henderson-3115: Extracted 0 links.
Henderson-5173: Extracted 0 links.
Henderson-4241: Extracted 1 links.
Henderson-3325: Extracted 0 links.
Henderson-8134: Extracted 0 links.
Henderson-3121: Extracted 19 links.
Henderson-4862: Extracted 0 links.
Henderson-3241: Extracted 0 links.
Henderson-7115: Extracted 0 links.
Henderson-3399: Extracted 21 links.
Henderson-3133: Extracted 0 links.
Henderson-4413: Extracted 25 links.
Henderson-8141: Extracted 0 links.
Henderson-4841: Extracted 11 links.
Henderson-3343: Extracted 0 links.
Henderson-7211: Extracted 0 links.
Henderson-4832: Extracted 1 links.
Henderson-5179

In [8]:
with open("all_links.txt",'r') as f:
    all_links = f.readlines()
len(set(all_links))

13401

In [11]:
with open("qnaics_codes.txt", 'r') as f:
    codes = [line.strip() for line in f.readlines()]

    # Define counties
counties = [
        "Alexander", "Alleghany", "Ashe", "Avery", "Buncombe", "Burke",
        "Caldwell", "Catawba", "Clay", "Cleveland", "Gaston", "Haywood",
        "Henderson", "Jackson", "Lincoln", "Macon", "Madison", "McDowell",
        "Mecklenburg", "Mitchell", "Polk", "Rutherford", "Swain",
        "Transylvania", "Watauga", "Wilkes", "Yancey"
    ]

    # Combine counties and NAICS codes into pairs
tasks_data = [(county, code) for county in counties for code in codes]
batches = split_into_batches(tasks_data,20)
batches[18]

[('Watauga', '1125'),
 ('Watauga', '5621'),
 ('Watauga', '3345'),
 ('Watauga', '4872'),
 ('Watauga', '3379'),
 ('Watauga', '3118'),
 ('Watauga', '2361'),
 ('Watauga', '9231'),
 ('Watauga', '5416'),
 ('Watauga', '3314'),
 ('Watauga', '3122'),
 ('Watauga', '3369'),
 ('Watauga', '5322'),
 ('Watauga', '3131'),
 ('Watauga', '3336'),
 ('Watauga', '2121'),
 ('Watauga', '2372'),
 ('Watauga', '6215'),
 ('Watauga', '3353'),
 ('Watauga', '3149'),
 ('Watauga', '3399'),
 ('Watauga', '3121'),
 ('Watauga', '3256'),
 ('Watauga', '3279'),
 ('Watauga', '4821'),
 ('Watauga', '4859'),
 ('Watauga', '4241'),
 ('Watauga', '4452'),
 ('Watauga', '3259'),
 ('Watauga', '9261'),
 ('Watauga', '6213'),
 ('Watauga', '4853'),
 ('Watauga', '5231'),
 ('Watauga', '4841'),
 ('Watauga', '3115'),
 ('Watauga', '5112'),
 ('Watauga', '3152'),
 ('Watauga', '6233'),
 ('Watauga', '1122'),
 ('Watauga', '8121'),
 ('Watauga', '7212'),
 ('Watauga', '4812'),
 ('Watauga', '3151'),
 ('Watauga', '3325'),
 ('Watauga', '5173'),
 ('Watauga

In [15]:
with open("tasks_data.txt",'w') as f:
    for i in tasks_data:
        f.write(str(i))
        f.write("\n")

In [14]:
problematic_counties = ["Caldwell", "Cleveland","Gaston",'Rutherford','Lincoln',"Jackson",
                        "Madison",'Mecklenburg',"McDowell","Rutherford",'Watauga',"Wilkes","Polk"] 
problematic_tasks =  [(county, code) for county in problematic_counties for code in codes]

In [16]:
len(problematic_tasks)

4043

In [18]:
problematic_tasks

[('Caldwell', '3321'),
 ('Caldwell', '4884'),
 ('Caldwell', '2122'),
 ('Caldwell', '3313'),
 ('Caldwell', '5232'),
 ('Caldwell', '3132'),
 ('Caldwell', '3391'),
 ('Caldwell', '3365'),
 ('Caldwell', '1151'),
 ('Caldwell', '5324'),
 ('Caldwell', '4541'),
 ('Caldwell', '3344'),
 ('Caldwell', '4442'),
 ('Caldwell', '3111'),
 ('Caldwell', '3112'),
 ('Caldwell', '4512'),
 ('Caldwell', '5613'),
 ('Caldwell', '3333'),
 ('Caldwell', '6231'),
 ('Caldwell', '8132'),
 ('Caldwell', '4231'),
 ('Caldwell', '3326'),
 ('Caldwell', '3329'),
 ('Caldwell', '6223'),
 ('Caldwell', '5511'),
 ('Caldwell', '5239'),
 ('Caldwell', '5615'),
 ('Caldwell', '8139'),
 ('Caldwell', '6244'),
 ('Caldwell', '6221'),
 ('Caldwell', '5418'),
 ('Caldwell', '3211'),
 ('Caldwell', '4235'),
 ('Caldwell', '5412'),
 ('Caldwell', '5622'),
 ('Caldwell', '5612'),
 ('Caldwell', '4451'),
 ('Caldwell', '5191'),
 ('Caldwell', '3254'),
 ('Caldwell', '3212'),
 ('Caldwell', '4411'),
 ('Caldwell', '4233'),
 ('Caldwell', '6232'),
 ('Caldwell

In [19]:
with open("connecting.txt",'r') as f:
    finished_raw = f.readlines()
finished = []
for i in finished_raw:
    county, code = i.split(":")[0].split("-")
    finished.append((county, code))

In [21]:
res_tasks = []
for i in problematic_tasks:
    if i in finished:
        continue
    res_tasks.append(i)
res_tasks

[('Caldwell', '3391'),
 ('Caldwell', '3112'),
 ('Caldwell', '6231'),
 ('Caldwell', '3329'),
 ('Caldwell', '5511'),
 ('Caldwell', '5615'),
 ('Caldwell', '8139'),
 ('Caldwell', '6244'),
 ('Caldwell', '5412'),
 ('Caldwell', '5191'),
 ('Caldwell', '4233'),
 ('Caldwell', '3352'),
 ('Caldwell', '7224'),
 ('Caldwell', '4851'),
 ('Caldwell', '5629'),
 ('Caldwell', '7111'),
 ('Caldwell', '7114'),
 ('Caldwell', '7223'),
 ('Caldwell', '4422'),
 ('Caldwell', '4831'),
 ('Caldwell', '9241'),
 ('Caldwell', '6216'),
 ('Caldwell', '3345'),
 ('Caldwell', '9231'),
 ('Caldwell', '5416'),
 ('Caldwell', '3122'),
 ('Caldwell', '3256'),
 ('Caldwell', '4241'),
 ('Caldwell', '4452'),
 ('Caldwell', '4853'),
 ('Caldwell', '4841'),
 ('Caldwell', '3152'),
 ('Caldwell', '4812'),
 ('Caldwell', '1111'),
 ('Caldwell', '3341'),
 ('Caldwell', '9281'),
 ('Caldwell', '8123'),
 ('Caldwell', '5121'),
 ('Caldwell', '3251'),
 ('Caldwell', '5174'),
 ('Caldwell', '4539'),
 ('Caldwell', '5223'),
 ('Caldwell', '1113'),
 ('Caldwell

In [25]:
with open("res_tasks.txt", "w") as f:
    for i in res_tasks:
        f.write(i[0]+"-"+i[1])
        f.write("\n")

## Second trial

In [None]:
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import asyncio
from tqdm.notebook import tqdm
import time
import sys

# Function to split data into batches
def split_into_batches(data, num_batches):
    """
    Splits a list into a specified number of batches.
    """
    batch_size = math.ceil(len(data) / num_batches)
    return [data[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]
async def fetch_links(driver, county, naics_code):
    try:
        # Step 1: Navigate to the Page
        url = "https://accessnc.nccommerce.com/business/business_custom_search_infogroup.html"
        driver.get(url)

        # Wait for the page to load
        time.sleep(2)

        # Step 2: Switch to "Advanced Search" Tab
        advanced_search_tab = driver.find_element(By.CSS_SELECTOR, "a[href='#Advanced_Search']")
        advanced_search_tab.click()

        # Wait for the Advanced Search section to load
        time.sleep(2)

        # Step 3: Select "Area Type" (e.g., "County")
        area_type_dropdown = Select(driver.find_element(By.ID, "regionCategory"))
        area_type_dropdown.select_by_visible_text("County")

        # Step 4: Select the Area (e.g., "Buncombe")
        area_dropdown = Select(driver.find_element(By.ID, "region"))
        area_dropdown.select_by_visible_text(county)

        # Step 5: Select Industry Group (e.g., NAICS Code)
        industry_dropdown = Select(driver.find_element(By.ID, "IndustryGroup2"))
        industry_dropdown.select_by_value(naics_code)

        # Step 6: Interact with the Submit Button
        submit_button = driver.find_element(By.CSS_SELECTOR, "div.col-sm-10 > button[type='submit'][class='btn btn-primary'][name='submit']")
        driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", submit_button)

        # Step 7: Wait for Results to Load and Process the New Tab's HTML
        time.sleep(3)  # Adjust as needed for loading time
        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

        # Loop through all pages
        detail_links = []
        while True:
            # Parse the HTML using BeautifulSoup
            result_html = driver.page_source
            soup = BeautifulSoup(result_html, 'html.parser')

            # Extract all links where the text is "Detail"
            links = [link.get('href') for link in soup.find_all('a', string="Detail")]
            detail_links.extend(links)
            print(f"Extracted {len(links)} links from the current page.")

            # Find the "Next" button and click it, if available
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "a[aria-label='Next']")
                if "disabled" in next_button.get_attribute("class"):
                    print("No more pages.")
                    break
                next_button.click()
                time.sleep(2)  # Allow time for the next page to load
            except Exception:
                print("No 'Next' button found or an error occurred.")
                break

        # Write links to a file
        with open("all_links.txt", "a") as file:
            for link in detail_links:
                file.write(link + "\n")

        print(f"{county}-{naics_code}: Extracted {len(detail_links)} total links.")

        # Close the result tab
        driver.close()

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])

        return detail_links
    except Exception as e:
        print(f"Error processing {county}-{naics_code}: {e}")
        return []
# Async main function
async def main(start_batch, end_batch):
    # Read NAICS codes from the file
    with open("qnaics_codes.txt", 'r') as f:
        codes = [line.strip() for line in f.readlines()]

    # Define counties
    counties = [
        "Alexander", "Alleghany", "Ashe", "Avery", "Buncombe", "Burke",
        "Caldwell", "Catawba", "Clay", "Cleveland", "Gaston", "Haywood",
        "Henderson", "Jackson", "Lincoln", "Macon", "Madison", "McDowell",
        "Mecklenburg", "Mitchell", "Polk", "Rutherford", "Swain",
        "Transylvania", "Watauga", "Wilkes", "Yancey"
    ]

    # Combine counties and NAICS codes into pairs
    tasks_data = [(county, code) for county in counties for code in codes][:100]

    # Split the tasks into 20 batches
    batches = split_into_batches(tasks_data, 20)

    # Validate batch range
    if start_batch < 1 or end_batch > len(batches):
        print(f"Invalid batch range: {start_batch}-{end_batch}. Must be between 1 and {len(batches)}.")
        return

    # Create a WebDriver instance
    driver = webdriver.Chrome()

    try:
        for batch_idx in range(start_batch - 1, end_batch):
            print(f"Processing batch {batch_idx + 1}/{end_batch}...")
            progress_bar = tqdm(total=len(batches[batch_idx]), desc=f"Batch {batch_idx + 1}/{end_batch}")

            # Create tasks for the current batch
            tasks = [
                fetch_links(driver, county, code)
                for county, code in batches[batch_idx]
            ]

            # Run all tasks in the current batch
            await asyncio.gather(*tasks)

            # Update progress bar
            for _ in tasks:
                progress_bar.update(1)

            progress_bar.close()
            print(f"Batch {batch_idx + 1} completed.")
            await asyncio.sleep(5)  # Optional rest between batches

        print(f"All tasks from batch {start_batch} to {end_batch} completed. Links are written to all_links.txt.")
    finally:
        driver.quit()
