<a href="https://colab.research.google.com/github/RevazRevazashvili/DataScraping/blob/main/ncbiotech_directory_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install cloudscraper pandas beautifulsoup4

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.7/99.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cloudscraper
Successfully installed cloudscraper-1.2.71


In [8]:

import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import threading

# Thread-local storage for scrapers to avoid conflicts
thread_local = threading.local()


def get_scraper():
    """Get a thread-local scraper instance"""
    if not hasattr(thread_local, 'scraper'):
        thread_local.scraper = cloudscraper.create_scraper()
    return thread_local.scraper


def get_urls(url):
    s = get_scraper()
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    urls_amount = soup.find('div',
                            class_='views-element-container block block-views block-views-blockcompany-directory-search-company-directory-search-block').find(
        'header').find('p').find('span').text.split()[-1]
    pagination = int(urls_amount) // 50 + 1
    urls = []

    print(f"Found {urls_amount} companies, fetching URLs across {pagination} pages...")

    for i in tqdm(range(pagination), desc="Fetching company URLs", unit="page"):
        r = s.get(f"{url}&page={i}")
        soup = BeautifulSoup(r.text, 'html.parser')
        raw_urls = [a.find('a') for a in soup.find_all('td', class_='views-field views-field-title')]
        urls.extend(["https://directory.ncbiotech.org" + a.get('href') for a in raw_urls])

        # Small delay to be respectful to the server
        time.sleep(0.1)

    return urls


def scrape_single_url(url):
    """Scrape a single company URL - designed for parallel execution"""
    try:
        s = get_scraper()
        res = s.get(url)
        inner_soup = BeautifulSoup(res.text, 'html.parser')

        # Basic company info
        name = inner_soup.find('div', class_='block block-core block-page-title-block').get_text(strip=True)
        desc_tag = inner_soup.find('div', id='company-body')
        desc = desc_tag.get_text(strip=True) if desc_tag else ""
        web_tag = inner_soup.find('div',
                                  class_='field field--name-field-company-website field--type-link field--label-hidden field__item')
        web = web_tag.find('a').get('href') if web_tag else ""

        # Addresses
        wrap = inner_soup.find("div", class_="company-mailing-address-wrap")
        first_address = wrap.find_all('div', class_=lambda s: s and "mailing" in s.split('-')) if wrap else []
        address = ", ".join([div.get_text(strip=True) for div in first_address])
        second_address = wrap.find_all('div', class_=lambda s: s and "alternate" in s.split('-')) if wrap else []
        alternate_address = ", ".join([div.get_text(strip=True) for div in second_address])

        # Phone
        phone_tag = inner_soup.find('div', class_='company-phone location')
        phone = phone_tag.find('div', class_='field__item').get_text(strip=True) if phone_tag else ""
        pho = {"Phone": phone}

        # Country & Region
        country_tag = inner_soup.find('div', class_='company-country location')
        country = country_tag.find('div', class_='field__item').get_text(strip=True) if country_tag else ""
        region_tag = inner_soup.find('div', class_='company-region location')
        region = region_tag.find('div', class_='field__item').get_text(strip=True) if region_tag else ""

        # --- Company Details Parsing ---
        details = {}
        for cell in inner_soup.select(".cell.small-12.medium-6, .cell.small-12.medium-12"):
            label = cell.find("div", class_="field__label")
            if not label:
                continue
            label_text = label.get_text(strip=True)

            values = []
            for v in cell.find_all(["div"], class_=["field__item", "field_item"]):
                values.append(v.get_text(strip=True))

            if len(values) == 1:
                details[label_text] = values[0]
            elif len(values) > 1:
                details[label_text] = values

        record = {
            "Name": name,
            "Description": desc,
            "Website": web,
            "Address": address,
            "Alternate Address": alternate_address,
            "Phone": phone,
            "Country": country,
            "Region": region
        }

        # Merge company details into main record
        record.update(details)
        record.update(pho)

        return record

    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None


def scrape_urls_parallel(urls, max_workers=10):
    """Scrape URLs in parallel with progress bar"""
    records = []
    failed_urls = []

    print(f"\nScraping {len(urls)} company pages with {max_workers} parallel workers...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(scrape_single_url, url): url for url in urls}

        # Process completed tasks with progress bar
        for future in tqdm(as_completed(future_to_url), total=len(urls), desc="Scraping companies", unit="company"):
            url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    records.append(result)
                else:
                    failed_urls.append(url)
            except Exception as e:
                print(f"Exception for {url}: {str(e)}")
                failed_urls.append(url)

    if failed_urls:
        print(f"\nWarning: Failed to scrape {len(failed_urls)} URLs")
        print("Failed URLs:")
        for url in failed_urls[:5]:  # Show first 5 failed URLs
            print(f"  - {url}")
        if len(failed_urls) > 5:
            print(f"  ... and {len(failed_urls) - 5} more")

    return records


def save_as_csv(data, filename="companies.csv"):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False, encoding="utf-8-sig")
    print(f"\nSaved {len(data)} company records to {filename}")


def find_main_urls(url):
    s = get_scraper()
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    type_urls = soup.find(
      "div",
      class_="field field--name-field-home-searches-heading field--type-string field--label-hidden field__item",
      string=lambda text: text and "Type" in text
    )
    main_urls = ["https://directory.ncbiotech.org"+a.get('href') for a in type_urls.find_next_sibling().find_all('a')]
    return main_urls


def main(max_workers=10):
    """Main function with configurable parallelism"""
    print("Starting NC Biotech Directory Scraper...")
    print("=" * 50)

    # Get main category URLs
    print("Fetching main category URLs...")
    main_urls = find_main_urls('https://directory.ncbiotech.org/')
    print(f"Found {len(main_urls)} main categories")

    # Collect all business URLs
    all_business_urls = []
    for i, url in enumerate(main_urls, 1):
        print(f"\nProcessing category {i}/{len(main_urls)}: {url}")
        business_urls = get_urls(url)
        all_business_urls.extend(business_urls)
        print(f"Found {len(business_urls)} companies in this category")

    print(f"\nTotal companies to scrape: {len(all_business_urls)}")

    # Scrape all URLs in parallel
    start_time = time.time()
    data = scrape_urls_parallel(all_business_urls, max_workers=max_workers)
    end_time = time.time()

    # Save results
    save_as_csv(data)

    print(f"\nScraping completed in {end_time - start_time:.2f} seconds")
    print(f"Successfully scraped {len(data)} out of {len(all_business_urls)} companies")
    print(f"Success rate: {len(data) / len(all_business_urls) * 100:.1f}%")


if __name__ == "__main__":
    # You can adjust max_workers based on your needs and server tolerance
    # Start with 10, increase if the server can handle it, decrease if you get errors
    main(max_workers=10)



Starting NC Biotech Directory Scraper...
Fetching main category URLs...
Found 2 main categories

Processing category 1/2: https://directory.ncbiotech.org/company-directory?f[0]=search_by_company_type:10226
Found 912 companies, fetching URLs across 19 pages...


Fetching company URLs: 100%|██████████| 19/19 [00:03<00:00,  4.86page/s]


Found 912 companies in this category

Processing category 2/2: https://directory.ncbiotech.org/company-directory?f[0]=search_by_company_type:10231
Found 2407 companies, fetching URLs across 49 pages...


Fetching company URLs: 100%|██████████| 49/49 [00:10<00:00,  4.50page/s]


Found 2307 companies in this category

Total companies to scrape: 3219

Scraping 3219 company pages with 10 parallel workers...


Scraping companies: 100%|██████████| 3219/3219 [03:55<00:00, 13.69company/s]


Saved 3219 company records to companies.csv

Scraping completed in 235.71 seconds
Successfully scraped 3219 out of 3219 companies
Success rate: 100.0%



