### Loading directory lists

In [1]:
import pandas as pd

# Read the saved web-search results (one candidate directory URL per row)
DIRECTORIES_CSV = 'zimbabwe_company_directories_web.csv'
directories = pd.read_csv(DIRECTORIES_CSV)
directories

Unnamed: 0,number,title,url,domain,source,query,country,search_date
0,1,Zimbabwe Business Directory 2025 | AFRIKTA,https://afrikta.com/listing-locations/zimbabwe/,afrikta.com,web_search,yellow pages zimbabwe,Zimbabwe,2/12/2025
1,2,Parco nazionale e riserva di Wrangell-St. Elias,https://antropocene.it/2021/10/23/parco-nazion...,antropocene.it,web_search,zimbabweypzimbabwe suppliers directory,Zimbabwe,2/12/2025
2,3,Trade With Vikas by Vikas Ola - appadvice.com,https://appadvice.com/app/trade-with-vikas/660...,appadvice.com,web_search,zimbabweypzimbabwe suppliers directory,Zimbabwe,2/12/2025
3,4,‎Trade With Vikas on the App Store,https://apps.apple.com/in/app/trade-with-vikas...,apps.apple.com,web_search,zimbabweypzimbabwe suppliers directory,Zimbabwe,2/12/2025
4,5,Google Chrome for Google Chrome - Download,https://chrome.en.softonic.com/,chrome.en.softonic.com,web_search,zimbabweypzimbabwe suppliers directory,Zimbabwe,2/12/2025
...,...,...,...,...,...,...,...,...
104,105,in company with”和“ in the company of”的区别是什么_百度...,https://zhidao.baidu.com/question/436616749019...,zhidao.baidu.com,web_search,company directories in zimbabwe,Zimbabwe,2/12/2025
105,106,"""company""怎么变复数？？？_百度知道",https://zhidao.baidu.com/question/501890641.html,zhidao.baidu.com,web_search,company directories in zimbabwe,Zimbabwe,2/12/2025
106,107,accompany companion company的区别_百度知道,https://zhidao.baidu.com/question/50954363.html,zhidao.baidu.com,web_search,company directories in zimbabwe,Zimbabwe,2/12/2025
107,108,江苏省企业全链通综合服务平台怎么填报 - 百度知道,https://zhidao.baidu.com/question/637813618701...,zhidao.baidu.com,web_search,business directory zimbabwe,Zimbabwe,2/12/2025


In [2]:
import re
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup

# Longer keywords and stems are matched anywhere in the page text.
_LISTING_KEYWORD_STEMS = (
    'company', 'business', 'directory', 'listings', 'enterprises',
    'corporat', 'industr', 'limited',
)
# Short legal-form abbreviations must match as whole words; as raw substrings
# they fire on ordinary words ('inc' in "since"/"include", 'pty' in "empty").
_LISTING_ABBREVIATION_RE = re.compile(r'\b(?:ltd|pty|inc|llc)\b')


def is_company_listing_page(soup, url):
    """
    Heuristically decide whether a parsed page looks like a list of companies.

    Four boolean indicators are evaluated:
      * the page contains at least one <table>;
      * the page contains more than two <ul>/<ol> elements;
      * the lower-cased page text mentions a company-related keyword;
      * the page contains more than ten <a href=...> links.

    Args:
        soup: Parsed page exposing ``get_text()`` and ``find_all()``
            (a BeautifulSoup object in this notebook).
        url: URL of the page. Currently unused; kept so existing callers
            keep working.

    Returns:
        bool: True when more than half of the indicators hold
        (i.e. at least three of the four), otherwise False.
    """
    text = soup.get_text().lower()

    has_keyword = (
        any(stem in text for stem in _LISTING_KEYWORD_STEMS)
        or _LISTING_ABBREVIATION_RE.search(text) is not None
    )

    # Check for common indicators of company listings
    indicators = {
        'table': len(soup.find_all('table')) > 0,
        'list_items': len(soup.find_all(['ul', 'ol'])) > 2,
        'keywords': has_keyword,
        'links': len(soup.find_all('a', href=True)) > 10,
    }

    # Fraction of indicators that fired; strictly more than half must hold.
    confidence = sum(indicators.values()) / len(indicators)
    return confidence > 0.5

def find_company_listings(url, max_depth=2, current_depth=0):
    """
    Recursively search for pages containing company listings.

    The page at ``url`` is fetched and checked with
    ``is_company_listing_page``. If it qualifies, it is returned on its own.
    Otherwise, links whose path contains a listing-like term are followed
    (at most five per page, with a one-second pause after each) until
    ``max_depth`` is reached.

    Args:
        url: Absolute URL of the page to inspect.
        max_depth: Deepest recursion level that is still fetched; pages at
            depths ``0..max_depth`` are requested.
        current_depth: Depth of ``url`` relative to the starting page.
            Callers normally leave this at 0.

    Returns:
        list[str]: Unique URLs judged to be listing pages, in discovery
        order. Empty when nothing was found or when fetching/parsing failed
        (failures are printed, not raised).
    """
    if current_depth > max_depth:
        return []

    skipped_prefixes = ('javascript:', 'mailto:', 'tel:', '#')
    path_terms = ('company', 'business', 'directory', 'listings', 'members', 'membership')

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        # Error pages (403/404/5xx) are still HTML with links and keywords and
        # would otherwise be misclassified as listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check if current page is a listing
        if is_company_listing_page(soup, url):
            return [url]

        # Children would exceed the depth limit and return [] immediately, so
        # skip link collection and the per-link sleep altogether.
        if current_depth >= max_depth:
            return []

        # dict used as an insertion-ordered set: duplicates of the same link
        # must not consume the per-page budget below.
        candidates = {}
        for link in soup.find_all('a', href=True):
            href = link['href'].strip()
            if not href or href.lower().startswith(skipped_prefixes):
                continue

            try:
                # Handle relative URLs
                full_url = urljoin(url, href)
                path = urlparse(full_url).path.lower()
            except ValueError:
                # A single malformed href should not abort the whole page.
                continue

            # Avoid self-links and non-HTTP URLs
            if full_url == url or not full_url.startswith(('http://', 'https://')):
                continue

            # Check if URL looks like it might contain listings
            if any(term in path for term in path_terms):
                candidates.setdefault(full_url, None)

        # Recursively check potential listing URLs
        found = {}
        for listing_url in list(candidates)[:5]:  # Limit to 5 to avoid too many requests
            # The recursive call reports its own errors and returns [], so no
            # extra (interrupt-swallowing) except clause is needed here.
            for listing in find_company_listings(listing_url, max_depth, current_depth + 1):
                found.setdefault(listing, None)
            time.sleep(1)  # Be nice to the server

        return list(found)  # De-duplicated, discovery order preserved

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        return []

def process_url(row):
    """
    Look up listing pages for one directory row.

    Reads ``row['url']``, runs ``find_company_listings`` on it and stores the
    outcome on the same row: ``listing_urls`` holds the found URLs joined
    with ' | ' (empty string when none), ``has_listings`` is the matching
    boolean flag. Any exception is printed and treated as "no listings".
    The (modified) row is returned.
    """
    target = row['url']
    print(f"Processing: {target}")

    try:
        found = find_company_listings(target)
        joined, flag = (' | '.join(found), True) if found else ('', False)
    except Exception as e:
        print(f"Error with {target}: {e}")
        joined, flag = '', False

    row['listing_urls'] = joined
    row['has_listings'] = flag
    return row

# Process URLs in parallel (adjust max_workers as needed)
def process_all_urls(df, max_workers=3):
    """
    Run ``process_url`` on every row of ``df`` using a thread pool.

    Args:
        df: DataFrame whose rows are passed one by one to ``process_url``.
        max_workers: Number of worker threads.

    Returns:
        pandas.DataFrame: One row per input row, in the same order as ``df``
        (not in completion order), built from the rows returned by
        ``process_url``. For an empty ``df`` the input columns are returned
        with empty ``listing_urls`` / ``has_listings`` columns added, so
        downstream summaries still work.

    Raises:
        Whatever ``process_url`` raised in a worker, re-raised when results
        are collected.
    """
    if len(df) == 0:
        return df.assign(listing_urls='', has_listings=False)

    total = len(df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_url, row) for _, row in df.iterrows()]
        # as_completed is used only for progress reporting.
        for done, _ in enumerate(as_completed(futures), start=1):
            print(f"Completed: {done}/{total}")

    # Collect in submission order so the output rows line up with the input,
    # regardless of which URL happened to finish first.
    results = [future.result() for future in futures]
    return pd.DataFrame(results)

# Crawl every directory URL (slow: one or more HTTP requests per row)
print("Starting URL processing...")
directories_processed = process_all_urls(directories)

# Persist the annotated table next to the notebook
directories_processed.to_csv('directories_with_listings.csv', index=False)
print("Processing complete. Results saved to 'directories_with_listings.csv'")

# Report how many directories were checked and how many yielded listings
total_processed = len(directories_processed)
with_listings = directories_processed['has_listings'].sum()
print("\nSummary of findings:")
print(f"Total directories processed: {total_processed}")
print(f"Directories with listings: {with_listings}")


Starting URL processing...
Processing: https://afrikta.com/listing-locations/zimbabwe/
Processing: https://antropocene.it/2021/10/23/parco-nazionale-e-riserva-di-wrangell-st-elias/
Processing: https://appadvice.com/app/trade-with-vikas/6608975480
Processing: https://apps.apple.com/in/app/trade-with-vikas/id6608975480Completed: 1/109

Processing: https://chrome.en.softonic.com/Completed: 2/109

Processing: https://chrome.en.softonic.com/download
Completed: 3/109
Processing: https://classplusapp.com/w/tradewithvikasCompleted: 4/109

Processing: https://eduniversal-ranking.com/gem-alpine-business-school-ranking.htmlCompleted: 5/109

Processing: https://en.wikipedia.org/wiki/Zimbabwe
Completed: 6/109
Processing: https://forum.wordreference.com/threads/company-is-are.2961868/Completed: 7/109

Processing: https://forum.wordreference.com/threads/in-at-the-company.215026/
Completed: 8/109
Processing: https://forum.wordreference.com/threads/m-s-followed-by-a-company-name.1606232/Completed: 9/10