### Identifying company directories

In [9]:
# ============================================
# Zimbabwe Business Directories & Google Places
# ============================================

# 1) Imports and configuration
import os
import time
from datetime import datetime
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

In [10]:
# Read API key from environment (set GOOGLE_MAPS_API_KEY as an env var if you want Places integration)
GOOGLE_PLACES_API_KEY = os.getenv("GOOGLE_MAPS_API_KEY") or os.getenv("GOOGLE_PLACES_API_KEY")

# Search phrases to discover business directories for Zimbabwe
WEB_SEARCH_QUERIES = [
    "company directories in zimbabwe",
    "business directory zimbabwe",
    "companies directory zimbabwe",
    "zimbabwe business listings",
    "yellow pages zimbabwe",
    "zimbabwe suppliers directory",
    "zimbabwe company database",
]

MAX_RESULTS_PER_QUERY = 40  # per query for web search
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
REQ_HEADERS = {"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"}

In [11]:

# 2) URL helpers
def normalize_url(url: str) -> str:
    """
    Normalize URL for deduplication:
    - lower-case host and path
    - remove trailing slash
    - drop fragments and query
    """
    try:
        p = urlparse(url)
        host = (p.netloc or "").lower()
        path = (p.path or "").rstrip("/").lower()
        return f"{host}{path}"
    except Exception:
        return url.strip().lower()


def get_domain(url: str) -> str:
    """Extract domain from URL."""
    try:
        d = urlparse(url).netloc.lower()
        return d.replace("www.", "")
    except Exception:
        return ""


def fetch_page_title(url: str, timeout: int = 8) -> str:
    """Best-effort fetch of the page title (kept optional to avoid slowing things down)."""
    try:
        r = requests.get(url, headers=REQ_HEADERS, timeout=timeout, allow_redirects=True)
        if r.status_code >= 400:
            return ""
        soup = BeautifulSoup(r.text, "html.parser")
        title = soup.title.string.strip() if soup.title and soup.title.string else ""
        return title
    except Exception:
        return ""

In [12]:
# 3) Web search (DuckDuckGo) for business directories
def ddg_search_directories(queries, max_results_per_query=40, fetch_titles=False):
    """
    Run multiple web searches and collect URLs.
    Uses duckduckgo_search for reliability (no API key required).
    """
    rows = []
    with DDGS(timeout=10) as ddgs:
        for q in queries:
            rank = 0
            for item in ddgs.text(q, max_results=max_results_per_query, region="wt-wt"):
                # item fields: title, href, body
                url = item.get("href")
                title = item.get("title") or ""
                if not url:
                    continue
                rank += 1
                rows.append(
                    {
                        "query": q,
                        "source": "web_search",
                        "rank": rank,
                        "title_raw": title,
                        "url": url,
                        "url_key": normalize_url(url),
                        "domain": get_domain(url),
                    }
                )

    # Deduplicate by normalized URL
    # Keep the best-ranked (lowest rank) result per url_key
    rows.sort(key=lambda r: (r["url_key"], r["rank"]))
    unique = []
    seen = set()
    for r in rows:
        if r["url_key"] not in seen:
            seen.add(r["url_key"])
            unique.append(r)

    df = pd.DataFrame(unique)
    if df.empty:
        return df

    # Optionally enrich with fetched titles (can be slow)
    if fetch_titles:
        titles = []
        for u in df["url"]:
            titles.append(fetch_page_title(u))
            time.sleep(0.2)  # gentle rate limit
        df["title"] = [t if t else df.loc[i, "title_raw"] for i, t in enumerate(titles)]
    else:
        df["title"] = df["title_raw"].where(df["title_raw"].str.len() > 0, other=df["url"])

    # Final columns
    df["country"] = "Zimbabwe"
    df["search_date"] = datetime.now().strftime("%Y-%m-%d")
    df.insert(0, "number", range(1, len(df) + 1))
    return df[["number", "title", "url", "domain", "source", "query", "country", "search_date"]]

In [13]:
# 4) Google Places: text search + details (optional; needs API key)
PLACES_TEXTSEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
PLACES_DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"


def google_places_text_search(api_key: str, query: str, max_pages: int = 3, sleep_between_pages: float = 2.5):
    """
    Text Search for places like 'business directory Zimbabwe'.
    Returns list of results (dicts from the Places API).
    """
    results = []
    params = {
        "query": query,
        "language": "en",
        "key": api_key,
    }
    pagetoken = None
    pages = 0

    while pages < max_pages:
        if pagetoken:
            params = {"pagetoken": pagetoken, "key": api_key}
            # next_page_token takes a short while to become valid
            time.sleep(sleep_between_pages)

        resp = requests.get(PLACES_TEXTSEARCH_URL, params=params, headers=REQ_HEADERS, timeout=10)
        data = resp.json()
        status = data.get("status")

        if status not in ("OK", "ZERO_RESULTS"):
            # Common statuses: OVER_QUERY_LIMIT, REQUEST_DENIED, INVALID_REQUEST
            break

        results.extend(data.get("results", []))
        pagetoken = data.get("next_page_token")
        pages += 1
        if not pagetoken:
            break

    return results


def google_place_details(api_key: str, place_id: str):
    """
    Fetch website and other details for a Place.
    """
    params = {
        "place_id": place_id,
        "fields": "name,website,formatted_address,types,rating,user_ratings_total",
        "language": "en",
        "key": api_key,
    }
    try:
        resp = requests.get(PLACES_DETAILS_URL, params=params, headers=REQ_HEADERS, timeout=10)
        data = resp.json()
        if data.get("status") == "OK":
            return data.get("result", {})
        return {}
    except Exception:
        return {}


def gather_google_places_directories(api_key: str):
    """
    Query Places for terms that likely surface business directory organizations in Zimbabwe,
    then fetch details to extract websites.
    """
    place_queries = [
        "business directory Zimbabwe",
        "company directory Zimbabwe",
        "yellow pages Zimbabwe",
        "business listings Zimbabwe",
    ]

    # Text search
    text_results = []
    for q in place_queries:
        text_results.extend(google_places_text_search(api_key, q, max_pages=3, sleep_between_pages=2.5))

    if not text_results:
        return pd.DataFrame()

    # Dedup by place_id
    uniq = {}
    for r in text_results:
        pid = r.get("place_id")
        if not pid:
            continue
        # Keep highest user_ratings_total
        curr = uniq.get(pid)
        if not curr or (r.get("user_ratings_total", 0) > curr.get("user_ratings_total", 0)):
            uniq[pid] = r

    # Fetch details (website) for the top N results (limit to avoid quota blow-up)
    top = list(uniq.values())[:40]
    rows = []
    for idx, r in enumerate(top, start=1):
        pid = r.get("place_id", "")
        details = google_place_details(api_key, pid) if pid else {}
        website = details.get("website", "")
        rows.append(
            {
                "place_id": pid,
                "name": r.get("name", ""),
                "formatted_address": r.get("formatted_address", ""),
                "types": ",".join(r.get("types", [])),
                "rating": r.get("rating", None),
                "user_ratings_total": r.get("user_ratings_total", None),
                "website": website,
                "domain": get_domain(website) if website else "",
            }
        )
        time.sleep(0.2)  # gentle rate limit

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    df["source"] = "google_places"
    df["country"] = "Zimbabwe"
    df["search_date"] = datetime.now().strftime("%Y-%m-%d")
    df.insert(0, "number", range(1, len(df) + 1))
    # Harmonize columns with the web search where possible
    df.rename(columns={"name": "title", "website": "url"}, inplace=True)
    return df[["number", "title", "url", "domain", "source", "formatted_address", "rating", "user_ratings_total", "country", "search_date"]]

In [14]:
# 5) Run everything end-to-end
def run_all(fetch_titles=False, save_csv=True):
    # A) Web search for business directories
    print("Searching the web for Zimbabwe business directories ...")
    df_web = ddg_search_directories(WEB_SEARCH_QUERIES, MAX_RESULTS_PER_QUERY, fetch_titles=fetch_titles)
    print(f"Web directories collected: {len(df_web)}")

    # B) Google Places (optional)
    if GOOGLE_PLACES_API_KEY:
        print("Querying Google Places (using your API key) ...")
        df_places = gather_google_places_directories(GOOGLE_PLACES_API_KEY)
        print(f"Places results collected: {len(df_places)}")
    else:
        print("No Google Places API key found in environment. Skipping Places step.")
        df_places = pd.DataFrame()

    # C) Save results
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    if save_csv:
        if not df_web.empty:
            df_web.to_csv(f"zimbabwe_company_directories_web_{ts}.csv", index=False)
        if not df_places.empty:
            df_places.to_csv(f"zimbabwe_company_directories_places_{ts}.csv", index=False)

        if not df_web.empty or not df_places.empty:
            print("Saved CSVs with timestamp", ts)

    # D) Display previews
    if not df_web.empty:
        display(df_web.head(20))
    if not df_places.empty:
        display(df_places.head(20))

    # E) Combined (if both exist)
    if not df_web.empty and not df_places.empty:
        common_cols = ["number", "title", "url", "domain", "source", "country", "search_date"]
        df_web2 = df_web[common_cols].copy()
        df_places2 = df_places[common_cols].copy()
        df_combined = pd.concat([df_web2.assign(kind="web_directory"), df_places2.assign(kind="place")], ignore_index=True)
        display(df_combined.head(20))
        if save_csv:
            df_combined.to_csv(f"zimbabwe_company_directories_combined_{ts}.csv", index=False)
        return df_web, df_places, df_combined

    return df_web, df_places, None


# Execute
df_web, df_places, df_all = run_all(fetch_titles=False, save_csv=True)

Searching the web for Zimbabwe business directories ...


  with DDGS(timeout=10) as ddgs:


Web directories collected: 50
No Google Places API key found in environment. Skipping Places step.
Saved CSVs with timestamp 20251202_120820


Unnamed: 0,number,title,url,domain,source,query,country,search_date
0,1,Zimbabwe Business Directory 2025 | AFRIKTA,https://afrikta.com/listing-locations/zimbabwe/,afrikta.com,web_search,yellow pages zimbabwe,Zimbabwe,2025-12-02
1,2,causes of employment syndrome in zimbabwe and ...,https://brainly.in/question/2489004,brainly.in,web_search,zimbabwe business listings,Zimbabwe,2025-12-02
2,3,Why Zimbabwe not considered a democratic count...,https://brainly.in/question/3130822,brainly.in,web_search,zimbabwe business listings,Zimbabwe,2025-12-02
3,4,"explain how the disciplines of Linguistic, Ant...",https://brainly.in/question/56239706,brainly.in,web_search,zimbabwe business listings,Zimbabwe,2025-12-02
4,5,describe 4 common types of roofs in Zimbabwe u...,https://brainly.in/question/61821509,brainly.in,web_search,zimbabwe business listings,Zimbabwe,2025-12-02
5,6,company is/are - WordReference Forums,https://forum.wordreference.com/threads/compan...,forum.wordreference.com,web_search,company directories in zimbabwe,Zimbabwe,2025-12-02
6,7,company who or company which? - WordReference ...,https://forum.wordreference.com/threads/compan...,forum.wordreference.com,web_search,company directories in zimbabwe,Zimbabwe,2025-12-02
7,8,In/at the company - WordReference Forums,https://forum.wordreference.com/threads/in-at-...,forum.wordreference.com,web_search,company directories in zimbabwe,Zimbabwe,2025-12-02
8,9,M/S followed by a Company Name? - WordReferenc...,https://forum.wordreference.com/threads/m-s-fo...,forum.wordreference.com,web_search,company directories in zimbabwe,Zimbabwe,2025-12-02
9,10,"""Who is *Company Name*"" or ""Who are ""Company N...",https://forum.wordreference.com/threads/who-is...,forum.wordreference.com,web_search,company directories in zimbabwe,Zimbabwe,2025-12-02
