In [34]:
import pandas as pd

import time
import re
import csv
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, WebDriverException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urljoin, urlparse, urlunparse
from typing import Dict, List, Tuple, Set, Optional

In [35]:

# ---------------------------
# Configuration
# ---------------------------
# Whether to run the browser without a visible window.
# NOTE(review): presumably forwarded to build_driver(headless=...) — confirm at the call site.
HEADLESS = True

# Tighten timeouts to reduce long stalls (you can tune these)
PAGE_LOAD_TIMEOUT = 14            # seconds; default page-load timeout applied by build_driver()/safe_get() (was 25)
IMPLICIT_WAIT = 0                 # seconds; 0 turns implicit waits off — explicit waits are used instead
WAIT_AFTER_NAV = 0.25             # seconds; was 2.0; rely on WebDriverWait

# Hard per-site budget (seconds) to cap work; prevents pathological sites
SITE_TIME_BUDGET = 180            # also the fallback value returned by site_time_remaining()
RESTART_DRIVER_EVERY = 10        # restart webdriver after this many sites

# --- Domain blacklist & SIDEARM detection ---
# Registrable domains that must never be crawled (subdomains included).
DOMAIN_BLACKLIST = {
    "uwbadgers.com",
    "sidearmsports.com",
    "bigtenplus.com",
}

def is_blacklisted(url: str) -> bool:
    """Return True when ``url`` points at a blacklisted domain or one of its subdomains.

    The host is compared case-insensitively, without port or userinfo.
    A URL without a scheme (``example.com/path``) is treated as host-first.

    Args:
        url: Absolute or scheme-less URL.

    Returns:
        True only for an exact domain match or a dot-separated subdomain match;
        ``notuwbadgers.com`` is *not* considered part of ``uwbadgers.com``.
        Empty or unparsable input returns False.
    """
    from urllib.parse import urlparse

    if not url:
        return False
    candidate = url.strip()
    try:
        # Without "//" urlparse would put a bare host into .path, not .netloc.
        parsed = urlparse(candidate if "://" in candidate else "//" + candidate)
        host = (parsed.hostname or "").lower().rstrip(".")
    except ValueError:
        # Raised by urllib for malformed netlocs (e.g. broken IPv6 literals).
        return False
    if not host:
        return False
    return any(host == d or host.endswith("." + d) for d in DOMAIN_BLACKLIST)

def looks_like_sidearm(soup: BeautifulSoup) -> bool:
    """Heuristically detect a SIDEARM-style page from its markup and visible text.

    Returns False for a missing soup; otherwise True when any known marker
    string occurs in the lowercased HTML or the lowercased visible text.
    """
    if not soup:
        return False

    markup = str(soup).lower()
    visible_text = soup.get_text(separator=" ").lower()

    # Markers searched in the raw (lowercased) markup.
    markup_markers = (
        'data-sidearm-app',
        'sidearm-icons.svg',
        'sidearmstats',
        'images.sidearmdev.com',
        'swiper ',
    )
    for marker in markup_markers:
        if marker in markup:
            return True

    # Marker searched in the rendered text only.
    return 'pause all rotators' in visible_text

# Keywords passed by find_about_link() to the link scorer when looking for an "about" page.
ABOUT_KEYWORDS = [
    "about", "about us", "our story", "who we are", "company", "mission", "vision"
]

# Leadership keywords, split into more specific ("strong") and more generic ("weak") terms.
STRONG_LEADERSHIP_KEYWORDS = [
    "leadership", "executive", "executive-team", "management", "management-team",
    "board", "board-of-directors", "principals"
]
WEAK_LEADERSHIP_KEYWORDS = [
    "our team", "team", "people", "our people", "staff"
]
# Strong terms first, then weak; find_leadership_sections() looks for these in heading text.
LEADERSHIP_KEYWORDS = STRONG_LEADERSHIP_KEYWORDS + WEAK_LEADERSHIP_KEYWORDS

# Job-title vocabulary consulted by looks_like_title().
COMMON_TITLES = {
    "ceo", "chief executive officer", "president", "chairman", "coo", "cfo",
    "vice president", "vp", "senior vice president", "svp", "executive vice president",
    "evp", "director", "managing director", "partner", "principal", "founder",
    "co-founder", "chair", "board", "board chair", "board of directors", "advisor",
    "general manager", "gm", "operations", "finance", "marketing", "strategy",
    "people officer", "hr", "talent", "safety", "innovation"
}

# Tokens that is_leadership_title() looks for in a lowercased title.
LEADERSHIP_TITLE_TOKENS = {
    "chief", "ceo", "coo", "cfo", "cto", "cio", "president", "chair", "chairman",
    "vp", "vice president", "svp", "evp", "executive", "director", "principal",
    "partner", "board"
}

# Navigation labels; looks_like_name() rejects text that equals one of these.
MENU_WORDS = {
    "news", "insights", "contact", "careers", "locations",
    "services", "projects", "markets", "company", "home", "blog", "resources"
}

# Section headings that are not person names; looks_like_name() rejects exact matches
# (also after stripping surrounding colons).
NON_NAME_HEADINGS = {
    "our history", "history", "mission", "vision", "philosophy", "values",
    "community", "locations", "services", "projects", "insights"
}

# Path fragments that mark content areas rather than company pages. Used as a score
# penalty in _score_candidate_link() and as the default exclusion list in
# find_page_link_candidates().
BAD_CONTEXT_PATHS = {
    "/projects", "/project", "/insights", "/blog", "/resources", "/news", "/case-studies", "/events"
}

# Exact paths find_about_link() accepts immediately for a same-domain anchor.
PREFERRED_ABOUT_PATHS = {
    "/about", "/about-us", "/company", "/who-we-are", "/our-story", "/history", "/about-us/"
}

# Directory-style people pages; an exact path match earns a bonus in find_page_link_candidates().
PEOPLE_DIR_SLUGS = {
    "/people", "/our-people", "/eua-people", "/team", "/our-team", "/staff"
}

# Words that indicate UI/navigation text; looks_like_name() rejects text containing them.
NON_NAME_TERMS = {
    "services","solutions","tools","machining","casting","alloy","selector",
    "comparison","shop","request","quote","careers","news","insights","resources",
    "projects","contact","privacy","terms"
}

# Limits to speed up heavy steps
# NOTE(review): neither limit is referenced in the visible part of the file;
# presumably consumed by carousel / profile-enrichment code further down — confirm.
MAX_CAROUSEL_STEPS = 5
MAX_PROFILES_TO_ENRICH = 8        # baseline; will adapt if needed

# ---------------------------
# News configuration
# ---------------------------

# Keywords choose_news_page() passes to the link scorer to find a news/press page.
NEWS_KEYWORDS = [
    "news", "press", "press-releases", "media",
    "company-news", "news-events", "announcements", "insights"
]

# Cap how much we crawl/read
MAX_NEWS_ARTICLES = 5           # first page only
MAX_ARTICLE_BODY_CHARS = 2000   # per article body

# ---------------------------
# Imports
# ---------------------------

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time, re, json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# ---------------------------
# Utilities
# ---------------------------

def ensure_list(data):
    """Return ``data`` itself when it is a list; otherwise wrap it in a one-element list."""
    if isinstance(data, list):
        return data
    return [data]

def normalize_url(url: str) -> str:
    """Return ``url`` trimmed and guaranteed to carry an http(s) scheme.

    - An existing ``http://`` / ``https://`` prefix is recognised case-insensitively
      and left untouched.
    - A protocol-relative URL (``//host/path``) gets ``https:`` prepended.
    - Anything else gets ``https://`` prepended.
    - ``None``, empty or whitespace-only input yields ``""`` rather than the
      meaningless ``"https://"``.
    """
    if not url:
        return ""
    url = url.strip()
    if not url:
        return ""
    if url.startswith("//"):
        return "https:" + url
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url

def clean_text(text: str) -> str:
    """Collapse every whitespace run into one space and trim both ends; falsy input gives ""."""
    if not text:
        return ""
    return re.sub(r"\s+", " ", text).strip()

def normalize_field(val) -> str:
    """Flatten a loosely-typed value (e.g. from JSON-LD) into one clean string.

    - ``None`` -> ``""``.
    - list -> each non-None member is normalized recursively and the non-empty
      results are joined with spaces, so nested dicts/lists inside a list are
      reduced to their meaningful value instead of a Python repr.
    - dict -> the first truthy value among ``name``, ``jobTitle``, ``url``, ``@id``
      (normalized recursively); if none is present, the dict's string form.
    - anything else -> ``clean_text(str(val))``.
    """
    if val is None:
        return ""
    if isinstance(val, list):
        parts = (normalize_field(x) for x in val if x is not None)
        return clean_text(" ".join(p for p in parts if p))
    if isinstance(val, dict):
        for k in ("name", "jobTitle", "url", "@id"):
            if k in val and val[k]:
                return normalize_field(val[k])
        return clean_text(str(val))
    return clean_text(str(val))

# ---------------------------
# Driver helpers (Optimized)
# ---------------------------

# URL patterns for heavy image formats (blocked by default, re-enabled on demand).
_IMAGE_BLOCK_PATTERNS = ["*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.svg"]

# Tracker / ad / stats hosts that stay blocked regardless of the image setting.
_TRACKER_BLOCK_PATTERNS = [
    "*googletagmanager.com*", "*doubleclick.net*", "*gpt*",
    "*sidearmstats*", "*images.sidearmdev.com*", "*cloudfront.net*/*sidearm*",
    "*statbroadcast.com*", "*foxsports.com*", "*thevarsitynetwork.com*"
]

def build_driver(headless: bool = True):
    """Create a Chrome WebDriver tuned for fast scraping.

    Uses the "eager" page-load strategy, applies PAGE_LOAD_TIMEOUT and
    IMPLICIT_WAIT, and asks Chrome (via CDP) to block image and tracker URLs.
    The CDP step is best-effort: a failure is reported and the driver is
    still returned.

    Args:
        headless: When True, start Chrome with ``--headless=new``.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_opts = Options()
    if headless:
        chrome_opts.add_argument("--headless=new")

    # SPEED: stop waiting for all subresources (DOMContentLoaded eager)
    chrome_opts.page_load_strategy = "eager"

    chrome_opts.add_argument("--disable-gpu")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--window-size=1400,1000")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_argument("--lang=en-US")
    chrome_opts.add_argument("--disable-extensions")
    chrome_opts.add_argument("--remote-allow-origins=*")
    chrome_opts.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    )

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    driver.implicitly_wait(IMPLICIT_WAIT)

    # Default: block heavy image formats and trackers using CDP
    # (images can be re-enabled later with set_image_loading).
    try:
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd(
            "Network.setBlockedURLs",
            {"urls": _IMAGE_BLOCK_PATTERNS + _TRACKER_BLOCK_PATTERNS},
        )
    except Exception as exc:
        # Best-effort optimisation only; scraping still works without it.
        print(f"[build_driver] Could not install URL blocking via CDP: {exc}")

    return driver

def set_image_loading(driver, enabled: bool):
    """Toggle image loading at runtime using CDP.

    ``Network.setBlockedURLs`` replaces the whole block list on every call, so
    the tracker patterns are always re-sent; only the image patterns are
    added or left out depending on ``enabled``.

    Args:
        driver: WebDriver exposing ``execute_cdp_cmd``.
        enabled: True to allow images, False to block them.
    """
    blocked = list(_TRACKER_BLOCK_PATTERNS)
    if not enabled:
        blocked = _IMAGE_BLOCK_PATTERNS + blocked
    try:
        driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": blocked})
    except Exception as exc:
        # Best-effort: keep going with whatever block list is currently active.
        print(f"[set_image_loading] Could not update blocked URLs: {exc}")

def safe_get(driver, url: str, timeout: int = None) -> bool:
    """Load ``url`` under a page-load timeout and report whether it succeeded.

    ``timeout`` falls back to PAGE_LOAD_TIMEOUT when omitted. A timeout or any
    other WebDriver error is printed and reported as False instead of raised.
    """
    try:
        effective_timeout = PAGE_LOAD_TIMEOUT if timeout is None else timeout
        driver.set_page_load_timeout(effective_timeout)
        driver.get(url)
    except TimeoutException:
        print(f"[safe_get] Timeout loading {url}")
        return False
    except WebDriverException as e:
        print(f"[safe_get] WebDriverException loading {url}: {e}")
        return False
    else:
        return True

def wait_for_body(driver):
    """Wait up to 8 seconds for a <body> element to be present; a timeout is ignored."""
    body_locator = (By.TAG_NAME, "body")
    try:
        waiter = WebDriverWait(driver, 8)
        waiter.until(EC.presence_of_element_located(body_locator))
    except TimeoutException:
        return

def wait_for_content(driver, selectors: list[str], timeout: int = 8):
    """Poll every 0.2 s until any CSS selector matches at least one element.

    Returns True on the first match and False once ``timeout`` seconds have
    elapsed. WebDriver errors during a poll are ignored and polling continues.
    """
    deadline = time.time() + timeout
    while True:
        if not (time.time() < deadline):
            return False
        try:
            if any(driver.find_elements(By.CSS_SELECTOR, css) for css in selectors):
                return True
        except WebDriverException:
            pass
        time.sleep(0.2)

def prime_page_for_extraction(driver, cycles: int = 1, pause: float = 0.4):
    """Scroll partway down and back to the top to trigger lazy-loaded content.

    Args:
        driver: WebDriver exposing ``execute_script``.
        cycles: Number of down/up scroll rounds. This argument was previously
            accepted but ignored; it is now honoured. The default of 1 keeps
            the original behaviour and timing.
        pause: Seconds to wait after scrolling down (and between rounds).

    WebDriver errors are swallowed: priming is a best-effort step.
    """
    try:
        for round_no in range(cycles):
            if round_no:
                # Let the page settle at the top before the next round.
                time.sleep(pause)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
            time.sleep(pause)
            driver.execute_script("window.scrollTo(0, 0);")
    except WebDriverException:
        pass

def deep_lazy_scroll(driver, cycles: int = 2, pause: float = 0.5):
    """Scroll to the bottom and back to the top ``cycles`` times, pausing after each move.

    Stops as soon as the per-site time budget is exhausted; WebDriver errors
    end the scrolling silently.
    """
    scroll_moves = (
        "window.scrollTo(0, document.body.scrollHeight);",
        "window.scrollTo(0, 0);",
    )
    try:
        for _ in range(cycles):
            for js in scroll_moves:
                # Budget is re-checked before every individual scroll.
                if site_time_remaining() <= 0:
                    return
                driver.execute_script(js)
                time.sleep(pause)
    except WebDriverException:
        pass

def dismiss_cookie_banners(driver):
    """JS-click up to two elements for each of the first three cookie-banner selectors.

    A WebDriver error on one selector moves on to the next selector.
    """
    selectors = [
        "[id*='cookie'] button", "[class*='cookie'] button",
        "[aria-label*='Accept']", "button[onclick*='accept']",
        "button[title*='Accept']"
    ]
    active_selectors = selectors[:3]  # SPEED: check fewer patterns
    for css in active_selectors:
        try:
            matches = driver.find_elements(By.CSS_SELECTOR, css)
            for button in matches[:2]:
                driver.execute_script("arguments[0].click();", button)
                time.sleep(0.2)
        except WebDriverException:
            continue

def get_soup(driver) -> BeautifulSoup:
    """Parse the driver's current page source with the lxml parser."""
    html = driver.page_source
    return BeautifulSoup(html, "lxml")

def get_main_container(soup: BeautifulSoup) -> BeautifulSoup:
    """Return the first main-content element holding more than 50 characters of text.

    Falls back to ``soup`` itself when no candidate qualifies.
    """
    main_selector = "main, [role='main'], article, .page-content, #content, #main, .content"
    substantial = (
        node for node in soup.select(main_selector)
        if node and len(clean_text(node.get_text())) > 50
    )
    return next(substantial, soup)

def is_in_excluded_area(el: BeautifulSoup) -> bool:
    """Tell whether ``el`` or any of its ancestors is page chrome.

    Page chrome means a header/nav/footer/aside tag, or an element carrying one
    of the known header/footer/menu CSS classes.
    """
    chrome_tags = {"header", "nav", "footer", "aside"}
    chrome_classes = {"site-header", "global-nav", "site-footer", "footer", "navbar", "menu"}

    node = el
    while node and hasattr(node, "parent"):
        if node.name in chrome_tags:
            return True
        node_classes = node.get("class", []) or []
        if not chrome_classes.isdisjoint(node_classes):
            return True
        node = node.parent
    return False

# ---------------------------
# Leadership UI expansion (NEW)
# ---------------------------

def _query_contains(driver, selector: str, label: str):
    """
    Stand-in for a CSS ``:contains('text')`` filter.
    Returns the elements matching ``selector`` whose lowercased text includes
    ``label`` (which must already be lowercase). Elements whose text cannot be
    read because of a WebDriver error are skipped.
    """
    matches = []
    for candidate in driver.find_elements(By.CSS_SELECTOR, selector):
        try:
            if label in (candidate.text or "").lower():
                matches.append(candidate)
        except WebDriverException:
            continue
    return matches

def expand_accordions(driver):
    """JS-click up to six accordion/collapse triggers so hidden bios and names render.

    Stops early when the per-site time budget is used up; a trigger that raises
    a WebDriver error is skipped.
    """
    trigger_css = (
        ".accordion [aria-expanded='false'], .accordion button, .collapse-toggle, .accordion-header, .faq-item button"
    )
    max_clicks = 6
    done = 0
    for trigger in driver.find_elements(By.CSS_SELECTOR, trigger_css):
        if site_time_remaining() <= 0 or done >= max_clicks:
            break
        try:
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", trigger)
            time.sleep(0.2)
            driver.execute_script("arguments[0].click();", trigger)
            done += 1
            time.sleep(0.3)
        except WebDriverException:
            continue

def _click_each(driver, elements, budget: int, settle: float, seen_ids=None) -> int:
    """Scroll to and JS-click ``elements`` one by one; return how many clicks were made.

    Stops when ``budget`` clicks were made or the per-site time budget is used
    up. When ``seen_ids`` (a set) is given, an element whose WebElement id is
    already in it is skipped, and clicked ids are recorded in it. A WebDriver
    error on one element (e.g. it went stale) only skips that element.
    """
    made = 0
    for el in elements:
        if site_time_remaining() <= 0 or made >= budget:
            break
        try:
            if seen_ids is not None:
                el_id = el.id
                if el_id in seen_ids:
                    continue
                seen_ids.add(el_id)
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
            time.sleep(0.2)
            driver.execute_script("arguments[0].click();", el)
            made += 1
            time.sleep(settle)
        except WebDriverException:
            continue
    return made

def expand_leadership_ui(driver):
    """
    Expand common UI controls on leadership/team pages to reveal all people:
    - Tabs/filters (team, leadership, board)
    - Load more / view all buttons
    - Accordions
    - Basic pagination (click "next" once or twice)

    At most 8 tab/load-more clicks are made in total, and every phase stops
    once the per-site time budget is exhausted. Tab selectors overlap
    (``[role='tab']`` also covers ``.tabs [role='tab']``), so each tab element
    is clicked only once; "load more"-style controls are deliberately *not*
    de-duplicated because clicking them again can load a further batch.
    """
    # Tabs/filters
    tab_selectors = [
        "[role='tab']",
        ".tabs [role='tab']",
        ".filter [role='tab']",
        "[class*='tab'][class*='team']",
        "[data-filter*='leadership']",
        "[data-filter*='executive']",
        "[data-filter*='board']",
        "[data-filter*='team']",
        ".team-tabs button, .filters button"
    ]
    # Load more / view all
    load_selectors = [
        "button.load-more, a.load-more, .load-more button",
        "button.view-all, a.view-all, .view-all button",
        "button.show-more, a.show-more, .show-more button",
    ]
    # Label-based controls (using contains-emulation)
    label_controls = [
        ("button", "load more"),
        ("a", "load more"),
        ("button", "view all"),
        ("a", "view all"),
        ("button", "show more"),
        ("a", "show more"),
    ]

    # bail early if site budget exhausted
    if site_time_remaining() <= 0:
        return
    MAX_CLICKS = 8
    clicked = 0

    def _find(css: str):
        # A failing lookup for one selector must not stop the other selectors.
        try:
            return driver.find_elements(By.CSS_SELECTOR, css)
        except WebDriverException:
            return []

    def _budget_left() -> bool:
        return site_time_remaining() > 0 and clicked < MAX_CLICKS

    # Try tab/filter selectors (each tab element at most once)
    clicked_tab_ids = set()
    for sel in tab_selectors:
        if not _budget_left():
            break
        clicked += _click_each(driver, _find(sel), MAX_CLICKS - clicked, 0.4, clicked_tab_ids)

    # Try load-more/selectors
    for sel in load_selectors:
        if not _budget_left():
            break
        clicked += _click_each(driver, _find(sel), MAX_CLICKS - clicked, 0.5)

    # Try label-based contains controls
    for base_sel, label in label_controls:
        if not _budget_left():
            break
        try:
            candidates = _query_contains(driver, base_sel, label)
        except WebDriverException:
            continue
        clicked += _click_each(driver, candidates, MAX_CLICKS - clicked, 0.5)

    # Expand accordions
    expand_accordions(driver)

    # Basic pagination (click "next" 1–2 times)
    for _ in range(2):
        if site_time_remaining() <= 0:
            break
        try:
            next_candidates = driver.find_elements(By.CSS_SELECTOR, ".pagination a.next, a[rel='next'], a.next, .page-nav .next")
            if not next_candidates:
                break
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", next_candidates[0])
            time.sleep(0.2)
            driver.execute_script("arguments[0].click();", next_candidates[0])
            wait_for_body(driver)
            time.sleep(0.5)
        except WebDriverException:
            break

# ---------------------------
# Link discovery (minor speed tweaks)
# ---------------------------

def _path_segments(path: str) -> list[str]:
    segs = [s for s in re.split(r"[/_\-]+", path) if s]
    return [s.lower() for s in segs]

def _matches_segments(path: str, keywords: list[str]) -> bool:
    segs = _path_segments(path)
    kws = [k.lower().strip() for k in keywords]
    return any(k in segs for k in kws)

def _score_candidate_link(haystack_text: str, href_path: str, keywords: list[str], strong: bool) -> float:
    """Score one link for relevance to ``keywords``.

    +1.0 when a keyword appears in the link text, +2.5 when a keyword matches a
    path segment, -1.5 for every BAD_CONTEXT_PATHS entry contained in the path,
    and +0.3 when ``strong`` is set.
    """
    text_hit = any(k in haystack_text for k in keywords)
    score = 1.0 if text_hit else 0.0

    if _matches_segments(href_path, keywords):
        score += 2.5

    bad_hits = sum(1 for bad in BAD_CONTEXT_PATHS if bad in href_path)
    score -= 1.5 * bad_hits

    if strong:
        score += 0.3
    return score

def find_page_link_candidates(driver, keywords, strong: bool = False, exclude_paths=None) -> list[tuple[str,float]]:
    """Rank same-domain links on the current page by relevance to ``keywords``.

    Args:
        driver: WebDriver positioned on the page to scan.
        keywords: Keywords handed to ``_score_candidate_link``.
        strong: Forwarded to the scorer (adds a flat bonus).
        exclude_paths: Path fragments that disqualify a link; defaults to
            BAD_CONTEXT_PATHS.

    Returns:
        Up to 8 ``(absolute_url, score)`` tuples, best score first, each URL
        appearing once with its highest score. De-duplication happens *before*
        the cap, so a link repeated in nav, footer and mobile menu occupies only
        one slot.

    NOTE(review): links that match no keyword are still returned (with a low
    score) when fewer than 8 better candidates exist — callers that take the
    first result should be aware of that.
    """
    exclude_paths = exclude_paths if exclude_paths is not None else BAD_CONTEXT_PATHS
    anchors = driver.find_elements(By.TAG_NAME, "a")
    base = driver.current_url
    base_domain = urlparse(base).netloc

    best_score = {}  # url -> highest score seen; insertion order = DOM order
    for a in anchors:
        try:
            href = a.get_attribute("href")
            if not href:
                continue
            href_abs = urljoin(base, href)
            parsed = urlparse(href_abs)
            href_path = (parsed.path or "").lower()

            # Cheap rejections first: home page, in-page anchors, other domains, excluded areas.
            if (not href_path) or href_path == "/" or href.startswith("#") or parsed.fragment:
                continue
            if parsed.netloc != base_domain:
                continue
            if any(b in href_path for b in exclude_paths):
                continue

            haystack = " ".join([
                (a.text or ""),
                (a.get_attribute("aria-label") or ""),
                (a.get_attribute("title") or ""),
                (a.get_attribute("data-text") or "")
            ]).lower()

            score = _score_candidate_link(haystack, href_path, keywords, strong)

            trimmed_path = href_path.rstrip("/")
            if trimmed_path in PEOPLE_DIR_SLUGS:
                score += 1.0
            if _matches_segments(trimmed_path, ["leadership", "executive", "team", "people", "board", "management"]):
                score += 0.5

            if href_abs not in best_score or score > best_score[href_abs]:
                best_score[href_abs] = score
        except WebDriverException:
            continue

    # sorted() is stable, so equally scored links keep their DOM order.
    ranked = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:8]  # SPEED: cap candidates to examine

# Site-level remaining time helper (uses SITE_DEADLINE set per-site in process_url)
def site_time_remaining() -> float:
    """Seconds left before SITE_DEADLINE, never negative.

    If the deadline cannot be evaluated (for instance it has not been set yet),
    the full SITE_TIME_BUDGET is returned.
    """
    try:
        seconds_left = SITE_DEADLINE - time.time()
        return seconds_left if seconds_left > 0.0 else 0.0
    except Exception:
        return SITE_TIME_BUDGET

def choose_news_page(driver, base_url: str, debug: dict) -> str | None:
    """
    Select a news/press page starting from the page the driver is on.

    Up to six same-domain links scored against NEWS_KEYWORDS are visited in
    rank order ("/news" paths are allowed; the other bad contexts stay excluded).
    The first page that shows article-like cards within 6 seconds is returned.
    Everything tried is recorded in ``debug`` under ``news_candidates``,
    ``news_accepted``, ``news_rejected`` and ``news_errors``.
    Returns None when no candidate qualifies.
    """
    allowed_exclusions = BAD_CONTEXT_PATHS - {"/news"}
    ranked = find_page_link_candidates(
        driver,
        NEWS_KEYWORDS,
        strong=False,
        exclude_paths=allowed_exclusions,
    )
    shortlist = ranked[:6]
    debug["news_candidates"] = [pair[0] for pair in shortlist]

    # look for typical WP archive markers
    card_selectors = [
        "article", ".wp-block-post", ".post", ".entry", ".card", ".grid-item"
    ]

    already_tried = set()
    for u, _score in shortlist:
        if u in already_tried:
            continue
        already_tried.add(u)
        try:
            if not safe_get(driver, u):
                debug.setdefault("news_errors", []).append({"url": u, "error": "timeout"})
                continue
            wait_for_body(driver)
            dismiss_cookie_banners(driver)

            if wait_for_content(driver, selectors=card_selectors, timeout=6):
                debug["news_accepted"] = u
                return u
            debug.setdefault("news_rejected", []).append({"url": u, "reason": "no_cards"})
        except Exception as e:
            debug.setdefault("news_errors", []).append({"url": u, "error": str(e)})

    return None

def find_about_link(driver) -> str | None:
    """Return the URL of the page that most likely is the site's "about" page.

    First pass: the first same-domain anchor whose path (trailing slash
    removed) is one of PREFERRED_ABOUT_PATHS. Second pass: the best-ranked
    ABOUT_KEYWORDS candidate that is not inside a bad context path.
    Returns None when neither pass finds anything.
    """
    anchors = driver.find_elements(By.TAG_NAME, "a")
    base = driver.current_url
    base_domain = urlparse(base).netloc

    for anchor in anchors:
        try:
            href = anchor.get_attribute("href")
            if not href:
                continue
            absolute = urljoin(base, href)
            parts = urlparse(absolute)
            trimmed_path = (parts.path or "").lower().rstrip("/")
            if parts.netloc == base_domain and trimmed_path in PREFERRED_ABOUT_PATHS:
                return absolute
        except WebDriverException:
            continue

    for candidate_url, _ in find_page_link_candidates(driver, ABOUT_KEYWORDS, strong=False):
        candidate_path = urlparse(candidate_url).path.lower()
        in_bad_context = any(b in candidate_path for b in BAD_CONTEXT_PATHS)
        if not in_bad_context or candidate_path in PREFERRED_ABOUT_PATHS:
            return candidate_url
    return None

def click_keyword_button(driver, keywords: list[str]) -> bool:
    """Click the first button-like control that mentions a keyword; report whether the URL changed.

    A control matches when a keyword occurs in its lowercased text/aria-label/title
    or in its lowercased onclick/data-url/data-href attributes. After the click
    the URL gets up to 5 seconds to change. Controls that raise a WebDriver
    error are skipped. Returns False when nothing matched.
    """
    start_url = driver.current_url

    def _joined_lower(*values) -> str:
        # Join only the non-empty values, mirroring filter(None, ...).
        return " ".join(v for v in values if v).lower()

    for control in driver.find_elements(By.CSS_SELECTOR, "button, [role='button'], .btn, .button"):
        try:
            text = _joined_lower(
                control.text or "",
                control.get_attribute("aria-label") or "",
                control.get_attribute("title") or "",
            )
            attrs = _joined_lower(
                control.get_attribute("onclick") or "",
                control.get_attribute("data-url") or "",
                control.get_attribute("data-href") or "",
            )

            if not (any(k in text for k in keywords) or any(k in attrs for k in keywords)):
                continue

            driver.execute_script("arguments[0].click();", control)
            try:
                WebDriverWait(driver, 5).until(lambda d: d.current_url != start_url)
            except TimeoutException:
                pass
            return driver.current_url != start_url
        except WebDriverException:
            continue
    return False

# ---------------------------
# Person signals & heuristics
# ---------------------------


# CSS selectors for containers that may hold one person; extract_cards_from_container()
# tries them in this order.
PERSON_CARD_SELECTORS = [
    # generic team/profile card and tile classes
    ".team-member", ".leader", ".staff", ".person", ".profile",
    ".member", ".employee", ".card", ".tile", ".grid-item", "article",
    ".profile-card", ".bio-card", ".team-card", ".exec-card", ".board-card",
    # NEW: common WordPress block containers that often hold name/title
    ".wp-block-group", ".wp-block-columns", ".wp-block-column", ".wp-block-post"
]

# Selectors tried in order inside a card to find the person's name; the first match
# whose text passes looks_like_name() is used.
PERSON_NAME_SELECTORS = [
    "[itemprop='name']", "[data-name]", ".name", ".person-name", ".team-name",
    ".member-name", ".profile-name", ".card-title", ".heading", "h2", "h3", "h4", "figcaption", ".tile-start-name"
]
# Selectors tried in order inside a card to find the job title; the first match
# whose text passes looks_like_title() is used.
PERSON_TITLE_SELECTORS = [
    "[itemprop='jobTitle']", "[data-title]", ".title", ".role", ".position",
    ".job-title", ".card-subtitle", ".subtitle", ".designation", "small", "em", "strong", ".tile-start-title"
]

def looks_like_title(text: str) -> bool:
    """Loose check for whether ``text`` could be a job title.

    True when the lowercased text contains any COMMON_TITLES entry; otherwise
    True only for text of 4–79 characters that has no run of two or more
    sentence-ending marks (., ! or ?).
    """
    if not text:
        return False

    lowered = text.strip().lower()
    for known_title in COMMON_TITLES:
        if known_title in lowered:
            return True

    if re.search(r"[.!?]{2,}", lowered):
        return False
    return 3 < len(lowered) < 80

def looks_like_name(text: str) -> bool:
    """Heuristic: does ``text`` look like a person's name?

    Rejects empty text, text containing a NON_NAME_TERMS word, known headings
    and menu labels, text with '/', '|' or '+', and text shorter than 2 or
    longer than 80 characters. Otherwise requires at least two "proper"
    tokens (capitalised word, initial such as "J.", O'Name, or all-caps).

    NON_NAME_TERMS are matched as whole words, not substrings, so surnames
    that merely contain a term ("Bishop" contains "shop", "Malloy" contains
    "alloy") are no longer rejected.
    """
    if not text:
        return False
    t = clean_text(text)
    tl = t.lower()

    # Whole-word comparison against UI/navigation vocabulary.
    words_lower = set(re.findall(r"[a-z]+", tl))
    if words_lower & set(NON_NAME_TERMS):
        return False
    if tl in NON_NAME_HEADINGS or tl.strip(":") in NON_NAME_HEADINGS:
        return False
    if tl in MENU_WORDS:
        return False
    if any(sym in t for sym in ["/", "|", "+"]):
        return False
    if len(t) > 80 or len(t) < 2:
        return False

    words = re.split(r"\s+", t)
    def is_proper(w):
        return re.match(r"^[A-Z][a-z'’\-]+$|^[A-Z]\.$|^O'[A-Z][a-z]+$|^[A-Z]{2,}$", w) is not None
    proper_tokens = sum(1 for w in words if is_proper(w))
    return proper_tokens >= 2

def is_leadership_title(title: str) -> bool:
    """Return True when ``title`` contains a leadership token at a word boundary.

    Short acronyms (three letters or fewer: ceo, coo, cto, vp, ...) must be
    whole words, so "Coordinator" no longer counts because of "coo" and
    "Doctor" no longer counts because of "cto". Longer tokens only need to
    start at a word boundary, so "Directors" or "Chairwoman" still match.
    ``None`` or empty input returns False.
    """
    t = (title or "").lower()
    for tok in LEADERSHIP_TITLE_TOKENS:
        pattern = r"\b" + re.escape(tok) + (r"\b" if len(tok) <= 3 else "")
        if re.search(pattern, t):
            return True
    return False

# ---------- Role tokens / phrases & helpers (overlay-aware) ----------
def norm_role(role_text: str) -> str:
    """Normalise a role string for matching.

    Lowercases, replaces every character other than a-z, 0-9, whitespace,
    '-' and '+' with a space, then collapses whitespace. ``None`` gives "".
    """
    r = (role_text or "").lower()
    r = re.sub(r'[^a-z0-9\s\-+]', ' ', r)  # keep alnum, spaces, hyphen, plus
    return re.sub(r'\s+', ' ', r).strip()

# Exact tokens (standalone words) -> leadership signal
ROLE_TOKENS: Dict[str, float] = {
    'owner': 2.2, 'co-owner': 2.0,
    'founder': 1.5, 'co-founder': 1.4,
    'president': 1.8,
    'ceo': 1.6, 'coo': 1.2, 'cfo': 0.9,          # acronyms only as tokens
    'principal': 1.0, 'partner': 1.0,
    'chairman': 0.8, 'chair': 0.8,
    'director': 0.5,                              # generic director modest
}

# Exact phrases (contiguous substrings) -> leadership signal
ROLE_PHRASES: Dict[str, float] = {
    'chief executive officer': 1.6,
    'chief operating officer': 1.2,
    'chief financial officer': 0.9,
    'managing director': 1.5,
    'general manager': 1.4,
    'board member': 0.4,
    'chairman of the board': 0.8,                 # CGSchmidt card phrase
}

# Blocklist: staff/support roles (never leadership)
ROLE_BLOCKLIST = {'coordinator', 'assistant', 'specialist', 'administrator', 'ambassador', 'intern'}

# Role tokens that are acronyms and must be rendered in upper case.
_ROLE_ACRONYMS = {'ceo', 'coo', 'cfo'}

def role_weight_tokens(role_text: str) -> float:
    """Return the leadership weight of a role; 0.0 if blocklisted or unmatched.

    Every matching token and phrase is considered and the highest weight wins,
    so "Managing Director" scores as the phrase (1.5) rather than as the
    generic token "director" (0.5).
    """
    r = norm_role(role_text)
    tokens = set(r.split())
    if tokens & ROLE_BLOCKLIST:
        return 0.0
    weights = [w for tok, w in ROLE_TOKENS.items() if tok in tokens]
    weights.extend(w for phrase, w in ROLE_PHRASES.items() if phrase in r)
    return max(weights, default=0.0)

def canonical_title(title: str) -> str:
    """Map title variants to clean labels.

    A known phrase is returned title-cased; otherwise known tokens decide:
    'President & CEO' for that combination, a fixed label for COO + Executive
    Vice President, else the first matching token (acronyms upper-cased,
    e.g. "CEO" rather than "Ceo"). Unrecognised titles are returned stripped;
    ``None`` gives "".
    """
    r = norm_role(title)
    # phrase first
    for phrase in ROLE_PHRASES:
        if phrase in r:
            return phrase.title()
    # token fallback
    words = r.split()
    tok_hits = [tok for tok in ROLE_TOKENS if tok in words]
    if tok_hits:
        # Special-case combined roles
        if 'president' in tok_hits and 'ceo' in tok_hits:
            return 'President & CEO'
        if 'coo' in tok_hits and 'executive' in r and 'vice' in r and 'president' in r:
            return 'Chief Operating Officer, Executive Vice President'
        # single token
        first = tok_hits[0]
        return first.upper() if first in _ROLE_ACRONYMS else first.title()
    return (title or "").strip()

# ---------------------------
# JSON-LD & Microdata
# ---------------------------

def parse_jsonld_people(soup: BeautifulSoup, base_url: str) -> list[dict]:
    """Collect schema.org ``Person`` entries from the page's JSON-LD scripts.

    Looks at every top-level item and at the members of its ``@graph``.
    Malformed JSON, non-object items and non-object graph members are skipped
    instead of raising. ``@type`` may be a string or a list of strings.

    Args:
        soup: Parsed page.
        base_url: Base used to resolve relative image/profile URLs.

    Returns:
        One dict per person that has a name, title or bio, with keys
        ``name``, ``title``, ``bio``, ``image_url``, ``profile_url`` and
        ``source`` (always ``"jsonld"``).
    """
    def _is_person(node: dict) -> bool:
        declared = node.get("@type")
        declared = declared if isinstance(declared, list) else [declared]
        return any(isinstance(t, str) and t.strip().lower() == "person" for t in declared)

    leaders = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (ValueError, TypeError):
            # Empty or invalid JSON-LD block: nothing to extract from it.
            continue

        items = data if isinstance(data, list) else [data]
        for item in items:
            if not isinstance(item, dict):
                continue
            graph = item.get("@graph", [])
            if isinstance(graph, dict):
                graph = [graph]
            # Copy so the parsed document itself is never mutated.
            candidates = list(graph) if isinstance(graph, list) else []
            if _is_person(item):
                candidates.append(item)

            for p in candidates:
                if not isinstance(p, dict) or not _is_person(p):
                    continue
                name  = normalize_field(p.get("name", ""))
                title = normalize_field(p.get("jobTitle", ""))
                bio   = normalize_field(p.get("description", ""))
                img   = normalize_field(p.get("image", ""))
                url   = normalize_field(p.get("url", ""))
                if name or title or bio:
                    leaders.append({
                        "name": name, "title": title, "bio": bio,
                        "image_url": urljoin(base_url, img) if img else "",
                        "profile_url": urljoin(base_url, url) if url else "",
                        "source": "jsonld"
                    })
    return leaders

# ---------------------------
# Leadership extraction (Optimized order + limits)
# ---------------------------


def find_leadership_sections(soup: BeautifulSoup) -> list[BeautifulSoup]:
    """Collect page sections that probably list leadership/team members.

    Two passes over the main container, both skipping header/nav/footer areas:
    (1) section/div/article elements whose first h1–h4 heading contains a
    leadership keyword, and (2) elements whose id/class hints at
    leader/team/staff/people/executive/management/board, plus common WordPress
    block containers. Duplicates are dropped by object identity with order
    preserved; the main container itself is returned when nothing matched.
    """
    root = get_main_container(soup)
    found = []

    # Pass 1: heading text mentions a leadership keyword.
    for block in root.find_all(["section", "div", "article"]):
        if is_in_excluded_area(block):
            continue
        heading = block.find(["h1", "h2", "h3", "h4"])
        heading_text = clean_text(heading.get_text()) if heading else ""
        if not heading_text:
            continue
        lowered = heading_text.lower()
        if any(k in lowered for k in LEADERSHIP_KEYWORDS):
            found.append(block)

    # Pass 2: id/class hints and WP block containers.
    hint_selectors = [
        "[id*='leader']", "[class*='leader']",
        "[id*='team']", "[class*='team']",
        "[id*='staff']", "[class*='staff']",
        "[id*='people']", "[class*='people']",
        "[id*='executive']", "[class*='executive']",
        "[id*='management']", "[class*='management']",
        "[id*='board']", "[class*='board']",
        # NEW: WP block containers near leadership sections
        ".wp-block-group", ".wp-block-columns", ".wp-block-post"
    ]
    for css in hint_selectors:
        found.extend(node for node in root.select(css) if not is_in_excluded_area(node))

    # Identity-based de-duplication, first occurrence wins.
    seen_ids = set()
    unique_sections = []
    for node in found:
        marker = id(node)
        if marker in seen_ids:
            continue
        seen_ids.add(marker)
        unique_sections.append(node)
    return unique_sections or [root]


def extract_by_headings_nearby_title(soup: BeautifulSoup) -> list[dict]:
    """Find people by pairing name-like headings with a nearby title-like text.

    For every h2/h3/h4 in the main container whose text passes
    ``looks_like_name``, a title is searched first among up to five following
    siblings, then among the first six p/div/small/em/strong descendants of the
    nearest div/section/article ancestor. Headings without a title are dropped.

    Returns:
        Dicts with ``name`` and ``title`` filled and ``bio``, ``image_url``,
        ``profile_url`` left empty, de-duplicated by lowercased (name, title).
    """
    root = get_main_container(soup)
    out = []

    # Title vocabulary, matched as plain substrings of the lowercased candidate text.
    # NOTE(review): substring matching means e.g. "coo" also hits "coordinator".
    title_tokens = {
        "chief", "ceo", "coo", "cfo", "cto", "cio",
        "president", "chair", "chairman", "vice president",
        "vp", "svp", "evp", "executive vice president",
        "senior vice president", "director", "principal"
    }

    # Scan h2/h3/h4 that look like names
    for h in root.find_all(["h2", "h3", "h4"]):
        name = clean_text(h.get_text())
        if not looks_like_name(name):
            continue

        # Walk a few following siblings to find a paragraph/div with title tokens
        title = ""
        sib = h
        steps = 0
        while sib and steps < 5:
            sib = sib.find_next_sibling()
            steps += 1
            # A missing sibling ends the walk via the loop condition; other tags are skipped.
            if not sib or sib.name not in {"p", "div", "small", "em", "strong"}:
                continue
            cand = clean_text(sib.get_text())
            # Remove quoted pull-quotes; keep first sentence/line
            # NOTE(review): clean_text() has already collapsed newlines, so split("\n") is a
            # no-op here, and the two quote splits are identical so the second is redundant —
            # confirm what the second delimiter was meant to be.
            cand = cand.split("\n")[0].split(".”")[0].split(".”")[0]
            cl = cand.lower()
            if any(tok in cl for tok in title_tokens) and looks_like_title(cand):
                title = cand
                break

        if not title:
            # Look downward inside the same parent container for a p/div containing a title token
            parent = h.find_parent(["div", "section", "article"]) or root
            # Only the first six matching descendants are inspected.
            for el in parent.find_all(["p", "div", "small", "em", "strong"], limit=6):
                cand = clean_text(el.get_text())
                cl = cand.lower()
                if any(tok in cl for tok in title_tokens) and looks_like_title(cand):
                    title = cand
                    break

        if title:
            out.append({"name": name, "title": title, "bio": "", "image_url": "", "profile_url": ""})

    # De-dup by (name,title), keeping the first occurrence
    uniq = {}
    for L in out:
        key = (normalize_field(L["name"]).lower(), normalize_field(L["title"]).lower())
        if key not in uniq:
            uniq[key] = L
    return list(uniq.values())

def extract_cards_from_container(container: BeautifulSoup, base_url: str) -> list[dict]:
    """
    Build leader records from "person card" elements found inside `container`.

    Every element matched by any selector in PERSON_CARD_SELECTORS is inspected.
    A card is kept only when it yields a title accepted by looks_like_title and
    a name accepted by looks_like_name (taken from a name element or, failing
    that, from the alt text of the card's first image).

    Returns a list of dicts with keys name, title, bio (capped at 2000 chars),
    image_url and profile_url; both URLs are resolved against `base_url`.
    An element matched by several card selectors is reported once per match;
    no de-duplication happens here.
    """
    bio_selectors = [".bio", ".summary", ".description", "[itemprop='description']"]
    image_attrs = ["src", "data-src", "data-original", "data-image"]

    def first_accepted(card, selectors, accepts):
        # First element (in selector order) whose cleaned text passes `accepts`.
        for css in selectors:
            node = card.select_one(css)
            label = clean_text(node.get_text() if node else "")
            if label and accepts(label):
                return node
        return None

    records = []
    for card_css in PERSON_CARD_SELECTORS:
        for card in container.select(card_css):
            # Near-empty wrappers cannot describe a person.
            if len(clean_text(card.get_text())) < 5:
                continue

            name_node = first_accepted(card, PERSON_NAME_SELECTORS, looks_like_name)
            title_node = first_accepted(card, PERSON_TITLE_SELECTORS, looks_like_title)
            if not title_node:
                # A title is mandatory; a missing name may still come from img alt.
                continue

            # First bio-like element carrying more than 30 characters of text.
            bio_node = next(
                (
                    node
                    for node in map(card.select_one, bio_selectors)
                    if node and len(clean_text(node.get_text())) > 30
                ),
                None,
            )

            # First non-empty image attribute (covers common lazy-load attributes).
            photo = card.select_one("img")
            photo_src = ""
            if photo:
                photo_src = next(
                    (val for attr in image_attrs if (val := photo.get(attr))),
                    "",
                )

            anchor = card.select_one("a[href]")
            target = anchor.get("href") if anchor else ""

            person = clean_text(name_node.get_text()) if name_node else ""
            role = clean_text(title_node.get_text())
            blurb = clean_text(bio_node.get_text()) if bio_node else ""

            if photo and not person:
                alt_text = clean_text(photo.get("alt", ""))
                if alt_text and looks_like_name(alt_text):
                    person = alt_text

            if not person:
                continue

            records.append({
                "name": person,
                "title": role,
                "bio": blurb[:2000] if blurb else "",
                "image_url": urljoin(base_url, photo_src) if photo_src else "",
                "profile_url": urljoin(base_url, target) if target else "",
            })
    return records

def harvest_name_title_from_headings(container: BeautifulSoup) -> list[dict]:
    """
    Pair name-like headings with a nearby title.

    For every h2/h3/h4/figcaption in `container` whose text passes
    looks_like_name, the title is taken from:
      1. the next sibling element (skipping <br>/<hr>), if it passes
         looks_like_title; otherwise
      2. the first following element, in document order, that carries one of the
         title classes or is a <small>/<em>/<strong>, and passes looks_like_title.

    Headings without a title are dropped. Returns dicts with keys name, title,
    bio, image_url, profile_url (the last three always empty here).
    """
    # find_next() takes a tag NAME as its first argument, not a CSS selector.
    # Passing ".title" there looks for a tag literally named ".title" and never
    # matches, so class lookups must go through the class_ keyword instead.
    title_lookups = [
        {"class_": "title"},
        {"class_": "role"},
        {"class_": "position"},
        {"class_": "job-title"},
        {"name": "small"},
        {"name": "em"},
        {"name": "strong"},
    ]

    leaders = []
    for h in container.find_all(["h2", "h3", "h4", "figcaption"]):
        name = clean_text(h.get_text())
        if not looks_like_name(name):
            continue

        title = ""
        # Step over purely presentational siblings before reading the title.
        sib = h.find_next_sibling()
        while sib and sib.name in ["br", "hr"]:
            sib = sib.find_next_sibling()
        if sib:
            candidate = clean_text(sib.get_text())
            if looks_like_title(candidate):
                title = candidate

        if not title:
            # NOTE: find_next() searches the rest of the document, so it is not
            # limited to `container`; looks_like_title is the only guard.
            for lookup in title_lookups:
                el = h.find_next(**lookup)
                if el:
                    candidate = clean_text(el.get_text())
                    if looks_like_title(candidate):
                        title = candidate
                        break

        if not title:
            continue
        leaders.append({"name": name, "title": title, "bio": "", "image_url": "", "profile_url": ""})
    return leaders


def harvest_from_images(container: BeautifulSoup, base_url: str) -> list[dict]:
    """
    Overlay-first card parser for 'tile' components:
    - Read .tile-start-name and .tile-start-title directly from the grid.
    - Ignore image alt (e.g., 'RickSchmidt-Bio-Mobile' is not a name).
    - Do not visit profiles for titles; only attach href for later bio enrichment if needed.

    When either overlay element is missing, the tile's visible text lines are
    scanned instead: the first name-like line becomes the name, and the first of
    the (up to four) lines after it with a positive role_weight_tokens score
    becomes the title.

    Returns dicts with keys name, title (canonicalised), bio (always empty),
    image_url and profile_url, both resolved against `base_url`.
    """
    leaders = []
    # Limit scope to tiles (owl carousel/card grid)
    for tile in container.select(".tile"):
        # Explicit overlay selectors (CGSchmidt structure)
        name_el  = tile.select_one(".tile-start-name")
        title_el = tile.select_one(".tile-start-title")

        name  = clean_text(name_el.get_text()) if name_el else ""
        title = clean_text(title_el.get_text()) if title_el else ""

        # Fallback: parse visible lines when classes differ
        if not (name and title):
            # NOTE(review): this relies on clean_text preserving "\n"; if it
            # collapses all whitespace, `lines` has a single entry — confirm.
            raw = clean_text(tile.get_text(separator="\n"))
            # Split once and strip every entry so the name picked below is
            # guaranteed to be found again by lines.index().
            lines = [ln.strip() for ln in re.split(r"\s*\n+\s*", raw) if ln.strip()]

            if not name:
                # first name-like line
                name = next((ln for ln in lines if looks_like_name(ln)), "")

            if not title and name in lines:
                # scan next few lines after name for a leadership title (COO != Coordinator)
                start = lines.index(name) + 1
                for candidate in lines[start:start + 4]:
                    if role_weight_tokens(candidate) > 0.0:
                        title = candidate
                        break

        # Validate and canonicalize
        if not looks_like_name(name):
            continue
        if role_weight_tokens(title) <= 0.0:
            continue
        title = canonical_title(title)

        # Optional image/profile link (do not rely on profile for title)
        img_el = tile.select_one("img")
        img_src = ""
        if img_el:
            for attr in ["src", "data-src", "data-original", "data-image"]:
                val = img_el.get(attr)
                if val:
                    img_src = val
                    break

        # Prefer the 'Learn More' CTA. A selector list such as
        # "a.cta[href], a[href]" returns the first anchor in document order, so
        # the preference has to be expressed as two separate lookups.
        a_el = tile.select_one("a.cta[href]") or tile.select_one("a[href]")
        href = a_el.get("href") if a_el else ""

        leaders.append({
            "name": name,
            "title": title,
            "bio": "",  # bio can be enriched later, but we won't overwrite title from profiles
            "image_url": urljoin(base_url, img_src) if img_src else "",
            "profile_url": urljoin(base_url, href) if href else "",
        })
    return leaders


def extract_leaders_from_text_blocks(soup: BeautifulSoup, base_url: str) -> list[dict]:
    """
    Text-only fallback: find name/title pairs in the h2/h3/h4/p/li elements of
    the page's main container (as returned by get_main_container).

    Two patterns are recognised:
      * a heading immediately followed by a heading/p/div/span holding the title;
      * inline text such as "Name, Title", "Name - Title", "Name — Title" or
        "Name–Title", with several people optionally separated by ';' or newlines.

    `base_url` is accepted for signature parity with the other extractors and is
    not used. Returns de-duplicated dicts (by normalised name + title) with keys
    name, title, bio, image_url, profile_url (the last three always empty).
    """
    leaders = []
    root = get_main_container(soup)

    def find_name_title_pairs_in_text(block_text: str) -> list[tuple[str, str]]:
        # Return every (name, title) pair recognised in one block of text.
        out = []
        if not block_text:
            return out

        for segment in re.split(r"[;\n\r]+", block_text):
            pieces = re.split(r" — | - ", segment)

            # "Name - Title" / "Name — Title": the spaced dash is itself the
            # name/title separator. Splitting on it alone (as before) leaves the
            # two halves as unrelated fragments, so adjacent pieces are paired
            # here before each piece is examined individually.
            for left, right in zip(pieces, pieces[1:]):
                name_candidate = clean_text(left)
                title_candidate = clean_text(right)
                if (name_candidate and title_candidate
                        and looks_like_name(name_candidate)
                        and looks_like_title(title_candidate)):
                    out.append((name_candidate, title_candidate))

            # Per-piece analysis: comma / unspaced-dash separated pairs, or a
            # "first four words are the name" guess when no separator exists.
            for piece in pieces:
                part = clean_text(piece)
                if not part:
                    continue
                if re.search(r"[,–—-]", part):
                    tokens = re.split(r"[,–—-]", part, maxsplit=1)
                    name_candidate = clean_text(tokens[0]) if tokens else ""
                    title_candidate = clean_text(tokens[1]) if len(tokens) > 1 else ""
                else:
                    tokens = part.split()
                    name_candidate = clean_text(" ".join(tokens[:4]))
                    title_candidate = clean_text(" ".join(tokens[4:]))
                if looks_like_name(name_candidate) and looks_like_title(title_candidate):
                    out.append((name_candidate, title_candidate))
        return out

    for el in root.find_all(["h2", "h3", "h4", "p", "li"]):
        # Header-followed-by-header pattern: e.g. <h3>NAME</h3>\n<h4>TITLE</h4>
        try:
            if el.name in ("h2", "h3", "h4"):
                nxt = el.find_next_sibling()
                # skip over non-tags/text nodes
                while nxt is not None and getattr(nxt, 'name', None) is None:
                    nxt = nxt.find_next_sibling()
                if nxt is not None and getattr(nxt, 'name', None) in ("h2", "h3", "h4", "p", "div", "span"):
                    name_cand = clean_text(el.get_text())
                    title_cand = clean_text(nxt.get_text())
                    if looks_like_name(name_cand) and looks_like_title(title_cand):
                        leaders.append({"name": name_cand, "title": title_cand, "bio": "", "image_url": "", "profile_url": ""})
                        # continue to next element to avoid double-count
                        continue
        except Exception:
            # Deliberately best-effort: a malformed node must not stop the
            # inline-text analysis of the same element below.
            pass

        text = clean_text(el.get_text())
        for name, title in find_name_title_pairs_in_text(text):
            leaders.append({
                "name": name, "title": title, "bio": "",
                "image_url": "", "profile_url": ""
            })

    # De-dup by (name, title), keeping the first occurrence.
    uniq = {}
    for L in leaders:
        key = (normalize_field(L["name"]).lower(), normalize_field(L["title"]).lower())
        if key not in uniq:
            uniq[key] = L
    return list(uniq.values())

def collect_profile_links(container: BeautifulSoup, base_url: str) -> list[str]:
    """
    Collect absolute URLs inside `container` that are likely to lead to
    people/profile pages. Broadened to catch WP/team patterns and profile
    pages more reliably.

    A link is accepted when, after filtering, any of these holds:
      * its path or query contains a people-related keyword;
      * its anchor text passes looks_like_name;
      * its path contains a known directory segment (e.g. "/about/");
      * it sits inside one of the known listing containers (fallback pass).

    Filtering, applied before the accept rules:
      * empty, "#" and "/" hrefs are ignored;
      * only http/https targets are kept (mailto:, tel:, javascript: are dropped);
      * in the main pass, paths ending in one of BAD_PATHS are dropped.

    Keywords are matched against path and query only, never the host, so a
    domain that happens to contain "team", "bio", "people", ... does not make
    every link on the site look like a profile link.

    Returns the URLs in first-seen order without duplicates.
    """
    BAD_PATHS = ("/contact", "/careers", "/news", "/locations", "/services", "/markets", "/projects")
    BAD_EXACT = {"#", "/", ""}
    STRONG_KEYWORDS = ("leadership", "executive", "team", "people", "staff", "board", "profile", "member", "bio")
    DIRECTORY_SEGMENTS = ("/company/", "/about/", "/team/", "/people/", "/leadership/", "/board/",
                          "/staff/", "/bio/", "/profiles/", "/members/")
    FALLBACK_SELECTORS = (".wp-block-post a", ".wp-block-group a", ".team-list a", ".person-card a", ".grid a")

    def resolve(raw_href) -> str:
        # Absolute http(s) URL for a raw href, or "" when it is not navigable.
        raw = (raw_href or "").strip()
        if raw in BAD_EXACT:
            return ""
        absolute = urljoin(base_url, raw)
        if urlparse(absolute).scheme.lower() not in ("http", "https"):
            return ""
        return absolute

    links = []

    for a in container.find_all("a", href=True):
        href_abs = resolve(a["href"])
        if not href_abs:
            continue

        parsed = urlparse(href_abs)
        path_low = (parsed.path or "").lower()
        if any(path_low.rstrip("/").endswith(bp) for bp in BAD_PATHS):
            continue

        # Strong signals (host and fragment deliberately excluded)
        haystack = path_low + ("?" + parsed.query.lower() if parsed.query else "")
        if any(k in haystack for k in STRONG_KEYWORDS):
            links.append(href_abs)
            continue
        if looks_like_name(clean_text(a.get_text())):
            links.append(href_abs)
            continue

        # WP-ish and team directories
        if any(seg in path_low for seg in DIRECTORY_SEGMENTS):
            links.append(href_abs)

    # Also harvest links from known containers (fallback)
    for sel in FALLBACK_SELECTORS:
        for a in container.select(sel):
            href_abs = resolve(a.get("href"))
            if href_abs:
                links.append(href_abs)

    # uniquify, preserving first-seen order
    return list(dict.fromkeys(links))

def enrich_from_profiles(driver, profile_links: list[str], base_url: str, max_profiles: int = MAX_PROFILES_TO_ENRICH) -> list[dict]:
    """
    Visit individual profile pages and read one name/title/bio from each.

    At most `max_profiles` links (taken from the front of `profile_links`) are
    considered. Iteration stops as soon as site_time_remaining() reports no time
    left. Links whose path contains any BAD_CONTEXT_PATHS entry, pages that fail
    to load, and pages lacking either a name or a title are skipped; any
    exception while handling a link also skips just that link.

    `base_url` is not used by this function. Returns dicts with keys name,
    title, bio, image_url (always empty) and profile_url (the visited link).
    """
    def read_name(page) -> str:
        # First h1, then first h2, then first h3; fall back to first <img> alt.
        for heading_tag in ["h1", "h2", "h3"]:
            heading = page.find(heading_tag)
            if heading:
                text = clean_text(heading.get_text())
                if looks_like_name(text):
                    return text
        picture = page.find("img")
        alt_text = clean_text(picture.get("alt", "")) if picture else ""
        return alt_text if looks_like_name(alt_text) else ""

    def read_title(page) -> str:
        # First element per selector; fall back to the first sentence of first <p>.
        for css in [".title", ".role", ".position", ".job-title", "strong", "em"]:
            node = page.select_one(css)
            if node:
                text = clean_text(node.get_text())
                if looks_like_title(text):
                    return text
        paragraph = page.find("p")
        lead = clean_text(paragraph.get_text()) if paragraph else ""
        lead = lead.split(".")[0][:120]
        return lead if looks_like_title(lead) else ""

    def read_bio(page) -> str:
        # First bio-like element with more than 30 characters of cleaned text.
        for css in [".bio", ".summary", ".description"]:
            node = page.select_one(css)
            if node and len(clean_text(node.get_text())) > 30:
                return clean_text(node.get_text())
        return ""

    collected = []
    for profile_url in profile_links[:max_profiles]:
        if site_time_remaining() <= 0:
            break
        try:
            lowered_path = urlparse(profile_url).path.lower()
            if any(marker in lowered_path for marker in BAD_CONTEXT_PATHS):
                continue

            if not safe_get(driver, profile_url):
                continue
            wait_for_body(driver)
            dismiss_cookie_banners(driver)

            page = get_soup(driver)
            person = read_name(page)
            role = read_title(page)
            if not (person and role):
                continue

            collected.append({
                "name": person, "title": role, "bio": read_bio(page),
                "image_url": "", "profile_url": profile_url
            })
        except Exception:
            continue
    return collected

def filter_to_likely_leaders(leaders: list[dict], page_url: str) -> list[dict]:
    """
    Narrow a people list to leadership roles when the page is a general
    staff/people directory.

    If the path of `page_url` contains one of the directory markers below, only
    entries whose normalised title passes is_leadership_title are returned (as a
    new list). Otherwise the input list itself is returned unchanged.
    """
    directory_markers = ("/people", "/team", "/our-team", "/staff", "/our-people")
    lowered_path = urlparse(page_url).path.lower()

    on_directory_page = False
    for marker in directory_markers:
        if marker in lowered_path:
            on_directory_page = True
            break
    if not on_directory_page:
        return leaders

    kept = []
    for entry in leaders:
        if is_leadership_title(normalize_field(entry.get("title", ""))):
            kept.append(entry)
    return kept

def dedup_leaders(leaders: list[dict]) -> list[dict]:
    """
    Collapse leader records that share the same (name, profile_url) identity.

    Each input dict is normalised IN PLACE (name, title, profile_url are passed
    through normalize_field) before comparison. Records without a name are
    dropped. The first record per identity wins, except that a later record
    replaces it when the kept one has no title and the later one does.

    Returns the surviving dicts in first-seen identity order.
    """
    by_identity = {}
    for record in leaders:
        for field in ("name", "title", "profile_url"):
            record[field] = normalize_field(record.get(field, ""))

        identity = (record["name"].lower(), record["profile_url"].lower())
        if not identity[0]:
            continue

        if identity not in by_identity:
            by_identity[identity] = record
        elif not by_identity[identity].get("title") and record.get("title"):
            # Upgrade a title-less entry with the titled duplicate.
            by_identity[identity] = record
    return list(by_identity.values())

def validate_leaders(leaders: list[dict]) -> bool:
    """
    Decide whether `leaders` is a credible leadership list.

    A record counts as valid when its normalised name and title pass
    looks_like_name / looks_like_title, or when its "source" is "jsonld" and it
    has at least a name or a title. The list is credible when two or more
    records are valid.
    """
    valid_count = 0
    for entry in leaders:
        person = normalize_field(entry.get("name", ""))
        role = normalize_field(entry.get("title", ""))
        origin = entry.get("source", "")

        if looks_like_name(person) and looks_like_title(role):
            valid_count += 1
        elif origin == "jsonld" and (person or role):
            valid_count += 1
    return valid_count >= 2

# ---------------------------
# Carousel extraction (Limited)
# ---------------------------

def extract_from_carousel(driver, base_url: str) -> list[dict]:
    """
    Extract person cards from carousel/slider widgets on the current page.

    Up to two carousel wrapper elements are processed. For each, the wrapper's
    innerHTML is parsed with extract_cards_from_container, then the carousel's
    "next" control (if one is found inside the wrapper) is clicked up to
    MAX_CAROUSEL_STEPS times, re-parsing after every click. Stepping stops early
    when the click fails or site_time_remaining() reports no time left.

    Returns the records de-duplicated by (name, title, profile_url, image_url).
    """
    carousel_wrappers = [
        ".slick-slider", ".slick-carousel", ".slick-list",
        ".swiper", ".swiper-container", ".swiper-wrapper",
        ".owl-carousel", ".owl-stage", ".owl-stage-outer",
        ".carousel", ".carousel-inner", ".carousel-container"
    ]
    next_btn_selectors = [".slick-next", ".swiper-button-next", ".owl-next", ".carousel-control-next", ".next", "[aria-label='Next']"]
    settle_seconds = 0.4  # pause after each click so the slide transition can run

    leaders = []

    def parse_slide_html(html: str) -> list[dict]:
        soup = BeautifulSoup(html, "lxml")
        return extract_cards_from_container(soup, base_url)

    def harvest(car, previous_html):
        # Parse the carousel's current markup and return it for the next
        # comparison. Identical markup yields identical records (which the final
        # de-dup would discard anyway), so it is not parsed a second time.
        try:
            html = car.get_attribute("innerHTML") or ""
            if html != previous_html:
                leaders.extend(parse_slide_html(html))
            return html
        except Exception:
            # Best-effort: a stale element or parse failure skips this snapshot.
            return previous_html

    def find_next_button(car):
        # First matching "next" control inside the wrapper, or None.
        for sel in next_btn_selectors:
            try:
                btns = car.find_elements(By.CSS_SELECTOR, sel)
            except WebDriverException:
                continue
            if btns:
                return btns[0]
        return None

    carousels = []
    for sel in carousel_wrappers:
        try:
            carousels.extend(driver.find_elements(By.CSS_SELECTOR, sel))
        except WebDriverException:
            continue

    if not carousels:
        return []

    for car in carousels[:2]:  # SPEED: limit number of carousels parsed
        last_html = harvest(car, None)

        # The per-slide element lookup that used to live here was never read;
        # dropping it saves several WebDriver round trips per carousel.
        next_btn = find_next_button(car)

        step = 0
        while next_btn and step < MAX_CAROUSEL_STEPS:
            if site_time_remaining() <= 0:
                break
            try:
                # JS click first: works even when the control is overlapped.
                driver.execute_script("arguments[0].click();", next_btn)
            except WebDriverException:
                try:
                    next_btn.click()
                except Exception:
                    break
            time.sleep(settle_seconds)
            last_html = harvest(car, last_html)
            step += 1

    uniq = {}
    for L in leaders:
        name = normalize_field(L.get("name","")).lower()
        title = normalize_field(L.get("title","")).lower()
        profile = normalize_field(L.get("profile_url","")).lower()
        image = normalize_field(L.get("image_url","")).lower()
        key = (name, title, profile, image)
        if key not in uniq:
            uniq[key] = L
    return list(uniq.values())

# ---------------------------
# Generic extraction pipeline (Optimized + Adaptive enrichment)
# ---------------------------


def extract_leaders_generic(driver, base_url: str) -> tuple[list[dict], str, dict]:
    """
    Run the leader-extraction pipeline against the page currently loaded in
    `driver` and return (leaders, status, debug).

    Stages, cheapest first:
      1. JSON-LD people and schema.org Person microdata;
      2. cards / tiles / headings inside up to four leadership sections;
      3. heading + nearby-paragraph pairs, then the text-block fallback;
      4. carousel stepping (only when fewer than 4 static records exist);
      5. crawling profile links found in the leadership sections (same gate).

    The de-duplicated static results (stages 1-3) are returned with status
    "leaders_found_static" as soon as validate_leaders accepts them. Stages 4
    and 5 return only their own results, with "leaders_found_carousel" /
    "leaders_found_via_profiles". If nothing validates, the result is
    ([], "leaders_not_listed_or_unrecognized_structure", debug).

    Stage 5 navigates the driver away from the current page.
    """
    def dedup_records(records: list[dict]) -> list[dict]:
        # First occurrence per normalised (name, title, profile_url).
        seen, kept = set(), []
        for rec in records:
            key = (
                normalize_field(rec.get("name", "")).lower(),
                normalize_field(rec.get("title", "")).lower(),
                normalize_field(rec.get("profile_url", "")).lower(),
            )
            if key not in seen:
                seen.add(key)
                kept.append(rec)
        return kept

    def microdata_prop(scope, prop: str, attrs: tuple = (), allow_text: bool = True) -> str:
        # Value of the first [itemprop=prop] in scope: listed attributes first,
        # then `content` (used by <meta>), then optionally the element text.
        el = scope.select_one(f"[itemprop='{prop}']")
        if el is None:
            return ""
        for attr in (*attrs, "content"):
            val = el.get(attr)
            if val:
                return str(val).strip()
        return el.get_text() if allow_text else ""

    soup = get_soup(driver)
    leaders = []
    dbg = {"extract_static": {}}

    # 1) JSON-LD + microdata (FAST PATH)
    jsonld = parse_jsonld_people(soup, base_url)
    leaders.extend(jsonld)
    dbg["extract_static"]["jsonld_count"] = len(jsonld)

    # The loop body here used to be an elided placeholder (`pass`), so microdata
    # was never collected and microdata_count was always 0. This is a minimal
    # schema.org Person reading; records need a name-like name and a job title.
    micro_count = 0
    for person in soup.select("[itemscope][itemtype*='Person']"):
        name = clean_text(microdata_prop(person, "name"))
        title = clean_text(microdata_prop(person, "jobTitle"))
        if not (name and title and looks_like_name(name)):
            continue
        bio = clean_text(microdata_prop(person, "description"))
        img_src = microdata_prop(person, "image", attrs=("src",), allow_text=False)
        href = microdata_prop(person, "url", attrs=("href",), allow_text=False)
        leaders.append({
            "name": name,
            "title": title,
            "bio": bio[:2000] if bio else "",
            "image_url": urljoin(base_url, img_src) if img_src else "",
            "profile_url": urljoin(base_url, href) if href else "",
        })
        micro_count += 1
    dbg["extract_static"]["microdata_count"] = micro_count

    # NOTE(review): no early exit is taken after the fast path; the static
    # results are validated together after stage 3 below.

    # 2) Sections/cards/headings/images (limit sections)
    sections = find_leadership_sections(soup)
    sec_cards = 0
    for sec in sections[:4]:
        c1 = extract_cards_from_container(sec, base_url)
        c2 = harvest_from_images(sec, base_url)
        c3 = harvest_name_title_from_headings(sec)
        leaders.extend(c1)
        leaders.extend(c2)
        leaders.extend(c3)
        sec_cards += len(c1) + len(c2) + len(c3)
    dbg["extract_static"]["section_cards_count"] = sec_cards

    # >>> NEW: headings + nearby paragraphs (very effective on WordPress pages like CG Schmidt)
    fast_pairs = extract_by_headings_nearby_title(soup)
    leaders.extend(fast_pairs)
    dbg["extract_static"]["fast_pairs"] = len(fast_pairs)

    # 3) Text-block fallback
    text_fallback = extract_leaders_from_text_blocks(soup, base_url)
    leaders.extend(text_fallback)
    dbg["extract_static"]["text_block_pairs"] = len(text_fallback)

    # De-dup + validate before heavy steps
    dedup_static = dedup_records(leaders)
    if dedup_static and validate_leaders(dedup_static):
        dbg["pipeline"] = "leaders_found_static"
        return (dedup_static, "leaders_found_static", dbg)

    # Heavy steps are only worth trying when static extraction found <4 records.
    static_is_thin = len(dedup_static) < 4

    # 4) Carousel
    if static_is_thin:
        leaders_carousel = extract_from_carousel(driver, base_url)
        if leaders_carousel and validate_leaders(leaders_carousel):
            dbg["pipeline"] = "leaders_found_carousel"
            return (leaders_carousel, "leaders_found_carousel", dbg)

    # 5) Profile links crawl (ADAPTIVE)
    if static_is_thin:
        profile_links = []
        for sec in sections:
            profile_links.extend(collect_profile_links(sec, base_url))
        profile_links = list(dict.fromkeys(profile_links))
        dbg["profiles_collected"] = len(profile_links)

        if profile_links:
            # With almost nothing found statically and many candidate links,
            # sample at most five profiles instead of the full default cap.
            adaptive_max = MAX_PROFILES_TO_ENRICH
            if len(dedup_static) < 2 and len(profile_links) >= 10:
                adaptive_max = min(5, len(profile_links))

            enriched = enrich_from_profiles(driver, profile_links, base_url, max_profiles=adaptive_max)
            out = dedup_records(enriched)
            if out and validate_leaders(out):
                dbg["pipeline"] = "leaders_found_via_profiles"
                return (out, "leaders_found_via_profiles", dbg)

    dbg["pipeline"] = "leaders_not_listed_or_unrecognized_structure"
    return ([], "leaders_not_listed_or_unrecognized_structure", dbg)


# ---------------------------
# About extraction
# ---------------------------

def extract_about_from_soup(soup: BeautifulSoup) -> str:
    """
    Return the best available "about" text for a page.

    Strategies are tried in order and the first hit is returned:
      1. an element whose id or class contains "about" (more than 60 chars of
         text, outside header/nav/footer/aside chrome, and not <html>/<body>);
      2. a heading containing one of ABOUT_KEYWORDS, together with the sibling
         content that follows it up to the next heading (more than 80 chars);
      3. a main-content container whose text mentions "about" (>120 chars);
      4. the longest non-cookie paragraph mentioning "about";
      5. the page text with cookie/privacy sentences removed, capped at 3000
         characters.
    """
    heading_tags = ["h1", "h2", "h3", "h4"]
    chrome_tags = {"header", "nav", "footer", "aside"}
    chrome_classes = {"site-header", "global-nav", "site-footer", "footer", "navbar", "menu"}

    def not_in_chrome(el):
        # True when neither el nor any ancestor is page chrome (by tag or class).
        p = el
        while p and hasattr(p, "parent"):
            if p.name in chrome_tags:
                return False
            classes = p.get("class", []) or []
            if isinstance(classes, str):
                # Some parsers keep class as one string; compare whole names.
                classes = classes.split()
            if any(cls in chrome_classes for cls in classes):
                return False
            p = p.parent
        return True

    # 1) id/class hints
    for el in soup.select("[id*='about'], [class*='about']"):
        if el.name in ("html", "body"):
            # e.g. <body class="page-about"> would return the whole page,
            # navigation and footer included.
            continue
        text = clean_text(el.get_text(separator=" ", strip=True))
        if len(text) > 60 and not_in_chrome(el):
            return text

    # 2) "About ..." heading plus the content that follows it
    for h in soup.find_all(heading_tags):
        h_text = clean_text(h.get_text()).lower()
        if not (any(k in h_text for k in ABOUT_KEYWORDS) and not_in_chrome(h)):
            continue

        # Start from the heading itself, or from the direct child of the
        # enclosing section that contains it, and walk FORWARD only. Iterating
        # the section's children from the top would stop at any earlier heading
        # and would also pull in content that precedes the "about" heading.
        section = h.find_parent(["section", "article", "div"])
        anchor = h
        if section is not None:
            while anchor.parent is not None and anchor.parent is not section:
                anchor = anchor.parent

        parts = [clean_text(h.get_text())]
        for sib in anchor.find_next_siblings():
            if sib.name in heading_tags:
                break
            parts.append(clean_text(sib.get_text(separator=" ", strip=True)))
        combined = clean_text(" ".join(p for p in parts if p))
        if len(combined) > 80:
            return combined

    # 3) main-content containers mentioning "about"
    for selector in ["main", "[role='main']", "article", ".content", ".page-content", "#content", "#main"]:
        for el in soup.select(selector):
            text = clean_text(el.get_text(separator=" ", strip=True))
            if "about" in text.lower() and len(text) > 120:
                return text

    COOKIE_TERMS = ["cookie", "cookies", "opt out", "opt-out", "opt out of cookies", "save my preferences", "privacy", "consent", "preferences", "opt in"]

    def looks_like_cookie_text(t: str) -> bool:
        tl = t.lower()
        return any(term in tl for term in COOKIE_TERMS)

    # 4) longest paragraph mentioning "about", ignoring cookie/privacy text
    paragraphs = [clean_text(p.get_text()) for p in soup.find_all("p")]
    paragraphs = [p for p in paragraphs if p and not looks_like_cookie_text(p)]
    candidates = [c for c in paragraphs if "about" in c.lower()]
    if candidates:
        return max(candidates, key=len)

    # 5) fallback: use body text but strip cookie/privacy lines first
    page_text = clean_text(soup.get_text(separator=" \n", strip=True))
    full_lines = [l.strip() for l in re.split(r"[\n\r]+|(?<=[\.\!\?])\s+", page_text) if l.strip()]
    filtered_lines = [l for l in full_lines if not looks_like_cookie_text(l)]
    body_text = " ".join(filtered_lines)
    return body_text[:3000]

def _extract_article_cards_from_archive(soup: BeautifulSoup, base_url: str) -> list[dict]:
    """
    Extract article 'cards' (title, href, date/excerpt if present) from a category/archive page.
    We do NOT paginate; we only read what's on the first page.

    Returns at most MAX_NEWS_ARTICLES dicts with keys url, title, date, excerpt.
    Work stops early (returning what was collected so far) once
    site_time_remaining() reports no time left. A card reached through several
    selectors is reported once.
    """
    articles = []
    seen_cards = set()   # (url, title) pairs already emitted
    container = get_main_container(soup)

    selectors = [
        "article",
        ".wp-block-post",
        ".post",
        ".entry",
        ".card",
        ".grid-item",
        ".grid .item",
        ".news-item",
        ".insights-item",
        ".blog-item"
    ]

    for sel in selectors:
        for el in container.select(sel):
            if site_time_remaining() <= 0:
                return articles

            # Link. The in-page-anchor test must look at the RAW href: once
            # urljoin has made it absolute it can no longer start with "#".
            a = el.find("a", href=True)
            raw_href = (a["href"] or "").strip() if a else ""
            if not raw_href or raw_href.startswith("#"):
                continue
            href = urljoin(base_url, raw_href)

            # Title
            title = ""
            h = el.find(["h1", "h2", "h3", "h4"])
            if h:
                title = clean_text(h.get_text())
            if not title and a:
                title = clean_text(a.get_text())

            # The same card often matches several selectors (e.g. a WordPress
            # <article class="post">); without this check the duplicates would
            # use up the MAX_NEWS_ARTICLES cap.
            card_key = (href, title)
            if card_key in seen_cards:
                continue
            seen_cards.add(card_key)

            # Date (best-effort from <time> or meta)
            date_txt = ""
            t = el.find("time")
            if t:
                date_txt = clean_text(t.get("datetime") or t.get_text() or "")
            if not date_txt:
                meta_date = el.find(attrs={"class": re.compile(r"date|posted|time", re.I)})
                if meta_date:
                    date_txt = clean_text(meta_date.get_text())

            # Excerpt
            excerpt = ""
            for ex_sel in [".excerpt", ".entry-summary", ".wp-block-post-excerpt", "p"]:
                ex_el = el.select_one(ex_sel)
                if ex_el:
                    excerpt = clean_text(ex_el.get_text())
                    break

            articles.append({
                "url": href,
                "title": title,
                "date": date_txt,
                "excerpt": excerpt
            })

            if len(articles) >= MAX_NEWS_ARTICLES:
                return articles

    # Fallback: find links that look like articles
    if not articles:
        seen_urls = set()
        for a in container.find_all("a", href=True):
            if site_time_remaining() <= 0:
                break
            href = urljoin(base_url, a["href"])
            if "/news-insights/" in href.lower() and href.rstrip("/") != base_url.rstrip("/"):
                title = clean_text(a.get_text())
                # Short anchor texts are treated as navigation, not headlines.
                if len(title) >= 15 and href not in seen_urls:
                    seen_urls.add(href)
                    articles.append({
                        "url": href,
                        "title": title,
                        "date": "",
                        "excerpt": ""
                    })
            if len(articles) >= MAX_NEWS_ARTICLES:
                break

    return articles[:MAX_NEWS_ARTICLES]

# ---------------------------
# Leadership page selection (Optimized)
# ---------------------------

def probe_known_people_paths(base_url: str) -> list[str]:
    """
    Return guessed people-page URLs for a site: the common slugs below, each
    joined onto `base_url` (with exactly one trailing slash ensured first).
    """
    slugs = ("people", "our-people", "eua-people", "team", "our-team", "staff")
    prefix = base_url.rstrip("/") + "/"
    return [urljoin(prefix, slug) for slug in slugs]

def choose_leadership_page(driver, base_url: str, debug: dict) -> str | None:
    """
    Find the URL of the page that lists the organisation's leaders.

    Three strategies are tried in order; the first page on which
    extract_leaders_generic yields a list accepted by validate_leaders wins:
      1. link candidates found for the strong/weak leadership keywords
         (top 4 strong + top 3 weak);
      2. clicking a keyword button on `base_url`;
      3. probing common slugs from probe_known_people_paths.

    Pages that resolve to an empty path or carry a URL fragment are rejected,
    as are (strategies 1 and 3) paths containing a BAD_CONTEXT_PATHS entry.

    `debug` is updated in place with "leadership_candidates", "rejected",
    "errors" and one of "accepted" / "accepted_via_button" /
    "accepted_probe_known". The per-site time budget is honoured: once
    site_time_remaining() is exhausted no further pages are loaded.

    Returns the accepted URL, or None. The driver is left on whichever page was
    visited last.
    """
    content_selectors = PERSON_CARD_SELECTORS + [".slick-slide", ".swiper-slide", ".owl-item"]

    def record(bucket: str, url: str, field: str, value: str) -> None:
        debug.setdefault(bucket, []).append({"url": url, field: value})

    def out_of_time() -> bool:
        # Matches the budget check used by the other crawl helpers in this file.
        if site_time_remaining() > 0:
            return False
        record("errors", base_url, "error", "site_time_budget_exhausted")
        return True

    def settle(timeout: int, scroll: bool) -> None:
        # Wait for person-like content, then trigger lazy-loaded material.
        wait_for_content(driver, content_selectors, timeout=timeout)
        prime_page_for_extraction(driver)
        if scroll:
            deep_lazy_scroll(driver, cycles=1, pause=0.5)

    def is_real_page(parsed) -> bool:
        # Home page (empty path) and in-page anchors are not leadership pages.
        return bool((parsed.path or "").strip("/")) and not parsed.fragment

    def in_bad_context(parsed) -> bool:
        path = parsed.path.lower()
        return any(b in path for b in BAD_CONTEXT_PATHS)

    def yields_leaders(url: str) -> bool:
        # extract_leaders_generic parses the page itself, so no separate
        # get_soup() call is needed before it.
        leaders, _status, _dbg = extract_leaders_generic(driver, url)
        return bool(leaders) and validate_leaders(leaders)

    strong = find_page_link_candidates(driver, STRONG_LEADERSHIP_KEYWORDS, strong=True)[:4]
    weak   = find_page_link_candidates(driver, WEAK_LEADERSHIP_KEYWORDS, strong=False)[:3]
    cand = strong + weak

    debug["leadership_candidates"] = [u for (u, _) in cand]
    visited = set()

    # 1) Keyword link candidates
    for u, _score in cand:
        if u in visited:
            continue
        visited.add(u)
        if out_of_time():
            return None
        try:
            if not safe_get(driver, u):
                record("errors", u, "error", "timeout_or_cdp_block")
                continue
            wait_for_body(driver)
            dismiss_cookie_banners(driver)
            settle(timeout=6, scroll=True)

            cur = driver.current_url
            parsed = urlparse(cur)
            if not is_real_page(parsed):
                record("rejected", cur, "reason", "fragment_or_empty_path")
                continue
            if in_bad_context(parsed):
                record("rejected", cur, "reason", "bad_context")
                continue

            # Quick accept: see if extraction yields leaders
            if yields_leaders(cur):
                debug["accepted"] = cur
                return cur
            record("rejected", cur, "reason", "no_person_signals")
        except Exception as e:
            record("errors", u, "error", str(e))
            continue

    # 2) Try clicking keyword buttons on the base page
    if out_of_time():
        return None
    try:
        if safe_get(driver, base_url):
            wait_for_body(driver)
            dismiss_cookie_banners(driver)
            if click_keyword_button(driver, STRONG_LEADERSHIP_KEYWORDS + WEAK_LEADERSHIP_KEYWORDS):
                settle(timeout=6, scroll=True)

                cur = driver.current_url
                if not is_real_page(urlparse(cur)):
                    record("rejected", cur, "reason", "fragment_or_empty_path")
                elif yields_leaders(cur):
                    debug["accepted_via_button"] = cur
                    return cur
                else:
                    record("rejected", cur, "reason", "button_navigated_but_no_signals")
    except Exception as e:
        record("errors", base_url, "error", f"button_click: {e}")

    # 3) Probe common slugs (fast: shorter wait, no lazy scroll, silent rejects)
    for guess in probe_known_people_paths(base_url):
        if out_of_time():
            return None
        try:
            if not safe_get(driver, guess):
                continue
            wait_for_body(driver)
            dismiss_cookie_banners(driver)
            settle(timeout=5, scroll=False)

            cur = driver.current_url
            parsed = urlparse(cur)
            if not is_real_page(parsed) or in_bad_context(parsed):
                continue

            if yields_leaders(cur):
                debug["accepted_probe_known"] = cur
                return cur
        except Exception as e:
            record("errors", guess, "error", str(e))
            continue

    # Fallback: None
    return None

def _extract_article_body(driver, article_url: str) -> tuple[str, str]:
    """
    Load a single article URL and return (date, body_text).
    - Prefer <article> or main container
    - Trim menus/footers via get_main_container
    - Cap to MAX_ARTICLE_BODY_CHARS
    - Return date best-effort (if present on page)
    """
    try:
        ok = safe_get(driver, article_url)
        if not ok:
            return ("", "")
        wait_for_body(driver)
        dismiss_cookie_banners(driver)
        soup = get_soup(driver)
        container = soup.select_one("article") or get_main_container(soup)

        # Date (prefer <time datetime>)
        date_txt = ""
        t = container.find("time") if container else None
        if t:
            date_txt = clean_text(t.get("datetime") or t.get_text() or "")
        if not date_txt and soup:
            t2 = soup.find("time")
            if t2:
                date_txt = clean_text(t2.get("datetime") or t2.get_text() or "")

        # Body: strip obvious chrome
        if container:
            # Remove common chrome sections within article
            for bad in container.select("nav, aside, .share, .social, .tags, .post-meta, .wp-block-buttons"):
                bad.decompose()
            body = clean_text(container.get_text(separator=" ", strip=True))
        else:
            body = clean_text(soup.get_text(separator=" ", strip=True))

        body = body[:MAX_ARTICLE_BODY_CHARS]
        return (date_txt, body)
    except Exception:
        return ("", "")

# ---------------------------
# Core: Process a single URL (Optimized with budget + UI expansion)
# ---------------------------

def process_url(driver, url: str) -> dict:
    """
    Scrape a single site: About text, leadership roster and first-page news.

    Steps (each gated on the per-site budget of SITE_TIME_BUDGET seconds):
      1. Load the landing page.
      2. Follow the About link, if one is found, and extract About text.
      3. Reload the landing page, choose a leadership page and extract leaders;
         when no page is chosen, run the generic extractor once as a fallback.
      4. Reload the landing page, choose a news archive and fetch article
         bodies from it, stopping at MAX_NEWS_ARTICLES.

    Side effect: sets the module-level SITE_DEADLINE so helpers can check the
    remaining time for this site.

    Args:
        driver: Selenium WebDriver shared across sites.
        url: Site URL; passed through normalize_url before use.

    Returns:
        dict with keys url, about_url, about_text, leaders_url, leaders,
        leaders_summary, status, news_url, news_articles, debug.
        ``status`` starts as "ok" and collects "; "-separated suffixes such as
        "no_leaders_found" or "leaders_page_timeout"; it is replaced by
        "timeout", "budget_exhausted", "webdriver_error: ..." or "error: ..."
        when the run is cut short. Exceptions raised by the scraping steps are
        reported through ``status`` rather than propagated.
    """
    global SITE_DEADLINE
    start_ts = time.time()
    # Module-level deadline so helper functions can check remaining time.
    SITE_DEADLINE = start_ts + SITE_TIME_BUDGET
    url = normalize_url(url)
    result = {
        "url": url,
        "about_url": "",
        "about_text": "",
        "leaders_url": "",
        "leaders": [],
        "leaders_summary": "",
        "status": "ok",
        "news_url": "",
        "news_articles": [],
        "debug": {}
    }

    def time_remaining():
        return SITE_TIME_BUDGET - (time.time() - start_ts)

    def _append_status(suffix: str) -> None:
        # Statuses are "; "-joined; strip() removes the dangling separator
        # left behind when a suffix is empty.
        result["status"] = (result["status"] + "; " + suffix).strip("; ")

    def _summarize(people) -> str:
        # "Name — Title; Name — Title" for entries that have either field.
        return "; ".join(
            [f"{normalize_field(L.get('name',''))} — {normalize_field(L.get('title',''))}" for L in people if L.get("name") or L.get("title")]
        )

    def _news_error(entry) -> None:
        # Create the debug list lazily so it only exists when something failed.
        result.setdefault("debug", {}).setdefault("news_errors", []).append(entry)

    try:
        # LANDING
        ok = safe_get(driver, url)
        if not ok:
            result["status"] = "timeout"
            return result
        wait_for_body(driver)
        dismiss_cookie_banners(driver)

        # ABOUT (quick)
        if time_remaining() <= 0:
            result["status"] = "budget_exhausted"
            return result

        about_link = find_about_link(driver)
        about_reached = False
        if about_link:
            about_reached = bool(safe_get(driver, about_link))
            if about_reached:
                wait_for_body(driver)
                dismiss_cookie_banners(driver)
            else:
                # Previously this path hit an unbound `path` variable and
                # aborted the whole site; now it degrades to "text from the
                # page that is loaded, no about_url".
                _append_status("about_page_timeout")

        about_soup = get_soup(driver)
        result["about_text"] = extract_about_from_soup(about_soup)
        if about_reached:
            path = urlparse(driver.current_url).path.lower()
            in_bad_context = any(b in path for b in BAD_CONTEXT_PATHS) and path not in PREFERRED_ABOUT_PATHS
            # Keep the text either way, but do not advertise a bad-context URL.
            result["about_url"] = "" if in_bad_context else driver.current_url
        else:
            result["about_url"] = ""

        if time_remaining() <= 0:
            result["status"] = "budget_exhausted"
            return result

        # LEADERSHIP
        # Use remaining budget efficiently—go straight to leadership discovery
        ok = safe_get(driver, url)
        if not ok:
            _append_status("landing_reload_timeout")
        wait_for_body(driver)
        dismiss_cookie_banners(driver)

        leaders_link = choose_leadership_page(driver, url, result["debug"])
        if leaders_link and time_remaining() > 0:
            try:
                # Enable images temporarily (optional; alt attributes are available without loading)
                set_image_loading(driver, True)

                ok = safe_get(driver, leaders_link)
                if not ok:
                    _append_status("leaders_page_timeout")
                wait_for_body(driver)
                dismiss_cookie_banners(driver)
                wait_for_content(driver, PERSON_CARD_SELECTORS + [".slick-slide", ".swiper-slide", ".owl-item"], timeout=6)

                # Expand tabs, load more, accordions, pagination
                expand_leadership_ui(driver)

                prime_page_for_extraction(driver)
                deep_lazy_scroll(driver, cycles=2, pause=0.5)

                parsed = urlparse(driver.current_url)
                if (not (parsed.path or "").strip("/")) or parsed.fragment:
                    # Ended up on the site root or an in-page anchor: not a
                    # dedicated leadership page.
                    _append_status("no_leaders_found")
                    result["leaders_url"] = ""
                    result["leaders"] = []
                    result["leaders_summary"] = ""
                else:
                    result["leaders_url"] = driver.current_url

                    leaders, status_suffix, dbg = extract_leaders_generic(driver, result["leaders_url"] or url)
                    result["debug"]["extract"] = dbg
                    leaders = filter_to_likely_leaders(leaders, result["leaders_url"] or url)
                    leaders = dedup_leaders(leaders)

                    if not validate_leaders(leaders):
                        _append_status("no_leaders_found")
                        result["leaders"] = []
                        result["leaders_summary"] = ""
                    else:
                        result["leaders"] = leaders
                        result["leaders_summary"] = _summarize(leaders)
                        _append_status(status_suffix)

            except Exception as e:
                _append_status(f"leaders_parse_error: {e}")
            finally:
                # Disable images again for speed on next site/page
                set_image_loading(driver, False)

        else:
            # No leadership page chosen (or no budget left to visit it):
            # run the generic extractor once against the site URL.
            leaders, status_suffix, dbg = extract_leaders_generic(driver, url)
            result["debug"]["extract"] = dbg
            leaders = filter_to_likely_leaders(leaders, url)
            leaders = dedup_leaders(leaders)

            if not validate_leaders(leaders):
                _append_status("no_leaders_found; no_leadership_link_found")
                result["leaders"] = []
                result["leaders_summary"] = ""
            else:
                result["leaders"] = leaders
                result["leaders_summary"] = _summarize(leaders)
                _append_status(status_suffix + "; no_leadership_link_found")

        # ---------------------------
        # NEWS (first page only)
        # ---------------------------
        if time_remaining() <= 0:
            _append_status("budget_exhausted")
            return result

        try:
            ok = safe_get(driver, url)
            if not ok:
                _news_error({"url": url, "error": "landing_timeout_for_news"})
            wait_for_body(driver)
            dismiss_cookie_banners(driver)
            news_link = choose_news_page(driver, url, result["debug"])
            result["news_url"] = news_link or ""

            news_articles = []
            if news_link and time_remaining() > 0:
                ok = safe_get(driver, news_link)
                if not ok:
                    _news_error({"url": news_link, "error": "news_page_timeout"})
                wait_for_body(driver)
                dismiss_cookie_banners(driver)
                prime_page_for_extraction(driver)

                soup_news = get_soup(driver)
                cards = _extract_article_cards_from_archive(soup_news, news_link)
                for card in cards:
                    if time_remaining() <= 0:
                        break
                    # Skip non-http(s) links, in-page fragments and PDFs
                    parsed = urlparse(card["url"])
                    if parsed.scheme not in ("http","https") or parsed.fragment or card["url"].lower().endswith(".pdf"):
                        continue
                    # Fetch body; fall back to the archive card's own fields
                    dtxt, body = _extract_article_body(driver, card["url"])
                    article = {
                        "url": card["url"],
                        "title": card["title"],
                        "date": dtxt or card["date"],
                        "body": body or card.get("excerpt","")
                    }
                    news_articles.append(article)
                    if len(news_articles) >= MAX_NEWS_ARTICLES:
                        break

            result["news_articles"] = news_articles
        except Exception as e:
            # News is optional: record the failure and return no news at all.
            _news_error(str(e))
            result["news_url"] = ""
            result["news_articles"] = []

    except TimeoutException:
        result["status"] = "timeout"
    except WebDriverException as e:
        result["status"] = f"webdriver_error: {e}"
    except Exception as e:
        result["status"] = f"error: {e}"

    return result

# ---------------------------
# Runner (Optimized: reuse driver)
# ---------------------------

def run_all(URLS):
    """
    Process every URL in ``URLS`` with a shared, periodically recycled driver.

    A driver is built lazily before a site is processed and is quit after
    every RESTART_DRIVER_EVERY sites (default 10) so long-lived browser state
    does not accumulate. Cookies are cleared after each site.

    Returns:
        (results, leader_rows): one summary dict per site, and one flat dict
        per extracted leader.
    """
    site_rows = []
    people_rows = []
    cadence = globals().get('RESTART_DRIVER_EVERY', 10)
    browser = None

    def _shutdown(drv):
        # Quitting is best-effort; a dead browser must not stop the run.
        try:
            drv.quit()
        except Exception:
            pass

    try:
        for position, raw_url in enumerate(URLS, start=1):
            # (Re)build the browser at the start of each chunk.
            if browser is None:
                browser = build_driver(headless=HEADLESS)

            began = time.time()
            print(f"\nProcessing: {raw_url}")

            try:
                site = process_url(browser, raw_url)
            finally:
                try:
                    browser.delete_all_cookies()
                except Exception:
                    pass

            roster = site.get("leaders", [])
            summary = "; ".join(
                f"{normalize_field(person.get('name',''))} — {normalize_field(person.get('title',''))}"
                for person in roster
                if person.get("name") or person.get("title")
            )

            site_rows.append({
                "url": site.get("url",""),
                "about_url": site.get("about_url",""),
                "about_text": site.get("about_text",""),
                "leaders_url": site.get("leaders_url",""),
                "leaders_summary": summary,
                "status": site.get("status",""),
                "news_url": site.get("news_url",""),
                "news_res" : site.get("news_articles", "")
            })

            people_rows.extend(
                {
                    "site_url": site.get("url",""),
                    "leaders_url": site.get("leaders_url",""),
                    "name": normalize_field(person.get("name","")),
                    "title": normalize_field(person.get("title","")),
                    "profile_url": normalize_field(person.get("profile_url","")),
                    "image_url": normalize_field(person.get("image_url","")),
                    "bio": normalize_field(person.get("bio","")),
                }
                for person in roster
            )

            finished = time.time()
            print(f"Done: {raw_url} in {finished - began:.2f}s")

            # Recycle the browser at the end of each chunk; an unusable
            # cadence value (e.g. 0) simply disables recycling.
            try:
                recycle_now = position % cadence == 0
            except Exception:
                recycle_now = False
            if recycle_now:
                _shutdown(browser)
                browser = None
    finally:
        if browser is not None:
            _shutdown(browser)

    return site_rows, people_rows




In [36]:
#----------------------------
#    Predictor Code
#----------------------------
import re
import math
import json
import argparse
from typing import Dict, List, Tuple

# ---------- News scoring ----------
# Multiplied into every pattern weight that score_news_articles finds in
# article text.
NEWS_WEIGHT = 0.45          # news evidence is weaker than About
# NOTE(review): not referenced in the part of score_news_articles reviewed
# here — confirm where this cap is enforced.
NEWS_TOTAL_CAP = 2.5        # absolute cap on |z_news_own| + |z_news_op|

# About/news family-term multiplier settings
# NOTE(review): ABOUT_NEWS_MULT, FAMILY_ARTICLE_MULT, FAMILY_ARTICLE_MULT_CAP
# and FAMILY_ARTICLE_RX have no visible use in the code reviewed here —
# confirm their consumers.
ABOUT_NEWS_MULT = 1.2       # fallback boost for news on About pages
FAMILY_ARTICLE_MULT = 0.6   # per-article multiplier increment when family-term appears
FAMILY_ARTICLE_MULT_CAP = 2.0
# "family" followed, after at most six intervening words, by
# owned / operated / run / led.
FAMILY_ARTICLE_RX = re.compile(r"\bfamily\b(?:\W+\w+){0,6}\W+\b(owned|operated|run|led)\b", re.I)
# Upper bound on articles: process_url stops fetching article bodies once it
# has this many, and score_news_articles only scores the first this-many.
# process_url lives in the scraper cell above, so this cell must have been
# executed before process_url is called.
MAX_NEWS_ARTICLES = 50
PER_ARTICLE_CAP = 0.8       # cap contribution from any single article (tighter)
# Applied by score_news_articles to every article after the first one seen
# from the same domain.
DOMAIN_DEDUPE_FACTOR = 0.5  # weight for subsequent articles from same domain
# Extra multiplier (on top of NEWS_WEIGHT) for OWN_POS matches in news text.
OWN_NEWS_MULT = 0.4         # reduce ownership-specific news weights slightly


# ---------- Configurable priors ----------
# Keep ownership low unless clear signals exist
# Priors are expressed as logits (z); sigmoid(z) gives the probability.
OWNERSHIP_PRIOR_Z = -2.2   # ~0.10 baseline with no evidence
OPERATION_PRIOR_Z = -0.4   # unused when gate forces op=0, but applied after gate (sigmoid ~0.40)
# NOTE(review): the two caps below are not used in the code reviewed here;
# presumably they bound the summed role / board contributions — confirm.
ROLE_CAP_TOTAL = 4.0
BOARD_CAP_TOTAL = 1.2

# ---------- News-only operation patterns (succession) ----------
# Succession indicators remain strong
# (compiled pattern, weight) pairs, all case-insensitive.
# score_news_articles inspects the pattern *source text* (rx.pattern) to tell
# succession patterns from generic ones, so the sources below are part of the
# behaviour and must not be reworded casually.
NEWS_OP_POS = [
    (re.compile(source, re.I), weight)
    for source, weight in (
        # Explicit succession / lineage wording (strong)
        (r'\bsucceeds?\s+(his|her|their)\s+(father|mother)\b', 3.0),
        (r'\bson\s+of\s+the\s+founder\b', 2.8),
        (r'\bdaughter\s+of\s+the\s+founder\b', 2.8),
        (r'\bsecond[-\s]?generation\b', 2.2),
        (r'\bthird[-\s]?generation\b', 2.4),
        (r'\bfamily\s+legacy\b', 1.6),
        # Generic appointment / award wording (deliberately weak)
        (r'\b(named|promoted|appointed|elected)\b', 0.15),
        (r'\b(nominated|honor(ed|s)?|awarded)\b', 0.15),
    )
]

# ---------- Ownership-positive patterns ----------
# (compiled pattern, weight) pairs signalling family ownership.
OWN_POS = [
    (re.compile(r'\bfamily[-\s]?owned\b', re.I), 3.2),
    (re.compile(r'\bfamily[-\s]?business\b', re.I), 2.0),
    (re.compile(r'\bfamily\s+member\s+since\b', re.I), 2.0),
    (re.compile(r'\bin\s+family\s+ownership\b', re.I), 2.6),
    # Case-sensitive on purpose: the captured word must be capitalised.
    (re.compile(r'\bthe\s+([A-Z][a-z]+)\s+family\b'), 2.2),
    (re.compile(r'\b(?:third|fourth|fifth|multi)[-\s]?generation\b', re.I), 2.4),
    (re.compile(r'\bprivately\s+owned\s+by\s+the\s+[A-Z][a-z]+\s+family\b', re.I), 2.8),
    # Co-occurrence: "family" near "owned"
    (re.compile(r'\bfamily\b(?:\W+\w+){0,6}\W+\bowned\b', re.I), 2.6),
    # Spelled-out / numeric "X generations"
    (re.compile(r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\s+generations?\b', re.I), 2.2),
    (re.compile(r'\b\d{1,2}\s+generations?\b', re.I), 2.2),
    # Broader ownership-ish mentions sometimes found in press/company history.
    # The alternatives are grouped so that both word boundaries apply to each
    # of them; ungrouped, `\b` only guarded the start of the first alternative
    # and the end of the last one.
    (re.compile(r'\b(?:century[-\s]?old|five\s+generations|family\s+legacy)\b', re.I), 1.6),
]

# ---------- Operation-positive patterns (added only after gate passes) ----------
# (compiled pattern, weight) pairs signalling family operation.
# score_news_articles inspects the pattern source text (rx.pattern) for words
# such as "named" or "promoted" to decide which entries are "generic".
OP_POS = [
    (re.compile(r'\bfamily[-\s]?operated\b', re.I), 3.0),
    (re.compile(r'\bfamily\s+member\s+since\b', re.I), 2.0),
    (re.compile(r'\bfamily[-\s]?run\b', re.I), 2.8),
    # Was spelled "ower", which can never match "owner-operated".
    (re.compile(r'\bowner[-\s]?operated\b', re.I), 2.2),
    (re.compile(r'\bfounder[-\s]?led\b', re.I), 1.4),
    (re.compile(r'\bfamily\b(?:\W+\w+){0,6}\W+\b(operated|run|led)\b', re.I), 2.6),
    # Broader leadership/appointment language — lowered weight
    (re.compile(r'\b(promoted|appointed|named|served\s+as)\b', re.I), 0.15),
    # "named ... <title>" within 120 characters: low weight
    (re.compile(r'\bnamed\b[\s\S]{0,120}?\b(president|ceo|chair|vice president|director|owner)\b', re.I), 0.25),
]

# ---------- Negatives (private-firm relevant; your list preserved) ----------
# (compiled pattern, negative weight) pairs, all case-insensitive: wording
# that points to an outside owner or operator.
NEG = [
    (re.compile(source, re.I), penalty)
    for source, penalty in (
        (r'\bsubsidiary\s+of\b', -2.6),
        (r'\bdivision\s+of\b', -2.2),
        (r'\bacquired\s+by\b', -2.2),
        (r'\bwholly\s+owned\s+by\b', -2.6),
        (r'\bportfolio\s+company\b', -2.5),
        (r'\bpart\s+of\b', -1.6),
        (r'\bmanaged\s+by\b', -1.4),
        (r'\bfranchisee\s+of\b', -1.0),
    )
]

# ---------- Anti false-positive ----------
# Case-insensitive patterns for marketing uses of "family" that say nothing
# about who owns or runs the firm.
ANTI_FP = [
    re.compile(source, re.I)
    for source in (
        r'customers?\s+like\s+family',
        r'join\s+our\s+family',
        r'\bfamily\s+values\b',
    )
]

# ---------- Leadership URL keywords ----------
# Lowercase keywords expected in the URL of a leadership/people page.
# NOTE(review): no consumer of this list is visible in the code reviewed
# here — confirm how it is matched (substring vs. path segment).
LEADERSHIP_URL_KEYWORDS = ["leadership", "people", "team", "management", "executives", "owners"]

# ---------- News date/name helpers and token-based role matching ----------

from datetime import datetime, timezone

def parse_date_best_effort(dt_str: str) -> datetime | None:
    """
    Accepts ISO-like or textual month formats; returns naive UTC datetime or None.
    We avoid external libraries; handle 'YYYY-MM-DD', 'Month DD, YYYY', etc.
    """
    s = (dt_str or "").strip()
    if not s:
        return None
    # ISO first
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M", "%Y/%m/%d"):
        try:
            return datetime.strptime(s, fmt)
        except Exception:
            pass
    # Textual month
    for fmt in ("%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(s, fmt)
        except Exception:
            pass
    return None

def news_recency_factor(dt: datetime | None) -> float:
    """
    Scale contribution by age: <= 730 days -> 1.0; else 0.6.
    """
    if not dt:
        return 1.0
    try:
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        age_days = max(0, (now - dt).days)
        return 1.0 if age_days <= 730 else 0.6
    except Exception:
        return 1.0

def extract_all_person_names_from_news(articles: List[Dict]) -> List[str]:
    """
    Collect candidate person names from article titles and bodies.

    A candidate is a run of two or more capitalised words (e.g. "Jane Doe")
    that also passes ``is_valid_person_name``. Duplicates are kept, so the
    result can be used for frequency counts.

    Args:
        articles: Dicts with optional "title" and "body" entries; a missing
            or None value is treated as empty text. None is treated as no
            articles.

    Returns:
        Names in order of appearance (title text before body text).
    """
    rx_name = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b")  # simple proper case
    names = []
    for a in articles or []:
        # `or ""` also covers keys that are present but hold None.
        text = " ".join([a.get("title") or "", a.get("body") or ""])
        for m in rx_name.finditer(text):
            cand = m.group(1).strip()
            if is_valid_person_name(cand):
                names.append(cand)
    return names

def classify_news_page(articles: List[Dict]) -> str:
    """
    Label a set of news articles as "lineage" or "corporate".

    The label is "lineage" when at least two of the keywords founded / since /
    generation / family occur somewhere in the combined text, or when one of
    them occurs and some surname is mentioned three or more times across the
    extracted person names. Otherwise it is "corporate".

    Args:
        articles: Dicts with optional "title" and "body" entries; missing or
            None values are treated as empty text.

    Returns:
        "lineage" or "corporate".
    """
    articles = articles or []
    names = extract_all_person_names_from_news(articles)

    # Single pass instead of list.count() per distinct surname.
    surname_counts: Dict[str, int] = {}
    for n in names:
        if not n:
            continue
        surname = last_name_of(n)
        surname_counts[surname] = surname_counts.get(surname, 0) + 1
    repeated = max(surname_counts.values(), default=0)

    joined = " ".join([(a.get("title") or "") + " " + (a.get("body") or "") for a in articles])
    lineage_terms = sum(
        1
        for kw in ["founded", "since", "generation", "family"]
        if re.search(rf"\b{kw}\b", joined, re.I)
    )

    # Strict on purpose: several lineage keywords, or one keyword backed by a
    # strongly repeated surname.
    if lineage_terms >= 2 or (lineage_terms >= 1 and repeated >= 3):
        return "lineage"
    return "corporate"

def norm_role(role_text: str) -> str:
    """
    Canonicalise a role/title for token and phrase matching.

    Lowercases the text, turns every character other than a-z, 0-9,
    whitespace, '-' and '+' into a space, then collapses whitespace runs to
    single spaces and trims the ends. None or empty input gives "".
    """
    lowered = (role_text or "").lower()
    # Punctuation becomes a separator rather than vanishing ("ceo/owner").
    separated = re.sub(r'[^a-z0-9\s\-+]', ' ', lowered)
    return " ".join(separated.split())

# Exact tokens (standalone words) -> weights
# role_weight compares these against the whitespace-split tokens of the
# normalised role and returns the weight of the FIRST key (in the order
# written here) that is present — not the largest one.
# Keys containing a space are skipped by role_weight, so the
# 'vice president' entry below has no effect; the phrase table covers it.
ROLE_TOKENS: Dict[str, float] = {
    'owner': 2.2, 'co-owner': 2.0,
    'founder': 1.5, 'co-founder': 1.4,
    'president': 1.8, 'vice president' : 1.0,
    'ceo': 1.6, 'coo': 1.2, 'cfo': 0.9,   # acronyms as tokens only
    'principal': 1.0, 'partner': 1.0,
    'chairman': 0.8, 'chair': 0.8,
    'director': 0.5,                      # generic director modest
}

# Exact phrases (contiguous substrings) -> weights
# role_weight checks these before the token table, longest phrase first, using
# a plain substring test on the normalised role.
ROLE_PHRASES: Dict[str, float] = {
    'vice president': 1.0,
    'chief executive officer': 1.6,
    'chief operating officer': 1.2,
    'chief financial officer': 0.9,
    'managing director': 1.5,
    'general manager': 1.4,
    'board member': 0.4,
}

# Blocklist: not leadership for clustering/weights
# Any of these appearing as a token forces role_weight to return 0.0.
ROLE_BLOCKLIST = {'coordinator', 'assistant', 'specialist', 'administrator', 'ambassador', 'intern'}

def role_weight(role_text: str) -> float:
    """
    Leadership weight of a role/title string.

    Resolution order on the normalised role (see ``norm_role``):
      1. any ROLE_BLOCKLIST word present as a token -> 0.0;
      2. the longest ROLE_PHRASES key found as a substring -> its weight;
      3. the first single-word ROLE_TOKENS key (in table order) present as a
         token -> its weight;
      4. otherwise 0.0.
    """
    normalized = norm_role(role_text)
    words = set(normalized.split())

    if not words.isdisjoint(ROLE_BLOCKLIST):
        return 0.0

    # Longest phrase wins so multi-word titles are not shadowed by shorter ones.
    phrases_longest_first = sorted(ROLE_PHRASES.items(), key=lambda kv: -len(kv[0]))
    for phrase, weight in phrases_longest_first:
        if phrase in normalized:
            return weight

    # Multi-word keys in the token table are ignored here by design.
    return next(
        (
            weight
            for token, weight in ROLE_TOKENS.items()
            if ' ' not in token and token in words
        ),
        0.0,
    )

# ---------- Helpers ----------
def sigmoid(z: float) -> float:
    """
    Logistic function 1 / (1 + e^-z), mapping a logit to a probability.

    Very negative inputs, for which e^-z exceeds the float range, return 0.0
    instead of raising OverflowError.
    """
    try:
        return 1 / (1 + math.exp(-z))
    except OverflowError:
        # exp(-z) only overflows when z is hugely negative; the limit is 0.
        return 0.0

def clean_text(t: str) -> str:
    """
    Return ``t`` with leading and trailing whitespace removed; None or empty
    input gives "".

    NOTE(review): the scraper cell above also calls ``clean_text`` (for
    example in ``_extract_article_body``). If that cell defines its own
    version, running this cell replaces it for all later calls — confirm that
    is intended.
    """
    if not t:
        return ""
    return t.strip()

def parse_leaders(summary: str) -> List[Tuple[str, str]]:
    """
    Split 'Name — Role; Name — Role; ...' into a list of (name, role).

    Entries are separated by ';'. Within an entry the name/role separator is
    looked for in this order, and only its first occurrence splits the entry:
      1. a spaced em dash (the separator used when summaries are built),
      2. a spaced en dash or hyphen,
      3. a comma.
    Everything after the separator is kept as the role, so a role such as
    "VP - Sales" is no longer cut at its inner dash. Entries with no
    separator yield an empty role.

    Args:
        summary: Summary string; None or "" gives an empty list.

    Returns:
        (name, role) tuples in input order, both stripped.
    """
    if not summary:
        return []

    separators = (r'\s+—\s+', r'\s+[\-–]\s+', r'\s*,\s*')
    pairs: List[Tuple[str, str]] = []
    for raw_entry in re.split(r';\s*', summary):
        entry = raw_entry.strip()
        if not entry:
            continue
        parts = [entry]
        for sep in separators:
            parts = re.split(sep, entry, maxsplit=1)
            if len(parts) == 2:
                break
        name = parts[0].strip()
        role = parts[1].strip() if len(parts) > 1 else ""
        pairs.append((name, role))
    return pairs

def is_valid_person_name(name: str) -> bool:
    """
    Cheap plausibility check for a personal name.

    Accepts text that contains no digit and, once parentheses and ampersands
    are removed, has at least two words whose first two both start with an
    uppercase letter.
    """
    if not name or re.search(r'\d', name):
        return False
    words = re.sub(r'[()&]', '', name).strip().split()
    return len(words) >= 2 and all(word[0].isupper() for word in words[:2])

def last_name_of(name: str) -> str:
    """
    Lowercased last word of ``name`` with everything except ASCII letters and
    hyphens removed.

    Returns "" for None, empty, or whitespace-only input (whitespace-only
    input used to raise IndexError). Non-ASCII letters are dropped by the
    filter, so accented surnames come back shortened.
    """
    words = (name or "").split()
    if not words:
        return ""
    return re.sub(r'[^A-Za-z\-]', '', words[-1]).lower()

# ---------- Leadership-only weighted surname clustering ----------
def leadership_cluster(leaders_list: List[Tuple[str, str]]) -> Tuple[float, str, Dict[str, float], float, int, float, bool]:
    """
    Measure how strongly leadership weight concentrates on one surname.

    Only (name, role) pairs with a plausible person name, a positive role
    weight and a non-empty surname are counted. A role that mentions "family"
    has its weight multiplied by 1.6 and sets the family-role flag (the flag
    is set even when the resulting weight is zero and the pair is skipped).

    Returns a 7-tuple:
        (share of total weight held by the top surname,
         top surname,
         weight per surname,
         total weight,
         number of counted leaders with the top surname,
         largest single weight under the top surname,
         whether any role mentioned "family").
    With nothing counted the tuple is (0.0, "", {...}, total, 0, 0.0, flag).
    """
    weight_by_surname: Dict[str, float] = {}
    heads_by_surname: Dict[str, int] = {}
    peak_by_surname: Dict[str, float] = {}
    weight_sum = 0.0
    saw_family_role = False

    for person, position in leaders_list:
        if not is_valid_person_name(person):
            continue

        weight = role_weight(position)

        # An explicit "family" / "family member" role is a stronger signal.
        try:
            mentions_family = bool(position and re.search(r'family\s+member|family', position, re.I))
        except Exception:
            mentions_family = False
        if mentions_family:
            saw_family_role = True
            weight = weight * 1.6

        if weight <= 0:
            continue
        surname = last_name_of(person)
        if not surname:
            continue

        weight_by_surname[surname] = weight_by_surname.get(surname, 0.0) + weight
        heads_by_surname[surname] = heads_by_surname.get(surname, 0) + 1
        peak_by_surname[surname] = max(peak_by_surname.get(surname, 0.0), weight)
        weight_sum += weight

    if weight_sum <= 0 or not weight_by_surname:
        return (0.0, "", weight_by_surname, weight_sum, 0, 0.0, saw_family_role)

    # max() keeps the first surname encountered when weights tie.
    leading_surname, leading_weight = max(weight_by_surname.items(), key=lambda kv: kv[1])
    return (
        leading_weight / weight_sum,
        leading_surname,
        weight_by_surname,
        weight_sum,
        heads_by_surname.get(leading_surname, 0),
        peak_by_surname.get(leading_surname, 0.0),
        saw_family_role,
    )

# ---------- Main scorer ----------

def score_news_articles(articles: List[Dict], page_type: str, leader_full_set: set | None = None) -> Tuple[float, float, List[Tuple[str, float]], List[Tuple[str, float]], List[Tuple[str, float]]]:
    """
    Return (z_news_own, z_news_op, evidence_own, evidence_op, evidence_neg)
    Applies NEWS_WEIGHT and recency factor per-article.
    Caps total contribution.
    Implements stricter proximity and corroboration for generic matches.
    Now requires that articles provide a leader surname mention and a family-term before counting appointment/award matches.
    Adds per-article caps and per-domain deduplication to avoid press-kit inflation.
    Ownership news now requires corroboration across >=2 distinct domains to count.
    """
    z_no, z_np = 0.0, 0.0
    evidence_own: List[Tuple[str, float]] = []
    evidence_op:  List[Tuple[str, float]] = []
    evidence_neg: List[Tuple[str, float]] = []

    generic_entries: List[Tuple[int, float, str, float]] = []  # (article_idx, w_eff, snippet, domain_factor)
    seen_domains: Dict[str, int] = {}

    # ownership accumulation by domain (deferred credit)
    ownership_by_domain: Dict[str, float] = {}
    ownership_snips_by_domain: Dict[str, List[str]] = {}

    # derive leader surnames from leader_full_set (if provided)
    leader_surnames = set()
    try:
        if leader_full_set:
            for n in leader_full_set:
                if not n:
                    continue
                # leader_full_set entries are expected as 'first last' lowercase; extract last token
                leader_surnames.add(last_name_of(n))
    except Exception:
        leader_surnames = set()

    for ai, a in enumerate((articles or [])[:MAX_NEWS_ARTICLES]):
        text = (a.get("body","") or "") + " " + (a.get("title","") or "")
        dt = parse_date_best_effort(a.get("date",""))
        rec = news_recency_factor(dt)

        # domain dedupe factor
        domain = ""
        try:
            url = (a.get("url") or a.get("source") or "")
            m = re.search(r'https?://([^/]+)', url)
            if m:
                domain = m.group(1).lower()
        except Exception:
            domain = ""
        seen_count = seen_domains.get(domain, 0)
        domain_factor = 1.0 if seen_count == 0 else DOMAIN_DEDUPE_FACTOR
        seen_domains[domain] = seen_count + 1

        # per-article accumulators
        per_article_z_no = 0.0
        per_article_z_np = 0.0
        per_article_own_snips: List[str] = []

        # determine if this article explicitly mentions any leader surname
        has_leader_surname = True if not leader_surnames else False
        if leader_surnames:
            for ln in leader_surnames:
                try:
                    if re.search(r"\b" + re.escape(ln) + r"\b", text, re.I):
                        has_leader_surname = True
                        break
                except re.error:
                    continue

        # require family-term anywhere in article for per-article qualification
        family_terms = ("family", "founder", "generation", "son", "daughter", "succeeds", "legacy", "succession", "founding")
        has_family_term = False
        for term in family_terms:
            try:
                if re.search(r"\b" + re.escape(term) + r"\b", text, re.I):
                    has_family_term = True
                    break
            except re.error:
                continue

        # normalize article person full-names (first + last)
        def _norm_full(name: str) -> str:
            if not name:
                return ""
            toks = [t for t in re.sub(r"[^A-Za-z\\s]", " ", name).split() if t]
            if len(toks) >= 2:
                return f"{toks[0].lower()} {toks[-1].lower()}"
            return ""
        article_names = [ _norm_full(n) for n in extract_all_person_names_from_news([a]) ]
        article_full_set = {n for n in article_names if n}

        # helper: leader proximity and family-term proximity
        def _leader_near_match(span_start, span_end, leaders, window=200):
            if not leaders:
                return False
            for leader in leaders:
                try:
                    for lm in re.finditer(r"\\b" + re.escape(leader) + r"\\b", text.lower()):
                        if lm.start() >= max(0, span_start - window) and lm.start() <= span_end + window:
                            return True
                except re.error:
                    continue
            return False
        def _term_near_match(span_start, span_end, terms, window=200):
            if not terms:
                return False
            for term in terms:
                try:
                    for tm in re.finditer(r"\\b" + re.escape(term) + r"\\b", text, re.I):
                        if tm.start() >= max(0, span_start - window) and tm.start() <= span_end + window:
                            return True
                except re.error:
                    continue
            return False

        # optional debug: show article preview and which patterns match
        if False:
            try:
                print("ARTICLE PREVIEW:", text[:300])
                for rx, w in OWN_POS + OP_POS + NEWS_OP_POS + NEG:
                    m = rx.search(text)
                    if m:
                        print("MATCH:", rx.pattern, "->", (m.group(0) or ""), "w=", w)
            except Exception:
                pass

        # Ownership positives (always considered) — reduced by OWN_NEWS_MULT and accumulated per-article
        for rx, w in OWN_POS:
            for m in rx.finditer(text):
                w_eff = w * NEWS_WEIGHT * OWN_NEWS_MULT * rec
                per_article_z_no += w_eff
                snippet = text[max(0, m.start()-60): m.end()+60]
                per_article_own_snips.append(snippet.strip())

        # If article does not mention a leader surname or lacks family-term, skip operation/appointment patterns to avoid false positives
        if not (has_leader_surname and has_family_term):
            # defer ownership credit to domain aggregation (store per-domain)
            adj_own = min(per_article_z_no, PER_ARTICLE_CAP) * domain_factor
            ownership_by_domain[domain] = ownership_by_domain.get(domain, 0.0) + adj_own
            if per_article_own_snips:
                ownership_snips_by_domain.setdefault(domain, []).extend(per_article_own_snips)
            continue

        # Operation positives (generic) — only processed if article mentions leader surname and family-term
        generic_keywords = ("named", "promoted", "appointed", "served", "nominated", "honor", "awarded", "elected")
        for rx, w in OP_POS:
            for m in rx.finditer(text):
                is_generic = any(k in rx.pattern for k in generic_keywords)
                w_eff = w * NEWS_WEIGHT * rec
                snippet = text[max(0, m.start()-60): m.end()+60]

                # For generic appointment/award patterns, require full-name proximity and family-term proximity
                if is_generic and page_type != "lineage" and leader_full_set:
                    if not (_leader_near_match(m.start(), m.end(), leader_full_set) and _term_near_match(m.start(), m.end(), family_terms)):
                        continue
                    # If this is specifically the named...title pattern, require a title token nearby (tighten false positives)
                    if 'president' in rx.pattern or 'ceo' in rx.pattern or 'chair' in rx.pattern or 'vice president' in rx.pattern or 'owner' in rx.pattern:
                        window_snip = text[max(0, m.start()-120): m.end()+120]
                        if not re.search(r"\b(president|ceo|chair|owner)\b", window_snip, re.I):
                            continue
                    # Defer corroboration: store generic entries for post-article aggregation (include domain factor)
                    generic_entries.append((ai, w_eff, snippet.strip(), domain_factor))
                    per_article_z_np += w_eff
                    continue
                # Non-generic or lineage-allowed -> immediate credit (accumulate per-article)
                per_article_z_np += w_eff
                evidence_op.append((snippet.strip(), round(w_eff,3)))

        # News-only operation (succession)
        for rx, w in NEWS_OP_POS:
            for m in rx.finditer(text):
                succession_keywords = ("succeeds", "son", "daughter", "generation")
                is_succession = any(k in rx.pattern for k in succession_keywords)
                w_eff = w * NEWS_WEIGHT * rec
                snippet = text[max(0, m.start()-60): m.end()+60]

                if not is_succession and page_type != "lineage" and leader_full_set:
                    if not (_leader_near_match(m.start(), m.end(), leader_full_set) and _term_near_match(m.start(), m.end(), family_terms)):
                        continue
                    # treat broader NEWS_OP generic matches similarly (defer for corroboration)
                    generic_entries.append((ai, w_eff, snippet.strip(), domain_factor))
                    per_article_z_np += w_eff
                    continue
                per_article_z_np += w_eff
                evidence_op.append((snippet.strip(), round(w_eff,3)))

        # Negatives (still processed even when leader surname present)
        for rx, w in NEG:
            for m in rx.finditer(text):
                w_eff = w * NEWS_WEIGHT * rec
                per_article_z_no += w_eff
                per_article_z_np += w_eff
                snippet = text[max(0, m.start()-60): m.end()+60]
                evidence_neg.append((snippet.strip(), round(w_eff,3)))

        # apply per-article caps and domain factor after processing this article
        adj_own = min(per_article_z_no, PER_ARTICLE_CAP) * domain_factor
        ownership_by_domain[domain] = ownership_by_domain.get(domain, 0.0) + adj_own
        if per_article_own_snips:
            ownership_snips_by_domain.setdefault(domain, []).extend(per_article_own_snips)

        z_np += min(per_article_z_np, PER_ARTICLE_CAP) * domain_factor

    # Post-process ownership_by_domain: require corroboration across >=2 distinct domains to count ownership news credit
    distinct_domains_with_own = [d for d, v in ownership_by_domain.items() if v > 0]
    if len(distinct_domains_with_own) >= 2:
        total_own_news = sum(ownership_by_domain.values())
        z_no += total_own_news
        # add evidence per-domain
        for d in distinct_domains_with_own:
            snips = ownership_snips_by_domain.get(d, [])
            if snips:
                evidence_own.append((f"news(ownership, {d}): " + " ; ".join(snips), round(ownership_by_domain.get(d,0.0),3)))
    else:
        # no corroboration across domains -> drop news ownership credit
        pass

    # Post-process generic entries: require corroboration across >=2 distinct articles to give full credit,
    # otherwise drop single-article generic signals to reduce false positives.
    if generic_entries:
        # group by article index
        per_article_generic: Dict[int, float] = {}
        per_article_domain_factor: Dict[int, float] = {}
        per_article_snips: Dict[int, List[str]] = {}
        for ai, w_eff, snippet, df in generic_entries:
            per_article_generic[ai] = per_article_generic.get(ai, 0.0) + w_eff
            per_article_domain_factor[ai] = df
            per_article_snips.setdefault(ai, []).append(snippet)

        article_set = set(per_article_generic.keys())
        if len(article_set) >= 2:
            for ai in sorted(article_set):
                summed = per_article_generic.get(ai, 0.0)
                df = per_article_domain_factor.get(ai, 1.0)
                adj = min(summed, PER_ARTICLE_CAP) * df
                z_np += adj
                # join snippets for evidence
                evidence_op.append(("news(generic): " + " ; ".join(per_article_snips.get(ai, [])), round(adj,3)))

    # Collapse duplicate snippets within each evidence list (sum weights)
    def _collapse(evidence: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        acc: Dict[str, float] = {}
        order: List[str] = []
        for s, w in evidence:
            if s not in acc:
                order.append(s)
                acc[s] = 0.0
            acc[s] += float(w)
        return [(s, round(acc[s], 3)) for s in order]

    evidence_own = _collapse(evidence_own)
    evidence_op  = _collapse(evidence_op)
    evidence_neg = _collapse(evidence_neg)

    # Page-type caps
    if page_type == "corporate":
        # allow only small ownership nudge, roles ok but capped
        z_no = max(min(z_no, 0.3), -NEWS_TOTAL_CAP)
        z_np = max(min(z_np, 1.2), -NEWS_TOTAL_CAP)
    else:
        # lineage page: full cap
        z_no = max(min(z_no, NEWS_TOTAL_CAP), -NEWS_TOTAL_CAP)
        z_np = max(min(z_np, NEWS_TOTAL_CAP), -NEWS_TOTAL_CAP)

    return (z_no, z_np, evidence_own, evidence_op, evidence_neg)


def score_record(rec: Dict) -> Dict:
    """Score one scraped site record for family ownership and family operation.

    The function accumulates two logits -- ownership (``z_own``) and operation
    (``z_op``) -- from the About text, the news articles and the leadership
    summary, converts each to a probability with ``sigmoid`` and combines both
    probabilities with a harmonic mean.

    Args:
        rec: Record dict. Keys read here: ``url``, ``about_url``,
            ``leaders_url``, ``about_text``, ``leaders_summary`` and
            ``news_res`` (list of article dicts; ``title``/``body`` are read).
            Missing or falsy values are treated as empty.

    Returns:
        Dict with keys ``url``, ``family_owned_and_operated_prob``,
        ``ownership`` and ``operation`` (each with ``prob``, ``logit``,
        ``evidence``), ``negatives`` and ``notes``. Evidence entries are
        ``(text, weight)`` tuples.
    """

    def _norm_full(name: str) -> str:
        """Reduce a person name to lowercase "first last"; "" if fewer than two tokens."""
        if not name:
            return ""
        # Replace every character that is neither a letter nor whitespace with a space.
        toks = [t for t in re.sub(r"[^A-Za-z\s]", " ", name).split() if t]
        if len(toks) >= 2:
            return f"{toks[0].lower()} {toks[-1].lower()}"
        return ""

    url = rec.get("url")
    about_url = rec.get("about_url", "") or ""
    leaders_url = rec.get("leaders_url", "") or ""
    about_text = clean_text(rec.get("about_text", ""))
    leaders_summary = clean_text(rec.get("leaders_summary", ""))
    # Parsed leaders (used for later name cross-matching with news)
    leaders_for_news = parse_leaders(leaders_summary) if leaders_summary else []

    # ---- News articles from record (available even if empty) ----
    news_articles = rec.get("news_res", []) or []

    # Priors
    z_own = OWNERSHIP_PRIOR_Z
    z_op  = OPERATION_PRIOR_Z

    evidence_own: List[Tuple[str, float]] = []
    evidence_op:  List[Tuple[str, float]] = []
    evidence_neg: List[Tuple[str, float]] = []
    notes: List[str] = []

    # Must exist on every path: the leadership gate below reads it even when
    # there is no About page.
    about_family = False

    # -------------------------------
    # Ownership (About) first
    # -------------------------------
    about_exists = bool(about_url.strip())
    if not about_exists:
        # If you prefer a penalty instead of hard zero, swap with: z_own -= 1.0; p_own = sigmoid(z_own)
        # NOTE(review): this zero is always overwritten by the unconditional
        # sigmoid(z_own) after the news section, so the "forced to 0" note does
        # not describe the returned probability -- confirm which is intended.
        p_own = 0.0
        notes.append("about_url missing; ownership prob forced to 0")
    else:
        # If About explicitly contains a family-owned phrase, treat it as strong evidence
        about_family = bool(FAMILY_ARTICLE_RX.search(about_text))
        if about_family:
            # stronger direct boosts for explicit About claims
            ABOUT_FAMILY_OWN_BOOST = 1.2
            ABOUT_FAMILY_OP_BOOST  = 0.9
            z_own += ABOUT_FAMILY_OWN_BOOST
            z_op  += ABOUT_FAMILY_OP_BOOST
            evidence_own.append(("about-text: explicit family-owned phrase", round(ABOUT_FAMILY_OWN_BOOST,3)))
            notes.append("about: explicit family-owned phrase detected; applied direct boost")

        # Apply negatives and positives -- but if about explicitly claims family-owned,
        # avoid applying aggressive negatives from cookie/privacy boilerplate that may
        # have been captured by earlier extraction.
        if not about_family:
            for rx in ANTI_FP:
                if rx.search(about_text):
                    z_own -= 0.6
                    z_op  -= 0.6
                    evidence_neg.append(("anti-fp", -0.6))
            for rx, w in NEG:
                for m in rx.finditer(about_text):
                    z_own += w
                    z_op  += w
                    snippet = about_text[max(0, m.start()-60): m.end()+60]
                    evidence_neg.append((snippet.strip(), w))

        for rx, w in OWN_POS:
            for m in rx.finditer(about_text):
                z_own += w
                snippet = about_text[max(0, m.start()-60): m.end()+60]
                evidence_own.append((snippet.strip(), w))

        p_own = sigmoid(z_own)

    # -------------------------------------------------
    # NEWS — compute ONCE (safe defaults)
    # -------------------------------------------------
    z_news_own = 0.0
    z_news_op  = 0.0
    page_type  = "none"

    if news_articles:
        page_type = classify_news_page(news_articles)  # "lineage" or "corporate"
        # prepare leader full-name set for stricter news matching
        leader_full_set = set()
        try:
            leader_full_set = {_norm_full(n) for n, _ in leaders_for_news if n}
            leader_full_set.discard("")
        except Exception as exc:
            # Best-effort: fall back to an empty set, but leave a trace in the notes.
            leader_full_set = set()
            notes.append(f"news: leader full-name normalization failed ({exc!r}); using empty leader set")

        z_news_own, z_news_op, ev_no, ev_np, ev_nn = score_news_articles(news_articles, page_type, leader_full_set)
        # evidence
        evidence_own.extend([("news: " + s, w) for (s, w) in ev_no])
        evidence_op.extend([("news: " + s, w) for (s, w) in ev_np])
        evidence_neg.extend([("news: " + s, w) for (s, w) in ev_nn])

        # Leader-name cross-match: if news mentions parsed leaders (first+last name overlap), give a bounded boost
        try:
            if leaders_for_news:
                news_names = extract_all_person_names_from_news(news_articles)
                news_full = {_norm_full(n) for n in news_names if n}
                leader_full = {_norm_full(n) for n, _ in leaders_for_news if n}
                common = {n for n in news_full & leader_full if n}
                if common:
                    # bounded boost: single full-name match -> 0.15, two or more -> 0.3
                    boost = min(0.3, 0.15 * len(common))
                    z_news_own += boost
                    evidence_own.append((f"news-leader name match: {', '.join(sorted(common))}", round(boost,3)))
                    notes.append(f"news matched leader full-names; boost {round(boost,3)}")
        except Exception as exc:
            # Best-effort: the cross-match is optional, but record why it was skipped.
            notes.append(f"news: leader name cross-match skipped ({exc!r})")

    # Ownership: fold in news contribution (always allowed)
    # If this is an About page, check how many articles explicitly mention family-owned/operated
    try:
        # If we have an About page (about_url present), prefer About-derived family signals
        if about_exists and news_articles:
            family_hits = 0
            for a in news_articles:
                txt = (a.get('title','') or '') + ' ' + (a.get('body','') or '')
                if FAMILY_ARTICLE_RX.search(txt):
                    family_hits += 1

            # also count an explicit family-owned phrase in the About text
            # (about_family was computed above from the same text and regex)
            if about_family:
                # count the about page as an additional corroborating hit
                family_hits += 1
                # The 0.6 here is recorded as evidence only; it is not added to any logit.
                evidence_own.append(("about-text: family phrase found", 0.6))

            if family_hits > 0:
                mult = min(FAMILY_ARTICLE_MULT_CAP, 1.0 + FAMILY_ARTICLE_MULT * family_hits)
                z_news_own *= mult
                z_news_op  *= mult
                notes.append(f"about: boosted news by family-article multiplier x{mult:.2f} (hits={family_hits})")
            else:
                # fallback modest boost for About pages
                z_news_own *= ABOUT_NEWS_MULT
                z_news_op  *= ABOUT_NEWS_MULT
                notes.append(f"about: applied fallback ABOUT_NEWS_MULT x{ABOUT_NEWS_MULT}")
    except Exception as exc:
        # Best-effort: keep the unscaled news logits, but record the failure.
        notes.append(f"about: news multiplier step skipped ({exc!r})")

    z_own += z_news_own
    p_own  = sigmoid(z_own)  # recompute in case news changed it

    # -------------------------------------------------
    # Operation (HARD gate on leadership cluster)
    # -------------------------------------------------
    leaders_exists = bool(leaders_url.strip()) and any(k in leaders_url.lower() for k in LEADERSHIP_URL_KEYWORDS)
    leaders_list = parse_leaders(leaders_summary) if leaders_exists else []

    if not leaders_exists or not leaders_list:
        # Soft gate: do NOT force operation to zero. Instead apply a conservative penalty
        # when no leadership list or leaders page is missing, but allow news
        # derived signals to partially rescue operation probability (capped).
        # NOTE(review): z_op is reset to the prior here, so About-derived boosts
        # and negatives applied above are discarded -- confirm this is intended.
        penalty = -1.2  # conservative downward shift when leadership not observed
        z_op = OPERATION_PRIOR_Z + penalty
        # allow capped rescue from news-derived op signal
        z_op += min(z_news_op, 0.8)
        p_op = sigmoid(z_op)
        notes.append("operation gate: no leadership cluster; soft gate applied (penalized), news rescue capped")
        # keep previous lineage rescue note if applicable
        if page_type == "lineage" and z_news_op > 0:
            notes.append("operation gate: lineage news rescue considered")
    else:
        # Leadership-only surname clustering
        ratio, ln, counts_w, total_w, shared_count_top, max_role_w_top, family_role_present = leadership_cluster(leaders_list)
        # Diagnostic entry (weight 0.0). The two f-strings are adjacent literals
        # and concatenate into ONE text field, keeping the (text, weight) shape.
        evidence_op.append((
            f"surname cluster (leadership-only; diagnostic): top='{ln}' ratio={ratio:.2f} "
            f"shared_count={shared_count_top} total_weight={total_w:.2f} counts={counts_w}",
            0.0,
        ))

        # Gate: require a leadership surname cluster (>=2 shared entries)
        # Relaxation: accept single-entry surname if there are explicit family-role mentions
        # or if About page explicitly claims family ownership. This helps sites that list
        # one family-named leader alongside other non-family leaders but still indicate
        # family operation (e.g. 'Family Member Since 2014').
        family_cluster_hit = False
        if ln and shared_count_top >= 2:
            family_cluster_hit = True
        else:
            # allow rescue when 'family' is mentioned in leadership roles
            if family_role_present and ratio >= 0.25:
                family_cluster_hit = True
            # allow rescue when About page explicitly states family-owned and ratio modest
            if about_family and ln and ratio >= 0.20:
                family_cluster_hit = True

        if not family_cluster_hit:
            # HARD gate — company not family operated (by your definition)
            p_op = 0.0
            z_op = OPERATION_PRIOR_Z
            notes.append("operation gate: no leadership surname cluster; op prob forced to 0")

            # Optional: lineage-news rescue (capped)
            if page_type == "lineage" and z_news_op > 0:
                z_op += min(z_news_op, 0.8)  # cap rescue
                p_op  = sigmoid(z_op)
                notes.append("operation gate: lineage news rescue applied (capped)")
        else:
            # Gate passed → allow news contribution
            z_op += z_news_op

            # Role contributions (token-based; with caps)
            role_total = 0.0
            board_total = 0.0
            role_evidence: List[Tuple[str, float]] = []
            for name, role in leaders_list:
                w = role_weight(role)
                if w > 0:
                    role_evidence.append((f"{name} — {role}", w))
                    role_total += w
                    if 'board member' in norm_role(role):
                        board_total += ROLE_PHRASES['board member']  # 0.4

            # Apply caps
            z_op += min(max(0.0, role_total - board_total), ROLE_CAP_TOTAL)
            z_op += min(board_total, BOARD_CAP_TOTAL)
            evidence_op.extend(role_evidence)

            # Cluster boost based on ratio
            if ratio >= 0.5:
                z_op += 2.0
            elif 0.35 <= ratio < 0.5:
                z_op += 1.2
            elif 0.25 <= ratio < 0.35:
                z_op += 0.8

            # Negatives on leadership summary (if any)
            for rx, w in NEG:
                for m in rx.finditer(leaders_summary):
                    z_own += w
                    z_op  += w
                    snippet = leaders_summary[max(0, m.start()-60): m.end()+60]
                    evidence_neg.append((snippet.strip(), w))

            # The negatives above may have moved z_own after p_own was computed;
            # refresh it so the reported ownership prob matches its logit.
            p_own = sigmoid(z_own)
            p_op = sigmoid(z_op)

    # -------------------------------------------------
    # Final AND via Harmonic Mean (soft-zero)
    # -------------------------------------------------
    EPS = 1e-6  # numerical floor
    p_and = 2.0 / (1.0 / max(p_own, EPS) + 1.0 / max(p_op, EPS))

    return {
        "url": url,
        "family_owned_and_operated_prob": round(p_and, 3),
        "ownership": {
            "prob": round(p_own, 3),
            "logit": round(z_own, 3),
            "evidence": evidence_own,
        },
        "operation": {
            "prob": round(p_op, 3),
            "logit": round(z_op, 3),
            "evidence": evidence_op,
        },
        "negatives": evidence_neg,
        "notes": notes,
    }




In [None]:
# ---------------------------
# Run
# ---------------------------
# Input list of website names in the following format: ["site1.com", "site2.com", ...]
# Entries that lack an http:// or https:// scheme get "https://" prepended below.
sites = [
    "https://www.pyramaxbank.com",
    "https://www.1901inc.com/",
    "www.appliedtechproducts.com",
    "www.aeincorporated.com",
    "www.abrjobs.com",
    "https://www.absoluteconcrete.com",
    "https://www.acbusinessmedia.com",
    "http://www.accelgen.com",
    "http://www.accuratefab.net",
    "http://www.acieta.com",
]

# Normalise every entry to an absolute URL.
URLS = []
for site in ensure_list(sites):
    if site.startswith(("http://", "https://")):
        URLS.append(site)
    else:
        URLS.append("https://" + site)

results, leader_rows = run_all(URLS)

# Score each scraped record, then print the scored records one per line.
recs = [score_record(raw_result) for raw_result in results]

for scored in recs:
    print(scored)



Processing: https://cgschmidt.com
Done: https://cgschmidt.com in 116.73s
{'url': 'https://cgschmidt.com', 'family_owned_and_operated_prob': 0.946, 'ownership': {'prob': 0.9, 'logit': 2.2, 'evidence': [('t better than a solid partnership and honest conversations. Five generations of quality, caring, and commitment At CG Schmidt, we have b', 2.2), ('t better than a solid partnership and honest conversations. Five generations of quality, caring, and commitment At CG Schmidt, we have b', 1.6), ('news-leader name match: eric schmidt, mark lillesand, mike abuls, rick schmidt, sarah dunn', 0.3)]}, 'operation': {'prob': 0.997, 'logit': 5.93, 'evidence': [('news: 5 | News & Events Eric Schmidt, Ryan Schmidt & Josh Schmitz Named 2025 Newsmakers of the Year by The Daily Reporter CG Schmid', 0.272), ('news: team—Eric Schmidt, Ryan Schmidt and Josh Schmitz—have been named 2025 Newsmakers of the Year by The Daily Reporter. These di', 0.272), ('news: ng a strong, comp Eric Schmidt, Ryan Schmidt & Jo

In [None]:
# Save predictor results to Excel
import pandas as pd
from pathlib import Path


def serialize_evidence(ev_list):
    """Serialize a list of evidence items into one compact string.

    Items that are lists/tuples with at least two elements are rendered as
    "<first element, newlines replaced by spaces> || <second element>"; any
    other item is rendered with ``str``. Rendered items are joined with " ;; ".
    """
    out = []
    for it in ev_list:
        try:
            if isinstance(it, (list, tuple)) and len(it) >= 2:
                snippet = str(it[0]).replace("\n", " ")
                weight = it[1]
                out.append(f"{snippet} || {weight}")
            else:
                out.append(str(it))
        except Exception:
            # Best-effort fallback: keep the raw representation of the item.
            out.append(str(it))
    return " ;; ".join(out)


# Prefer `recs` (scored records) then `results` (raw) then `records` (test input)
ws = globals()
data = None
for name in ("recs", "results", "records", "out_records", "results_list"):
    if name in ws:
        data = ws[name]
        break

if data is None:
    print("No results variable found in notebook namespace. Nothing saved.")
else:
    # Normalize list of dicts to flat table (one row per record; the key order
    # below is the column order of the output sheet)
    rows = []
    for r in data:
        own = r.get("ownership", {}) or {}
        op = r.get("operation", {}) or {}
        negs = r.get("negatives") or []
        rows.append({
            "url": r.get("url"),
            "family_owned_and_operated_prob": r.get("family_owned_and_operated_prob"),
            # Ownership
            "ownership_prob": own.get("prob"),
            "ownership_logit": own.get("logit"),
            "ownership_evidence": serialize_evidence(own.get("evidence") or []),
            # Operation
            "operation_prob": op.get("prob"),
            "operation_logit": op.get("logit"),
            "operation_evidence": serialize_evidence(op.get("evidence") or []),
            # Negatives (snippet text only, weights are not exported) and notes
            "negatives": " ;; ".join([str(x[0]) if isinstance(x, (list, tuple)) else str(x) for x in negs]),
            "notes": " ;; ".join(r.get("notes") or []),
        })

    df = pd.DataFrame(rows)
    out_path = Path.cwd() / "family_owned_results.xlsx"
    try:
        df.to_excel(out_path, index=False)
        print(f"Saved results to: {out_path}")
    except Exception as e:
        print("Failed to save Excel file:", e)

Saved results to: c:\Users\VB948\OneDrive - R&R Insurance\Desktop\RR Projects\family_owned_results.xlsx
