In [3]:
import re
import time
from datetime import datetime
from urllib.parse import unquote, urljoin

import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ============== CONFIGURATION ==============
# User-editable settings for one scrape run.
# SEARCH_TERM and LOCATION are URL path segments: the search URL is built as
# https://www.yellowpages.com/{LOCATION}/{SEARCH_TERM}.
SEARCH_TERM = "general-contractors"   # Category slug, e.g., "general-contractors", "real-estate-agents", "plumbers", "lawyers"
LOCATION = "brooklyn-ny"               # City-state slug, e.g., "brooklyn-ny", "queens-ny", "manhattan-ny", "los-angeles-ca"
INDUSTRY_LABEL = "Construction"        # Value written to the "Industry" column of every output row
# Excel workbook the leads are written to.
OUTPUT_FILE = "yellow_pages_leads.xlsx"
MAX_PAGES = 5                          # Number of result pages to scrape (None = keep going until a page has no results)
FETCH_EMAILS = True                    # Set to False to skip visiting detail pages (faster but no emails)
# ===========================================


def create_driver():
    """Build the headless Chrome WebDriver used for scraping.

    The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance started with the flags listed below.
    """
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        # Present a regular desktop Chrome user agent to the site.
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)


# Loose email matcher shared by the text-based fallbacks; compiled once.
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# The loose pattern also matches retina asset names such as "logo@2x.png";
# candidates ending in one of these suffixes are not email addresses.
_ASSET_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp")

_MAILTO_PREFIX = "mailto:"


def _email_from_mailto(href):
    """Return the address part of a ``mailto:`` URL, or "" if there is none."""
    if not href or not href.lower().startswith(_MAILTO_PREFIX):
        return ""
    # Drop only the leading scheme and any "?subject=..." query, then decode
    # percent-escapes (e.g. "%40" -> "@").
    address = href[len(_MAILTO_PREFIX):].split("?", 1)[0]
    return unquote(address).strip()


def _first_email_in_text(text):
    """Return the first email-looking token in *text* that is not an asset name."""
    for match in _EMAIL_PATTERN.finditer(text):
        candidate = match.group()
        if not candidate.lower().endswith(_ASSET_SUFFIXES):
            return candidate
    return ""


def extract_email_from_detail(driver, detail_url):
    """Load a business detail page and try to find an email address on it.

    Strategies are tried in order, and the first non-empty result wins:

    1. ``href`` of the live ``a.email-business`` element, if it is a mailto link.
    2. ``a.email-business`` mailto link in the parsed page source.
    3. Any mailto link in the parsed page source.
    4. Email-looking text inside any ``<dd>`` element.
    5. Email-looking text anywhere in the page text.

    Args:
        driver: Selenium WebDriver used to load the page.
        detail_url: URL of the detail page to visit.

    Returns:
        The email address as a string, or "" when none is found or an error
        occurs (the error is printed, not raised).
    """
    try:
        driver.get(detail_url)
        time.sleep(1.5)  # fixed pause after navigation before reading the page

        # Read the "Email Business" link's href from the live DOM (no click).
        try:
            email_btn = driver.find_element(By.CSS_SELECTOR, "a.email-business")
            email = _email_from_mailto(email_btn.get_attribute("href"))
        except WebDriverException:
            # Element missing or stale: fall through to the parsed-source checks.
            email = ""
        if email:
            return email

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Prefer the dedicated email link, then any mailto link.
        for selector in ("a.email-business[href^='mailto:']", "a[href^='mailto:']"):
            link = soup.select_one(selector)
            if link:
                email = _email_from_mailto(link["href"])
                if email:
                    return email

        # Check <dd> elements before scanning the whole page.
        for dd in soup.select("dd"):
            email = _first_email_in_text(dd.get_text())
            if email:
                return email

        # Last resort: search the entire page text.
        return _first_email_in_text(soup.get_text())

    except Exception as e:
        # Best effort: a failing detail page must not abort the whole scrape.
        print(f"    Error: {e}")
    return ""


def parse_listing(listing):
    """Convert one search-result element into a lead row.

    Args:
        listing: Parsed HTML of a single result; must support ``select_one``
            (the caller passes a ``BeautifulSoup`` object).

    Returns:
        A dict with the output spreadsheet's columns, or ``None`` when the
        listing has no business name/link or cannot be parsed (the reason is
        printed). "#" is left as ``None`` and "Email Address" as "" for the
        caller to fill in; "Source" holds the detail-page URL.
    """
    try:
        name_el = listing.select_one(".business-name span")
        link_el = listing.select_one(".business-name")
        # Some results lack the usual name markup; say so explicitly instead
        # of failing with an opaque AttributeError on None.
        if name_el is None or link_el is None or not link_el.get("href"):
            print("  Skipping listing: no business name or detail link found")
            return None
        company = name_el.text.strip()

        phone_el = listing.select_one(".phones")
        phone = phone_el.text.strip() if phone_el is not None else ""

        address_parts = [
            el.text.strip()
            for el in (
                listing.select_one(".street-address"),
                listing.select_one(".locality"),
            )
            if el is not None
        ]
        full_address = " ".join(part for part in address_parts if part)

        website_el = listing.select_one(".track-visit-website")
        # A website link without an href should not discard the whole lead.
        website = website_el.get("href", "") if website_el is not None else ""

        # urljoin handles both site-relative and already-absolute hrefs.
        detail_link = urljoin("https://www.yellowpages.com", link_el["href"])

        # Build M/D/YY by hand: the "%-m"/"%-d" strftime flags are a platform
        # extension and raise ValueError where unsupported (e.g. Windows).
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "#": None,
            "Company Name": company,
            "Industry": INDUSTRY_LABEL,
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": phone,
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            "Source": detail_link,
            "Notes": "",
            "Called": "",
            "Followed Up": "",
            "Closed": ""
        }
    except Exception as e:
        # Best effort: one malformed listing must not abort the page.
        print(f"  Skipping listing: {e}")
        return None


def add_checkboxes(filepath):
    """Turn the status columns of a saved workbook into ☐/☑ dropdowns.

    Opens the workbook at *filepath*, and for each of the "Called",
    "Followed Up" and "Closed" columns found in the header row sets every
    data cell (row 2 onward) to "☐" and attaches a list validation offering
    "☐" or "☑". The workbook is then saved back to the same path.

    Args:
        filepath: Path of the Excel file to modify in place.
    """
    workbook = load_workbook(filepath)
    sheet = workbook.active

    tick_rule = DataValidation(type="list", formula1='"☐,☑"', allow_blank=True)
    sheet.add_data_validation(tick_rule)

    # Map header text -> column index using the first row.
    column_by_header = {}
    for header_cell in sheet[1]:
        column_by_header[header_cell.value] = header_cell.column

    for title in ("Called", "Followed Up", "Closed"):
        if title not in column_by_header:
            continue
        column_index = column_by_header[title]
        row_number = 2
        while row_number <= sheet.max_row:
            target = sheet.cell(row=row_number, column=column_index)
            target.value = "☐"
            tick_rule.add(target)
            row_number += 1

    workbook.save(filepath)


def _fetch_emails(driver, page_data):
    """Visit each entry's detail page and fill in "Email Address" when found."""
    total = len(page_data)
    for position, entry in enumerate(page_data, start=1):
        print(f"  [{position}/{total}] {entry['Company Name'][:40]}", end="")
        email = extract_email_from_detail(driver, entry["Source"])
        if email:
            entry["Email Address"] = email
            print(f" → {email}")
        else:
            print()


def _save_leads(all_data):
    """Number the rows, write them to OUTPUT_FILE and return the DataFrame."""
    for row_number, row in enumerate(all_data, start=1):
        row["#"] = row_number

    df = pd.DataFrame(all_data)
    df.to_excel(OUTPUT_FILE, index=False)
    add_checkboxes(OUTPUT_FILE)

    print(f"\n{'='*50}")
    print(f"Done! Saved {len(all_data)} leads to {OUTPUT_FILE}")
    print(f"{'='*50}")
    return df


def scrape_yellowpages():
    """Scrape Yellow Pages search results into an Excel workbook.

    Walks result pages for ``LOCATION``/``SEARCH_TERM`` starting at page 1,
    until ``MAX_PAGES`` pages have been read (when it is not ``None``) or a
    page yields no results. Each listing is parsed with ``parse_listing``;
    when ``FETCH_EMAILS`` is true, each listing's detail page is visited to
    look for an email. Collected rows are written to ``OUTPUT_FILE`` and the
    status columns get checkbox dropdowns.

    Returns:
        A ``pandas.DataFrame`` of the saved leads, or an empty DataFrame when
        nothing was collected (no file is written in that case).
    """
    base_url = f"https://www.yellowpages.com/{LOCATION}/{SEARCH_TERM}"
    # Only None means "no limit"; a limit of 0 must not be reported as "All".
    page_limit = "All" if MAX_PAGES is None else MAX_PAGES
    print(f"Starting scrape: {base_url}")
    print(f"Max pages: {page_limit} | Fetch emails: {FETCH_EMAILS}\n")

    driver = create_driver()
    all_data = []
    page = 1

    try:
        while MAX_PAGES is None or page <= MAX_PAGES:
            url = base_url if page == 1 else f"{base_url}?page={page}"
            print(f"[Page {page}] {url}")

            driver.get(url)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "result"))
                )
            except WebDriverException:
                # Normally a TimeoutException: no result appeared within 10 s.
                # Catching only Selenium errors lets Ctrl-C and programming
                # errors propagate instead of being mistaken for "no results".
                print("  No results found, stopping.")
                break

            listings = driver.find_elements(By.CLASS_NAME, "result")
            if not listings:
                print("  No listings, stopping.")
                break

            print(f"  Found {len(listings)} listings")

            # Snapshot each listing's HTML and parse it before navigating
            # away; parse_listing returns None for listings it skips.
            parsed_rows = (
                parse_listing(
                    BeautifulSoup(element.get_attribute("outerHTML"), "html.parser")
                )
                for element in listings
            )
            page_data = [row for row in parsed_rows if row]

            if FETCH_EMAILS:
                _fetch_emails(driver, page_data)

            all_data.extend(page_data)

            # No explicit "next" link check: just try the following page.
            page += 1

    finally:
        driver.quit()

    if not all_data:
        print("No data collected!")
        return pd.DataFrame()

    return _save_leads(all_data)


# Run the scrape only when this file is executed directly, not when imported.
if __name__ == "__main__":
    scrape_yellowpages()

Starting scrape: https://www.yellowpages.com/brooklyn-ny/general-contractors
Max pages: 5 | Fetch emails: True

[Page 1] https://www.yellowpages.com/brooklyn-ny/general-contractors
  Found 31 listings
  Skipping listing: 'NoneType' object has no attribute 'text'
  [1/30] SAS Roofing & Waterproofing
  [2/30] Total Renovation Contractors
  [3/30] City & County Paving Corp
  [4/30] Glenwood Construction
  [5/30] M. Bhuiyan Construction Company Inc.
  [6/30] Alam General Contracting Inc
  [7/30] M&K Construction Company
  [8/30] Yellow Construction Inc.
  [9/30] Rosul Contracting Corporation
  [10/30] Miller General Contracting, Ltd.
  [11/30] J.K. Construction N.Y. Inc.
  [12/30] Big Rose 1 Construction
  [13/30] Dynamax Construction Corp
  [14/30] Multi Construction Company
  [15/30] Green View Construction Inc.
  [16/30] N J General Contracting Inc
  [17/30] A Howard Construction Inc
  [18/30] Donofrio General Contractors Corp
  [19/30] Z H N Contracting Corp
  [20/30] Cheever Developme