In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation
import re

# ============== CONFIGURATION ==============
# LOCATION and SEARCH_TERM are interpolated into the search URL built in the
# __main__ block: https://www.yellowpages.com/{LOCATION}/{SEARCH_TERM}
SEARCH_TERM = "general-contractors"  # e.g., "general-contractors", "real-estate-agents", "plumbers"
LOCATION = "brooklyn-ny"              # e.g., "brooklyn-ny", "queens-ny", "manhattan-ny"
# Copied verbatim into the "Industry" field of every lead by parse_listing().
INDUSTRY_LABEL = "Construction"       # Label for the Industry column in output
# Path of the Excel workbook written (and then re-opened by add_checkboxes()).
OUTPUT_FILE = "yellow_pages_leads.xlsx"
# The page loop tests `MAX_PAGES and page > MAX_PAGES`, so any falsy value
# (None or 0) disables the cap.
MAX_PAGES = 5                         # Number of pages to scrape (set to None for all pages)
# ===========================================

# Loose e-mail matcher shared by every text-based lookup below.
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
# <dt> labels whose sibling <dd> may hold an address.
_EMAIL_LABEL_PATTERN = re.compile("Brands|Email", re.I)
# A regex hit ending in one of these is an asset reference (e.g. "logo@2x.png"),
# not an address.
_NON_EMAIL_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp")
_MAILTO_PREFIX = "mailto:"

def _first_email(text):
    """Return the first plausible e-mail address found in *text*, or ""."""
    for match in _EMAIL_PATTERN.finditer(text):
        candidate = match.group()
        if not candidate.lower().endswith(_NON_EMAIL_SUFFIXES):
            return candidate
    return ""

def extract_email_from_detail(driver, detail_url):
    """Visit a business detail page and return its e-mail address if one is found.

    Three strategies are tried in order, and the first non-empty result wins:

    1. the ``mailto:`` target of the ``a.email-business`` link,
    2. an address inside the ``<dd>`` following a ``<dt>`` labelled
       "Brands" or "Email",
    3. the first address-looking string anywhere in the page text.

    Args:
        driver: Selenium WebDriver used to load the page. It is navigated to
            *detail_url*, so the caller's current page is replaced.
        detail_url: Absolute URL of the detail page.

    Returns:
        The e-mail address as a string, or "" when none is found or the page
        could not be processed (the error is printed, not raised).
    """
    try:
        driver.get(detail_url)
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Strategy 1: dedicated "Email Business" link.
        email_link = soup.select_one("a.email-business")
        if email_link:
            href = email_link.get("href", "")
            # URL schemes are case-insensitive, so accept "MAILTO:" too.
            if href[:len(_MAILTO_PREFIX)].lower() == _MAILTO_PREFIX:
                address = href[len(_MAILTO_PREFIX):].split("?")[0].strip()
                # An empty target (e.g. "mailto:?subject=...") must not
                # short-circuit the fallbacks below.
                if address:
                    return address

        # Strategy 2: labelled definition-list entry.
        label = soup.find("dt", string=_EMAIL_LABEL_PATTERN)
        if label:
            value = label.find_next_sibling("dd")
            if value:
                found = _first_email(value.get_text())
                if found:
                    return found

        # Strategy 3: scan the whole page text.
        return _first_email(soup.get_text())

    except Exception as e:
        # Best-effort lookup: a failing detail page must not abort the scrape.
        print(f"Error extracting email from {detail_url}: {e}")
    return ""

def parse_listing(listing):
    """Convert one search-result card into a lead row.

    Args:
        listing: Parsed HTML of a single result; only its ``select_one``
            method is used here.

    Returns:
        A dict with one key per output column, or ``None`` when the card has
        no business name or detail link (a message is printed in that case).
        "#" is left as ``None`` and "Email Address" as "" for the caller to
        fill in.
    """
    from urllib.parse import urljoin

    def _text(selector):
        """Return the stripped text of the first match for *selector*, or ""."""
        node = listing.select_one(selector)
        return node.get_text().strip() if node else ""

    try:
        name_node = listing.select_one(".business-name span")
        name_link = listing.select_one(".business-name")
        detail_href = name_link.get("href") if name_link else None
        if name_node is None or not detail_href:
            # Some result cards lack the usual name markup; they cannot be
            # turned into a lead, so skip them explicitly.
            print("Skipping listing due to parse error: missing business name or detail link")
            return None

        # Street and locality are optional; join only the parts that exist.
        full_address = " ".join(filter(None, [_text(".street-address"), _text(".locality")]))

        # A website link without an href is treated the same as no link at all
        # instead of discarding the whole listing.
        site_link = listing.select_one(".track-visit-website")
        website = (site_link.get("href") if site_link else None) or "N/A"

        # "%-m/%-d/%y" is a glibc-only strftime extension (it fails on
        # Windows), so build the un-padded month/day portably.
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "#": None,
            "Company Name": name_node.get_text().strip(),
            "Industry": INDUSTRY_LABEL,
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": _text(".phones"),
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            # urljoin leaves an already-absolute href untouched.
            "Source": urljoin("https://www.yellowpages.com", detail_href),
            "Notes": "",
            "Called": "",
            "Followed Up": "",
            "Closed": ""
        }
    except Exception as e:
        # Best-effort parsing: one malformed card must not abort the page.
        print("Skipping listing due to parse error:", e)
        return None

def add_checkboxes(filepath):
    """Turn the tracking columns of the saved workbook into ☐/☑ drop-downs.

    Opens *filepath*, and for each of the "Called", "Followed Up" and "Closed"
    header columns found in row 1 of the active sheet, sets every data cell
    (row 2 onward) to "☐" and attaches a list validation restricting it to
    "☐" or "☑". The workbook is saved back to the same path.

    Args:
        filepath: Path of the .xlsx file to modify in place.
    """
    wb = load_workbook(filepath)
    ws = wb.active

    checkbox_validation = DataValidation(type="list", formula1='"☐,☑"', allow_blank=True)
    ws.add_data_validation(checkbox_validation)

    # Map header text -> header cell so the column index and letter are both at hand.
    headers = {cell.value: cell for cell in ws[1]}
    last_row = ws.max_row

    for col_name in ("Called", "Followed Up", "Closed"):
        header_cell = headers.get(col_name)
        # Nothing to do when the column is absent or the sheet has no data rows.
        if header_cell is None or last_row < 2:
            continue

        for row in range(2, last_row + 1):
            ws.cell(row=row, column=header_cell.column).value = "☐"

        # Register the column as a single range instead of cell by cell, which
        # keeps the validation's cell reference list compact.
        letter = header_cell.column_letter
        checkbox_validation.add(f"{letter}2:{letter}{last_row}")

    wb.save(filepath)

def scrape_yellowpages_rendered(base_url):
    """Scrape paginated search results into an Excel workbook of leads.

    Starting at *base_url*, each results page is loaded in headless Chrome,
    every result card is parsed with ``parse_listing``, and each lead's detail
    page is visited with ``extract_email_from_detail``. Pagination stops when
    ``MAX_PAGES`` is exceeded, when no results appear, or when the page has no
    ``a.next`` link. The rows are numbered, written to ``OUTPUT_FILE`` and
    post-processed by ``add_checkboxes``.

    Args:
        base_url: URL of the first results page; later pages are requested as
            ``{base_url}?page=N``.

    Returns:
        pandas.DataFrame holding all collected leads.
    """
    from selenium.common.exceptions import WebDriverException

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_data = []
    # try/finally guarantees the browser process is shut down even if a page
    # fails part-way through the scrape.
    try:
        page = 1
        while True:
            if MAX_PAGES and page > MAX_PAGES:
                break

            url = base_url if page == 1 else f"{base_url}?page={page}"
            print(f"Scraping page {page}: {url}")

            driver.get(url)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "result"))
                )
            except WebDriverException:
                # Covers the wait timing out (TimeoutException) as well as
                # other driver errors, without swallowing KeyboardInterrupt
                # the way a bare `except:` would.
                print(f"No more results found on page {page}")
                break

            listings = driver.find_elements(By.CLASS_NAME, "result")
            if not listings:
                print("No listings found, stopping pagination")
                break

            print(f"Found {len(listings)} listings on page {page}")

            # First pass: collect basic data and detail URLs while the results
            # page is still loaded (the elements go stale once we navigate).
            page_data = []
            for listing in listings:
                card = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
                parsed = parse_listing(card)
                if parsed:
                    page_data.append(parsed)

            # Look for the pagination link now, before leaving this page, so
            # the results page does not have to be loaded a second time.
            results_soup = BeautifulSoup(driver.page_source, "html.parser")
            has_next = results_soup.select_one("a.next") is not None

            # Second pass: visit detail pages for emails.
            for i, entry in enumerate(page_data):
                print(f"  [{i+1}/{len(page_data)}] Checking for email: {entry['Company Name']}")
                email = extract_email_from_detail(driver, entry["Source"])
                if email:
                    entry["Email Address"] = email
                    print(f"    Found email: {email}")

            all_data.extend(page_data)

            if not has_next:
                print("No next page link found, finished pagination")
                break

            page += 1
    finally:
        driver.quit()

    # Add row numbers
    for i, row in enumerate(all_data, start=1):
        row["#"] = i

    df = pd.DataFrame(all_data)
    df.to_excel(OUTPUT_FILE, index=False)

    # Add checkbox validation
    add_checkboxes(OUTPUT_FILE)

    print(f"\nSaved {len(all_data)} leads to {OUTPUT_FILE}")
    return df

if __name__ == "__main__":
    # Build the search-results URL from the configured LOCATION and SEARCH_TERM,
    # then run the scrape; scrape_yellowpages_rendered() also writes the
    # resulting leads to OUTPUT_FILE.
    url = f"https://www.yellowpages.com/{LOCATION}/{SEARCH_TERM}"
    print(f"Scraping: {url}")
    scrape_yellowpages_rendered(url)

Scraping: https://www.yellowpages.com/brooklyn-ny/general-contractors
Scraping page 1: https://www.yellowpages.com/brooklyn-ny/general-contractors
Found 31 listings on page 1
Skipping listing due to parse error: 'NoneType' object has no attribute 'text'
  [1/30] Checking for email: SAS Roofing & Waterproofing
  [2/30] Checking for email: Total Renovation Contractors
  [3/30] Checking for email: City & County Paving Corp
  [4/30] Checking for email: Glenwood Construction
  [5/30] Checking for email: M. Bhuiyan Construction Company Inc.
  [6/30] Checking for email: Alam General Contracting Inc
  [7/30] Checking for email: M&K Construction Company
  [8/30] Checking for email: Yellow Construction Inc.
  [9/30] Checking for email: Rosul Contracting Corporation
  [10/30] Checking for email: Miller General Contracting, Ltd.
  [11/30] Checking for email: J.K. Construction N.Y. Inc.
  [12/30] Checking for email: Big Rose 1 Construction
  [13/30] Checking for email: Dynamax Construction Corp
  [