In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation

def parse_listing(listing, source_url):
    """Convert one Yellow Pages result card into a lead-sheet row.

    Args:
        listing: Parsed HTML of a single result. Only ``select_one`` is
            called on it (the caller in this file passes a BeautifulSoup
            object).
        source_url: URL of the results page; stored in the "Source" column.

    Returns:
        A dict keyed by the spreadsheet column names, or ``None`` (after
        printing the reason) when a required element or attribute is
        missing, e.g. a card without a ``.business-name span``.
    """
    try:
        company = listing.select_one(".business-name span").text.strip()

        # Look each optional element up once instead of once for the test
        # and again for the value.
        phone_el = listing.select_one(".phones")
        phone = phone_el.text.strip() if phone_el else ""

        street_el = listing.select_one(".street-address")
        locality_el = listing.select_one(".locality")
        address_parts = [
            el.text.strip() for el in (street_el, locality_el) if el
        ]
        # Drop blank parts so a missing street does not leave a stray space.
        full_address = " ".join(part for part in address_parts if part)

        website_el = listing.select_one(".track-visit-website")
        website = website_el["href"] if website_el else "N/A"

        detail_link = (
            "https://www.yellowpages.com"
            + listing.select_one(".business-name")["href"]
        )

        # "%-m/%-d" is a glibc-only strftime extension and raises ValueError
        # on Windows, which previously caused every listing to be skipped.
        # Build the unpadded month/day from the datetime fields instead.
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "Company Name": company,
            "Industry": "Construction",
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": phone,
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            "Notes": detail_link,
            "Source": source_url,
            "Called": "",
            "Followed Up": "",
            "Closed": "",
        }
    except (AttributeError, KeyError, TypeError) as e:
        # AttributeError/TypeError: a required element was not found (None);
        # KeyError: the element exists but lacks its href attribute.
        print("Skipping listing due to parse error:", e)
        return None

def add_checkboxes(filepath):
    """Turn the Called / Followed Up / Closed columns into checkbox dropdowns.

    Opens the workbook at ``filepath``, and on its active sheet fills every
    row from 2 through ``max_row`` of those columns with an unchecked-box
    glyph (U+2610), attaches a list validation offering the unchecked and
    checked (U+2611) glyphs, then saves back to the same path.

    Args:
        filepath: Path of an existing .xlsx file whose first row holds the
            column headers. Columns whose header is absent are skipped.
    """
    # Written as escapes so the glyphs cannot be corrupted by a re-encoding
    # of this source file (they had previously turned into mojibake).
    unchecked, checked = "\u2610", "\u2611"

    wb = load_workbook(filepath)
    ws = wb.active

    checkbox_validation = DataValidation(
        type="list", formula1=f'"{unchecked},{checked}"', allow_blank=True
    )
    ws.add_data_validation(checkbox_validation)

    # Map header text -> 1-based column index using the first row.
    headers = {cell.value: cell.column for cell in ws[1]}

    for col_name in ("Called", "Followed Up", "Closed"):
        col_idx = headers.get(col_name)
        if col_idx is None:
            continue
        for row in range(2, ws.max_row + 1):
            cell = ws.cell(row=row, column=col_idx)
            cell.value = unchecked
            checkbox_validation.add(cell)

    wb.save(filepath)

def scrape_yellowpages_rendered(url):
    """Scrape one rendered Yellow Pages results page into an Excel lead sheet.

    Loads ``url`` in headless Chrome, waits up to 15 seconds for an element
    with class ``result``, parses each such element with ``parse_listing``
    and writes the rows to ``yellow_pages_general_contractors.xlsx``
    (relative path), then adds the checkbox dropdowns via ``add_checkboxes``.

    Args:
        url: Results-page URL to load; also recorded as each row's source.

    Returns:
        A DataFrame of the parsed rows. If the wait times out, the first
        1000 characters of the page source are printed, no file is written
        and an empty DataFrame is returned.
    """
    # Imported here because the file's import block does not provide it.
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser process is shut down even when
    # navigation or parsing raises, instead of leaking a Chrome instance.
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result"))
            )
        except TimeoutException:
            # Only a genuine wait timeout is handled here; other failures
            # (and Ctrl-C) propagate rather than being reported as a timeout.
            print("Timeout waiting for listings to load.")
            print(driver.page_source[:1000])
            return pd.DataFrame()

        listings = driver.find_elements(By.CLASS_NAME, "result")
        print(f"Found {len(listings)} listings.")

        data = []
        for listing in listings:
            soup = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
            parsed = parse_listing(soup, url)
            if parsed:
                data.append(parsed)
    finally:
        driver.quit()

    df = pd.DataFrame(data)
    output_file = "yellow_pages_general_contractors.xlsx"
    df.to_excel(output_file, index=False)

    # Add checkbox validation
    add_checkboxes(output_file)

    print(f"Saved to {output_file}")
    return df

# Entry point: runs the scraper against the Brooklyn general-contractors
# results URL when this cell/script is executed directly.
if __name__ == "__main__":
    url = "https://www.yellowpages.com/brooklyn-ny/general-contractors"
    scrape_yellowpages_rendered(url)

Found 31 listings.
Skipping listing due to parse error: 'NoneType' object has no attribute 'text'
Saved to yellow_pages_general_contractors.xlsx


In [None]:
# # ---------------- SETTINGS ----------------
# MAX_PAGES = 10
# BASE_URL = "https://www.yellowpages.com/brooklyn-ny/general-contractors"
# OUTPUT_FILE = "yellow_pages_general_contractors.xlsx"
# HEADLESS = True
# USER_AGENT = (
#     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
#     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# )

# # ---------------- SCRAPER ----------------
# import undetected_chromedriver as uc
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup
# import pandas as pd
# from datetime import datetime
# import time
# import random

# def parse_listing(listing, page_number=None):
#     try:
#         company = listing.select_one(".business-name span").text.strip()
#         phone = listing.select_one(".phones").text.strip() if listing.select_one(".phones") else ""
#         address_street = listing.select_one(".street-address")
#         address_locality = listing.select_one(".locality")
#         full_address = " ".join(filter(None, [
#             address_street.text.strip() if address_street else "",
#             address_locality.text.strip() if address_locality else ""
#         ]))
#         website = listing.select_one(".track-visit-website")["href"] if listing.select_one(".track-visit-website") else "N/A"
#         detail_link = "https://www.yellowpages.com" + listing.select_one(".business-name")["href"]

#         return {
#             "Company Name": company,
#             "Industry": "Construction",
#             "Contact Name": "",
#             "Email Address": "",
#             "Phone Number": phone,
#             "Website URL": website,
#             "Address": full_address,
#             "Date Added": datetime.now().strftime("%-m/%-d/%y"),
#             "Date Contacted": "",
#             "Contacted": "",
#             "Notes": detail_link,
#             "My Offering": "$1,500 setup $20 per month (blog, landing & contact form)",
#             "Page": page_number if page_number else 1
#         }
#     except Exception as e:
#         print("Skipping listing due to parse error:", e)
#         return None

# def scrape_yellowpages_rendered(base_url, max_pages):
#     options = uc.ChromeOptions()
#     if HEADLESS:
#         options.add_argument("--headless=new")
#     options.add_argument("--no-sandbox")
#     options.add_argument("--disable-dev-shm-usage")
#     options.add_argument(f"user-agent={USER_AGENT}")

#     driver = uc.Chrome(options=options)
#     all_data = []

#     for page in range(1, max_pages + 1):
#         page_url = base_url if page == 1 else f"{base_url}?page={page}"
#         print(f"\n➡️ Scraping Page {page}: {page_url}")
#         driver.get(page_url)

#         try:
#             WebDriverWait(driver, 15).until(
#                 EC.presence_of_element_located((By.CLASS_NAME, "result"))
#             )
#         except:
#             print(f"⚠️ Timeout on page {page} — printing debug HTML:")
#             print(driver.page_source[:1000])
#             continue

#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(1.5)

#         listings = driver.find_elements(By.CLASS_NAME, "result")
#         print(f"✔️ Found {len(listings)} listings on page {page}")

#         if not listings:
#             break

#         for listing in listings:
#             soup = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
#             parsed = parse_listing(soup, page)
#             if parsed:
#                 all_data.append(parsed)

#         # 👇 Randomized delay to reduce bot detection
#         delay = random.uniform(4, 8)
#         print(f"⏳ Sleeping {delay:.2f}s before next page...")
#         time.sleep(delay)

#     driver.quit()

#     df = pd.DataFrame(all_data)
#     if OUTPUT_FILE.endswith(".csv"):
#         df.to_csv(OUTPUT_FILE, index=False)
#     else:
#         df.to_excel(OUTPUT_FILE, index=False)

#     print(f"\n✅ Scraped {len(df)} total results. Saved to {OUTPUT_FILE}")
#     return df

# # Run
# if __name__ == "__main__":
#     scrape_yellowpages_rendered(BASE_URL, MAX_PAGES)



‚û°Ô∏è Scraping Page 1: https://www.yellowpages.com/brooklyn-ny/general-contractors
‚ö†Ô∏è Timeout on page 1 ‚Äî printing debug HTML:
<html class="no-js" lang="en-US"><!--<![endif]--><head>
<title>Attention Required! | Cloudflare</title>
<meta charset="UTF-8">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta name="robots" content="noindex, nofollow">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css">
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->
<style>body{margin:0;padding:0}</style>


<!--[if gte IE 10]><!-->
<script>
  if (!navigator.cookieEnabled) {
    window.addEventListener('DOMContentLoaded', function () {
      var cookieEl = document.getElementById('cookie-alert');
      cookieEl.style.display = 'block';
    })
  }
</script>
<!--<![

MaxRetryError: HTTPConnectionPool(host='localhost', port=52756): Max retries exceeded with url: /session/484f018fad2b817cd17b73761fae174d/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10fc26ad0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
# Manual Page number input

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Prompted once when this cell runs. The raw string is read as a global by
# scrape_yellowpages_rendered (output CSV file name) and by the entry point
# below (the "?page=" query parameter).
page_number = input("Enter page number: ")

def parse_listing(listing):
    """Convert one Yellow Pages result card into a lead-sheet row.

    Args:
        listing: Parsed HTML of a single result. Only ``select_one`` is
            called on it (the caller in this file passes a BeautifulSoup
            object).

    Returns:
        A dict keyed by the CSV column names, or ``None`` (after printing
        the reason) when a required element or attribute is missing.
    """
    try:
        company = listing.select_one(".business-name span").text.strip()

        # Each optional element is looked up once and reused.
        phone_el = listing.select_one(".phones")
        phone = phone_el.text.strip() if phone_el else ""

        found_address_els = (
            listing.select_one(".street-address"),
            listing.select_one(".locality"),
        )
        address_parts = [el.text.strip() for el in found_address_els if el]
        # Blank parts are dropped so the join never yields stray spaces.
        full_address = " ".join(part for part in address_parts if part)

        website_el = listing.select_one(".track-visit-website")
        website = website_el["href"] if website_el else "N/A"

        detail_link = (
            "https://www.yellowpages.com"
            + listing.select_one(".business-name")["href"]
        )

        # "%-m/%-d" only works with glibc's strftime; on Windows it raises
        # ValueError, which used to make every listing get skipped. Format
        # the unpadded month/day from the datetime fields instead.
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "Company Name": company,
            "Industry": "Construction",
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": phone,
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            "Contacted": "",
            "Notes": detail_link,
            "My Offering": "$1,500 setup $20 per month (blog, landing & contact form)",
        }
    except (AttributeError, KeyError, TypeError) as e:
        # AttributeError/TypeError: a required element was not found (None);
        # KeyError: the element exists but lacks its href attribute.
        print("Skipping listing due to parse error:", e)
        return None

def scrape_yellowpages_rendered(url):
    """Scrape one rendered Yellow Pages results page into a CSV file.

    Loads ``url`` in headless Chrome, waits up to 15 seconds for an element
    with class ``result``, parses each such element with ``parse_listing``
    and writes the rows to ``yellow_pages_general_contractors_<page>.csv``,
    where ``<page>`` is the module-level ``page_number``.

    Args:
        url: Results-page URL to load.

    Returns:
        A DataFrame of the parsed rows. If the wait times out, the first
        1000 characters of the page source are printed, no file is written
        and an empty DataFrame is returned.
    """
    # Local imports: this cell's import list provides neither name, so the
    # function previously only worked if another cell had imported ``time``.
    import time
    from selenium.common.exceptions import TimeoutException

    # Set up headless browser
    options = Options()
    options.add_argument("--headless")  # Use classic headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser process is shut down even when
    # navigation or parsing raises, instead of leaking a Chrome instance.
    try:
        driver.get(url)

        # Scroll to trigger full JS content load
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result"))
            )
        except TimeoutException:
            # Only a genuine wait timeout is handled here; other failures
            # (and Ctrl-C) propagate rather than being reported as a timeout.
            print("Timeout waiting for listings to load.")
            print(driver.page_source[:1000])  # Debug: see what's loaded
            return pd.DataFrame()

        listings = driver.find_elements(By.CLASS_NAME, "result")
        print(f"Found {len(listings)} listings.")

        data = []
        for listing in listings:
            soup = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
            parsed = parse_listing(soup)
            if parsed:
                data.append(parsed)
    finally:
        driver.quit()

    # Save to CSV
    df = pd.DataFrame(data)
    df.to_csv(f"yellow_pages_general_contractors_{page_number}.csv", index=False)
    print(f"Saved to yellow_pages_general_contractors_{page_number}.csv")

    return df

# 🔁 Run it
# Entry point: builds the results URL for the page entered at the prompt
# (module-level ``page_number``) and scrapes it.
if __name__ == "__main__":
    url = f"https://www.yellowpages.com/brooklyn-ny/general-contractors?page={page_number}"
    scrape_yellowpages_rendered(url)


Found 30 listings.
Saved to yellow_pages_general_contractors.csv


In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation

def parse_listing(listing, source_url):
    """Build a lead-sheet row from a single Yellow Pages result card.

    Args:
        listing: Parsed HTML of one result. Only ``select_one`` is called
            on it (the caller in this file passes a BeautifulSoup object).
        source_url: URL of the results page; stored in the "Source" column.

    Returns:
        A dict keyed by the spreadsheet column names, or ``None`` (after
        printing the reason) when a required element or attribute is
        missing, e.g. a card without a ``.business-name span``.
    """
    try:
        company = listing.select_one(".business-name span").text.strip()

        # Query each optional element once, then branch on the result.
        phone_el = listing.select_one(".phones")
        phone = phone_el.text.strip() if phone_el else ""

        street_el = listing.select_one(".street-address")
        locality_el = listing.select_one(".locality")
        address_parts = [
            el.text.strip() for el in (street_el, locality_el) if el
        ]
        # Skip blank parts so the joined address has no stray spaces.
        full_address = " ".join(part for part in address_parts if part)

        website_el = listing.select_one(".track-visit-website")
        website = website_el["href"] if website_el else "N/A"

        detail_link = (
            "https://www.yellowpages.com"
            + listing.select_one(".business-name")["href"]
        )

        # The "%-m/%-d" directives are a glibc extension; Windows' strftime
        # rejects them with ValueError, which used to drop every listing.
        # Formatting from the datetime fields is portable.
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "Company Name": company,
            "Industry": "Construction",
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": phone,
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            "Notes": detail_link,
            "Source": source_url,
            "Called": "",
            "Followed Up": "",
            "Closed": "",
        }
    except (AttributeError, KeyError, TypeError) as e:
        # AttributeError/TypeError: a required element was not found (None);
        # KeyError: the element exists but lacks its href attribute.
        print("Skipping listing due to parse error:", e)
        return None

def add_checkboxes(filepath):
    """Make the Called / Followed Up / Closed columns checkbox dropdowns.

    Loads the workbook at ``filepath`` and, on its active sheet, sets every
    row from 2 through ``max_row`` of those columns to an unchecked-box
    glyph (U+2610) and registers each cell with a list validation offering
    the unchecked and checked (U+2611) glyphs. The workbook is then saved
    back to the same path.

    Args:
        filepath: Path of an existing .xlsx file whose first row holds the
            column headers. Columns whose header is absent are skipped.
    """
    # Unicode escapes keep the glyphs intact regardless of how this source
    # file is encoded (the literal characters had turned into mojibake).
    unchecked, checked = "\u2610", "\u2611"

    wb = load_workbook(filepath)
    ws = wb.active

    checkbox_validation = DataValidation(
        type="list", formula1=f'"{unchecked},{checked}"', allow_blank=True
    )
    ws.add_data_validation(checkbox_validation)

    # Header text -> 1-based column index, taken from the first row.
    headers = {cell.value: cell.column for cell in ws[1]}

    for col_name in ("Called", "Followed Up", "Closed"):
        col_idx = headers.get(col_name)
        if col_idx is None:
            continue
        for row in range(2, ws.max_row + 1):
            cell = ws.cell(row=row, column=col_idx)
            cell.value = unchecked
            checkbox_validation.add(cell)

    wb.save(filepath)

def scrape_yellowpages_rendered(url):
    """Scrape one rendered Yellow Pages results page into an Excel lead sheet.

    Opens ``url`` in headless Chrome, waits up to 15 seconds for an element
    with class ``result``, runs ``parse_listing`` on each such element and
    writes the rows to ``yellow_pages_general_contractors.xlsx`` (relative
    path), after which ``add_checkboxes`` is applied to that file.

    Args:
        url: Results-page URL to load; also recorded as each row's source.

    Returns:
        A DataFrame of the parsed rows. If the wait times out, the first
        1000 characters of the page source are printed, no file is written
        and an empty DataFrame is returned.
    """
    # Imported here because the file's import block does not provide it.
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # The finally clause shuts the browser down on every exit path, so an
    # error during navigation or parsing no longer leaks a Chrome process.
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result"))
            )
        except TimeoutException:
            # Catch only the wait timeout; a bare except would also hide
            # unrelated errors and KeyboardInterrupt behind this message.
            print("Timeout waiting for listings to load.")
            print(driver.page_source[:1000])
            return pd.DataFrame()

        listings = driver.find_elements(By.CLASS_NAME, "result")
        print(f"Found {len(listings)} listings.")

        data = []
        for listing in listings:
            soup = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
            parsed = parse_listing(soup, url)
            if parsed:
                data.append(parsed)
    finally:
        driver.quit()

    df = pd.DataFrame(data)
    output_file = "yellow_pages_general_contractors.xlsx"
    df.to_excel(output_file, index=False)

    # Add checkbox validation
    add_checkboxes(output_file)

    print(f"Saved to {output_file}")
    return df

# Entry point: runs the scraper against the Brooklyn general-contractors
# results URL when this cell/script is executed directly.
if __name__ == "__main__":
    url = "https://www.yellowpages.com/brooklyn-ny/general-contractors"
    scrape_yellowpages_rendered(url)