In [None]:
import random
import re
import time
from datetime import datetime
from urllib.parse import unquote, urlsplit

import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ============== CONFIGURATION ==============
# Lead targets: delis, corner stores, sandwich shops and restaurants that
# could use digital menu boards.

# --- Search categories ---
# Each entry pairs the URL slug used in the search ("term") with the label
# recorded as the lead's industry ("label"). Run one category at a time, or
# several in parallel notebooks.
SEARCHES = [
    {"term": slug, "label": industry}
    for slug, industry in (
        ("delis", "Deli"),
        ("delicatessen", "Deli"),
        ("sandwich-shops", "Sandwich Shop"),
        ("bodegas", "Bodega"),
        ("grocery-stores", "Grocery/Corner Store"),
        ("convenience-stores", "Convenience Store"),
        ("pizza", "Pizza Shop"),
        ("fast-food-restaurants", "Fast Food"),
        ("take-out-restaurants", "Takeout"),
        ("cafes", "Cafe"),
        ("bakeries", "Bakery"),
        ("bagels", "Bagel Shop"),
        ("coffee-shops", "Coffee Shop"),
        ("juice-bars", "Juice Bar"),
        ("ice-cream-parlors", "Ice Cream"),
        ("chinese-restaurants", "Chinese Restaurant"),
        ("mexican-restaurants", "Mexican Restaurant"),
        ("caribbean-restaurants", "Caribbean Restaurant"),
    )
]

# --- Active category: change the index to pick another SEARCHES entry ---
CURRENT_SEARCH_INDEX = 0
SEARCH_TERM, INDUSTRY_LABEL = (
    SEARCHES[CURRENT_SEARCH_INDEX][key] for key in ("term", "label")
)

# --- NYC locations (URL slugs) ---
LOCATIONS = [
    # Borough-wide searches
    *(
        "queens-ny",
        "brooklyn-ny",
        "bronx-ny",
        "manhattan-ny",
        "staten-island-ny",
    ),
    # Specific high-density areas
    *(
        "astoria-ny",
        "flushing-ny",
        "jackson-heights-ny",
        "jamaica-ny",
        "williamsburg-brooklyn-ny",
        "bushwick-brooklyn-ny",
        "bed-stuy-brooklyn-ny",
        "crown-heights-brooklyn-ny",
        "flatbush-brooklyn-ny",
        "sunset-park-brooklyn-ny",
        "bay-ridge-brooklyn-ny",
        "fordham-bronx-ny",
        "hunts-point-bronx-ny",
        "washington-heights-ny",
        "harlem-ny",
        "east-harlem-ny",
        "chinatown-ny",
    ),
]

# --- Active location: pick a different LOCATIONS entry for another area ---
LOCATION = LOCATIONS[0]

# --- Result pages to walk (inclusive range) ---
START_PAGE, END_PAGE = 1, 3

# Workbook name encodes location, category and page range.
OUTPUT_FILE = f"yp_menus_{LOCATION}_{SEARCH_TERM}_p{START_PAGE}-{END_PAGE}.xlsx"

# --- Behaviour switches ---
FETCH_EMAILS = True              # visit each detail page to look for an email
DEBUG = False                    # dump the first detail page's HTML to disk
HEADLESS = False                 # run Chrome without a visible window
MIN_DELAY, MAX_DELAY = 5, 10     # seconds; random wait before each detail page
PAGE_DELAY = 15                  # seconds; minimum wait between result pages
RESTART_DRIVER_EACH_PAGE = True  # start a fresh browser for every result page
# ===========================================


def create_driver():
    """Start a Chrome WebDriver session configured for this scraper.

    The browser runs headless only when ``HEADLESS`` is true. The driver
    binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        The new ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    chrome_flags = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]
    if HEADLESS:
        # Same position as a dedicated first add_argument call.
        chrome_flags.insert(0, "--headless=new")

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    experimental_settings = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
    )
    for setting_name, setting_value in experimental_settings:
        chrome_options.add_experimental_option(setting_name, setting_value)

    chromedriver_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

    # Redefine navigator.webdriver on the document that is currently loaded.
    browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return browser


# Substrings that mark a regex hit as a placeholder, a tracking/vendor
# address or an asset file name (e.g. "logo@2x.png") rather than a contact.
_WEBSITE_EMAIL_NOISE = (
    'example.com', 'domain.com', 'email.com', 'yoursite',
    'sentry.io', 'schema.org', 'json', 'wixpress',
    'googleapis', 'facebook', 'twitter',
    '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
)
_WEBSITE_MAILTO_RE = re.compile(r'href=["\']mailto:([^"\'<>?\s]+)', re.IGNORECASE)
_WEBSITE_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
_WEBSITE_CONTACT_PATHS = ('/contact', '/contact-us', '/about', '/about-us')


def _first_email_in_html(html):
    """Return the first plausible email address found in *html*, else ""."""
    mailto_match = _WEBSITE_MAILTO_RE.search(html)
    if mailto_match:
        # mailto targets are sometimes percent-encoded ("info%40shop.com").
        email = unquote(mailto_match.group(1)).strip()
        if '@' in email:
            return email

    for email in _WEBSITE_EMAIL_RE.findall(html):
        lowered = email.lower()
        if not any(noise in lowered for noise in _WEBSITE_EMAIL_NOISE):
            return email
    return ""


def extract_email_from_website(driver, website_url):
    """Look for a contact email on a business's own website.

    The landing page is scanned first: a ``mailto:`` link wins, otherwise
    the first address-shaped string that is not on the noise list. If the
    landing page yields nothing, a few conventional contact/about paths
    are tried on the site root.

    Args:
        driver: Selenium WebDriver used to load the pages.
        website_url: Site URL; "https://" is prepended when it does not
            start with "http". Empty values and "N/A" are skipped.

    Returns:
        The email address found, or "" when none is found, the landing
        page title looks like a 404, or loading fails.
    """
    if not website_url or website_url == "N/A":
        return ""

    try:
        if not website_url.startswith("http"):
            website_url = "https://" + website_url

        time.sleep(random.uniform(1, 2))
        driver.get(website_url)
        time.sleep(2)

        page_source = driver.page_source
        title = driver.title or ""
        if "404" in title or "not found" in title.lower():
            return ""

        email = _first_email_in_html(page_source)
        if email:
            return email

        # Build contact URLs from scheme + host only. Appending to the full
        # landing URL breaks when it carries a query string, fragment or
        # file path (e.g. ".../?utm_source=yp/contact").
        parts = urlsplit(website_url)
        site_root = f"{parts.scheme}://{parts.netloc}"

        for contact_path in _WEBSITE_CONTACT_PATHS:
            try:
                driver.get(site_root + contact_path)
                time.sleep(1.5)
                email = _first_email_in_html(driver.page_source)
            except Exception:
                # One page failing to load must not stop the remaining
                # candidates; Ctrl+C still propagates.
                continue
            if email:
                return email

    except Exception as exc:
        # Best effort: third-party sites fail in many ways. Report briefly
        # on the caller's progress line and fall through to "".
        print(f" [website err: {type(exc).__name__}]", end="")

    return ""


# Substrings that disqualify an address-shaped string found on a Yellow
# Pages detail page.
_YP_EMAIL_NOISE = ('example.com', 'domain.com', 'yellowpages', 'schema.org', 'json')


def _usable_yp_email(candidate):
    """Return the address in a mailto target if it is usable, else "".

    Strips a leading ``mailto:`` and any ``?query``. Rejects values
    without "@" and addresses containing "yellowpages".
    """
    email = candidate.replace("mailto:", "").split("?")[0].strip()
    if '@' in email and 'yellowpages' not in email.lower():
        return email
    return ""


def extract_email_from_detail(driver, detail_url, website_url="", debug_save=False):
    """Find a business email: Yellow Pages detail page first, then its website.

    Every mailto-based strategy applies the same acceptance rule (must
    contain "@", must not be a yellowpages address), and every candidate
    is considered rather than only the first.

    Args:
        driver: Selenium WebDriver used to load pages.
        detail_url: URL of the listing's detail page.
        website_url: The business's own site, used as a fallback when given.
        debug_save: When true, write the loaded page source to
            ``debug_page_source.html``.

    Returns:
        The email address found; ``"__BLOCKED__"`` when the detail page is
        a block page and the website fallback found nothing; "" otherwise,
        including when an error occurs (it is printed, not raised).
    """
    try:
        time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
        driver.get(detail_url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".business-info, .sales-info, #main-content, #cf-wrapper"))
            )
        except Exception:
            # Wait failed (e.g. timed out): inspect whatever did load.
            pass

        time.sleep(2)
        page_source = driver.page_source

        lowered_source = page_source.lower()
        is_blocked = "you have been blocked" in lowered_source or (
            "cloudflare" in lowered_source and "ray id" in lowered_source
        )

        if is_blocked:
            if website_url:
                print(" [YP blocked, trying website]", end="")
                email = extract_email_from_website(driver, website_url)
                if email:
                    return email
            return "__BLOCKED__"

        if debug_save:
            with open("debug_page_source.html", "w", encoding="utf-8") as f:
                f.write(page_source)
            print(" [DEBUG: saved]", end="")

        driver.execute_script("window.scrollTo(0, 800);")
        time.sleep(1)
        page_source = driver.page_source

        # Strategy 1: mailto hrefs in the raw HTML.
        for match in re.finditer(r'href=["\']mailto:([^"\'<>?\s]+)', page_source, re.IGNORECASE):
            email = _usable_yp_email(match.group(1))
            if email:
                return email

        # Strategy 2: anchors in the live DOM, email-styled links first and
        # then any mailto link.
        for selector in ("a.email-business, a[class*='email']", "a[href*='mailto:']"):
            try:
                for element in driver.find_elements(By.CSS_SELECTOR, selector):
                    href = element.get_attribute("href") or ""
                    if "mailto:" in href:
                        email = _usable_yp_email(href)
                        if email:
                            return email
            except Exception:
                # A failed lookup only skips this selector.
                continue

        # Strategy 3: anchors as parsed by BeautifulSoup.
        soup = BeautifulSoup(page_source, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "mailto:" in href:
                email = _usable_yp_email(href)
                if email:
                    return email

        # Strategy 4: any address-shaped text that is not known noise.
        for match in re.finditer(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', page_source):
            email = match.group()
            if not any(noise in email.lower() for noise in _YP_EMAIL_NOISE):
                return email

        if website_url:
            print(" [trying website]", end="")
            email = extract_email_from_website(driver, website_url)
            if email:
                return email

    except Exception as e:
        print(f" [err: {e}]", end="")
    return ""


def parse_listing(listing):
    """Convert one parsed search-result element into a lead row.

    Args:
        listing: BeautifulSoup node for a single result. It must contain a
            ``.business-name`` link with a nested ``span``; phone, address
            and website are optional.

    Returns:
        A dict with one key per spreadsheet column ("#" is left as None for
        the caller to fill), or None when the listing cannot be parsed. The
        reason is printed rather than raised.
    """
    try:
        company = listing.select_one(".business-name span").text.strip()

        phone_el = listing.select_one(".phones")
        phone = phone_el.text.strip() if phone_el is not None else ""

        street_el = listing.select_one(".street-address")
        locality_el = listing.select_one(".locality")
        street = street_el.text.strip() if street_el is not None else ""
        locality = locality_el.text.strip() if locality_el is not None else ""
        full_address = " ".join(filter(None, [street, locality]))

        # The text before the first comma of the locality is used as the
        # neighborhood; without a comma it stays blank.
        neighborhood = ""
        locality_parts = locality.split(",")
        if len(locality_parts) >= 2:
            neighborhood = locality_parts[0].strip()

        website_el = listing.select_one(".track-visit-website")
        website = website_el["href"] if website_el is not None else ""

        # Prefix the site root only when the href is not already absolute.
        detail_href = listing.select_one(".business-name")["href"]
        if detail_href.startswith("http"):
            detail_link = detail_href
        else:
            detail_link = "https://www.yellowpages.com" + detail_href

        # Build M/D/YY by hand: the "%-m"/"%-d" strftime flags are
        # platform-specific and raise ValueError on Windows, which made
        # every listing get skipped there.
        today = datetime.now()
        date_added = f"{today.month}/{today.day}/{today:%y}"

        return {
            "#": None,
            "Company Name": company,
            "Industry": INDUSTRY_LABEL,
            "Neighborhood": neighborhood,
            "Contact Name": "",
            "Email Address": "",
            "Phone Number": phone,
            "Website URL": website,
            "Address": full_address,
            "Date Added": date_added,
            "Date Contacted": "",
            "Source": detail_link,
            "Notes": "",
            "Has Menu Board": "",  # For qualifying during outreach
            "Interested": "",
            "Called": "",
            "Followed Up": "",
            "Closed": ""
        }
    except Exception as e:
        # A malformed result must not abort the whole page.
        print(f"  Skipping listing: {e}")
        return None


def add_checkboxes(filepath):
    """Turn the status columns of the saved workbook into checkbox dropdowns.

    In the active sheet, every data row (row 2 onward) of the columns
    "Has Menu Board", "Interested", "Called", "Followed Up" and "Closed"
    is set to the unchecked glyph and given a list validation offering
    unchecked/checked. Columns missing from the header row are skipped.
    The workbook is saved back to the same path.

    Args:
        filepath: Path of the .xlsx file to modify in place.
    """
    # Written as escapes so the glyphs survive a re-encoding of this file:
    # U+2610 BALLOT BOX and U+2611 BALLOT BOX WITH CHECK.
    unchecked = "\u2610"
    checked = "\u2611"

    wb = load_workbook(filepath)
    ws = wb.active

    checkbox_validation = DataValidation(
        type="list", formula1=f'"{unchecked},{checked}"', allow_blank=True
    )
    ws.add_data_validation(checkbox_validation)

    # Header text -> column index, read from the first row.
    headers = {cell.value: cell.column for cell in ws[1]}

    for col_name in ["Has Menu Board", "Interested", "Called", "Followed Up", "Closed"]:
        col_idx = headers.get(col_name)
        if col_idx is None:
            continue
        for row in range(2, ws.max_row + 1):
            cell = ws.cell(row=row, column=col_idx)
            cell.value = unchecked
            checkbox_validation.add(cell)

    wb.save(filepath)


def get_listings_from_page(driver):
    """Extract lead rows from the search-results page currently loaded.

    Scrolls to the bottom of the page, waits up to 10 seconds for a
    ``.result`` element, then parses each result with ``parse_listing``.

    Args:
        driver: Selenium WebDriver already navigated to a results page.

    Returns:
        A list of lead dicts; empty when no result element appears in time.
        Results that cannot be read or parsed are skipped.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".result"))
        )
    except Exception:
        # Not a bare except: Ctrl+C during the wait must interrupt the run
        # instead of being reported as an empty page.
        return []

    page_data = []
    for listing in driver.find_elements(By.CSS_SELECTOR, ".result"):
        try:
            soup = BeautifulSoup(listing.get_attribute("outerHTML"), "html.parser")
            parsed = parse_listing(soup)
            if parsed:
                page_data.append(parsed)
        except Exception:
            # Skip a result that cannot be read; keep the rest.
            continue

    return page_data


def _quit_driver_quietly(driver):
    """Close the browser, ignoring errors from an already-dead session."""
    try:
        driver.quit()
    except Exception:
        pass


def _number_rows(rows):
    """Fill each row's "#" column with its 1-based position in *rows*."""
    for position, row in enumerate(rows, start=1):
        row["#"] = position


def scrape_yellowpages():
    """Scrape the configured category/location and write the leads workbook.

    Walks result pages ``START_PAGE``..``END_PAGE`` for the current
    ``LOCATION``/``SEARCH_TERM``, optionally looks up an email per listing
    (``FETCH_EMAILS``), and saves progress to ``OUTPUT_FILE`` after every
    page. Pagination stops at the first page without listings. A failing
    page is reported and skipped with a fresh browser. After the last page
    the workbook is rewritten and checkbox columns are added.

    Returns:
        A DataFrame of all collected leads, or an empty DataFrame when
        nothing was collected.
    """
    base_url = f"https://www.yellowpages.com/{LOCATION}/{SEARCH_TERM}"
    banner = "=" * 60
    print(banner)
    print("🍕 Digital Menu Board Lead Scraper")
    print(banner)
    print(f"Category: {INDUSTRY_LABEL} ({SEARCH_TERM})")
    print(f"Location: {LOCATION}")
    print(f"Pages: {START_PAGE} to {END_PAGE}")
    print(f"Fetch emails: {FETCH_EMAILS}")
    print(f"Output: {OUTPUT_FILE}")
    print(banner + "\n")

    driver = create_driver()
    all_data = []

    try:
        for page in range(START_PAGE, END_PAGE + 1):
            try:
                url = base_url if page == 1 else f"{base_url}?page={page}"

                print(f"[Page {page}] Loading {url}")
                driver.get(url)
                time.sleep(2)

                page_data = get_listings_from_page(driver)

                if not page_data:
                    print("  No listings found - stopping pagination")
                    break

                print(f"  Found {len(page_data)} listings")

                if FETCH_EMAILS:
                    emails_found = 0
                    for i, entry in enumerate(page_data):
                        try:
                            company_short = entry['Company Name'][:35].ljust(35)
                            print(f"  [{i+1:2}/{len(page_data)}] {company_short}", end="", flush=True)

                            # Only the very first listing of the run is dumped.
                            debug_save = DEBUG and (page == START_PAGE and i == 0)
                            email = extract_email_from_detail(
                                driver,
                                entry["Source"],
                                website_url=entry.get("Website URL", ""),
                                debug_save=debug_save
                            )

                            if email == "__BLOCKED__":
                                print(" → (blocked, no website email found)")
                            elif email:
                                entry["Email Address"] = email
                                emails_found += 1
                                print(f" → {email}")
                            else:
                                print(" → (no email)")
                        except Exception as e:
                            print(f" → [error: {e}]")
                            continue

                    print(f"  Page {page} complete: {emails_found}/{len(page_data)} emails found\n")

                all_data.extend(page_data)

                # Number the rows before building the frame: DataFrame copies
                # the values, so numbering afterwards left "#" empty in the
                # progress file for the newest page.
                _number_rows(all_data)
                pd.DataFrame(all_data).to_excel(OUTPUT_FILE, index=False)
                print(f"  💾 Progress saved to {OUTPUT_FILE} ({len(all_data)} leads)\n")

                if page < END_PAGE:
                    if RESTART_DRIVER_EACH_PAGE:
                        print("  🔄 Restarting browser for fresh session...")
                        _quit_driver_quietly(driver)
                        time.sleep(3)
                        driver = create_driver()

                    delay = random.uniform(PAGE_DELAY, PAGE_DELAY + 5)
                    print(f"  ⏳ Waiting {delay:.1f}s before next page...\n")
                    time.sleep(delay)

            except Exception as page_error:
                # Leads from earlier pages are already on disk; replace the
                # browser and carry on with the next page.
                print(f"\n  ⚠️ Error on page {page}: {page_error}")
                print("  Saving progress and continuing...\n")
                _quit_driver_quietly(driver)
                driver = create_driver()
                continue

    except Exception as e:
        print(f"\n❌ Fatal error: {e}")
    finally:
        _quit_driver_quietly(driver)

    if not all_data:
        print("No data collected!")
        return pd.DataFrame()

    _number_rows(all_data)

    df = pd.DataFrame(all_data)
    df.to_excel(OUTPUT_FILE, index=False)
    add_checkboxes(OUTPUT_FILE)

    emails_count = sum(1 for row in all_data if row["Email Address"])
    print(banner)
    print("✅ COMPLETE!")
    print(banner)
    print(f"Total leads: {len(all_data)}")
    print(f"With emails: {emails_count}")
    print(f"Saved to: {OUTPUT_FILE}")
    print(banner)

    return df


# === BATCH SCRAPER - Run multiple categories/locations ===
def batch_scrape(search_indices=None, location_indices=None):
    """Run the scraper for every combination of the given categories and locations.

    For each pair, the module-level ``SEARCH_TERM``, ``INDUSTRY_LABEL``,
    ``LOCATION`` and ``OUTPUT_FILE`` are reassigned and
    ``scrape_yellowpages()`` is called. The globals keep the values of the
    last combination afterwards.

    Example:
        batch_scrape(search_indices=[0,1,2], location_indices=[0,1])
        # Scrapes: delis, delicatessen, sandwich-shops in queens-ny and brooklyn-ny

    Args:
        search_indices: Indexes into ``SEARCHES``; defaults to ``[0]``.
        location_indices: Indexes into ``LOCATIONS``; defaults to ``[0]``.

    Returns:
        Output file names of the runs that collected at least one lead, in
        run order. Runs that collected nothing write no file and are
        omitted.
    """
    global SEARCH_TERM, INDUSTRY_LABEL, LOCATION, OUTPUT_FILE

    search_indices = search_indices or [0]
    location_indices = location_indices or [0]

    jobs = [(s_idx, l_idx) for s_idx in search_indices for l_idx in location_indices]
    all_files = []

    for job_number, (s_idx, l_idx) in enumerate(jobs, start=1):
        SEARCH_TERM = SEARCHES[s_idx]["term"]
        INDUSTRY_LABEL = SEARCHES[s_idx]["label"]
        LOCATION = LOCATIONS[l_idx]
        OUTPUT_FILE = f"yp_menus_{LOCATION}_{SEARCH_TERM}_p{START_PAGE}-{END_PAGE}.xlsx"

        print(f"\n{'='*60}")
        print(f"🔄 BATCH: {INDUSTRY_LABEL} in {LOCATION}")
        print(f"{'='*60}\n")

        leads = scrape_yellowpages()
        # An empty result means nothing was written for this combination.
        if not leads.empty:
            all_files.append(OUTPUT_FILE)

        # Longer delay between different searches; pointless after the last.
        if job_number < len(jobs):
            time.sleep(random.uniform(30, 60))

    return all_files


# === RUN SINGLE SCRAPE ===
# Entry point: runs one scrape with the module-level configuration
# (SEARCH_TERM, LOCATION, START_PAGE..END_PAGE) when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    # The resulting DataFrame stays available as the module-level name `df`.
    df = scrape_yellowpages()


# === MERGE ALL FILES (run separately after all scrapes complete) ===
"""
import pandas as pd
import glob

files = glob.glob("yp_menus_*.xlsx")
dfs = []
for f in files:
    try:
        df = pd.read_excel(f)
        dfs.append(df)
    except:
        continue

if dfs:
    combined = pd.concat(dfs, ignore_index=True)
    combined["#"] = range(1, len(combined) + 1)
    # Remove duplicates: rows that share both company name and phone number
    combined.drop_duplicates(subset=["Company Name", "Phone Number"], inplace=True)
    combined["#"] = range(1, len(combined) + 1)
    combined.to_excel("yp_menus_all_leads.xlsx", index=False)
    print(f"Merged {len(files)} files → {len(combined)} unique leads")
"""

Merged 1 files → 89 unique leads
