In [7]:
import os
import random
import time
import pandas as pd
from datetime import datetime
from itertools import zip_longest
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv

# Read settings from the .env file via python-dotenv
load_dotenv()

# Apollo login and search settings; each is None when its variable is not set
EMAIL = os.environ.get('APOLLO_EMAIL')
PASSWORD = os.environ.get('APOLLO_PASSWORD')
URL = os.environ.get('APOLLO_URL')

class ApolloScraper:
    """Drive a Chrome session that logs in to Apollo and dumps page HTML.

    Credentials are read from the module-level ``EMAIL`` and ``PASSWORD``
    values; the page to open after login is the ``url`` given to the
    constructor (falling back to the module-level ``URL`` when it is empty).
    """

    def __init__(self, user_agents, url):
        '''Initialize the scraper with candidate user agents and the target URL.

        Args:
            user_agents: Non-empty sequence of user-agent strings; one is
                picked at random for the browser session.
            url: Page to open once login has completed.
        '''
        self.user_agents = user_agents
        self.url = url
        self.driver = self.setup_webdriver()

    def setup_webdriver(self):
        '''Set up the Chrome WebDriver with a randomly chosen user-agent header.

        Returns:
            The newly created driver, which is also stored on ``self.driver``.
        '''
        service = Service()
        options = Options()
        options.add_argument(f"user-agent={random.choice(self.user_agents)}")
        # options.add_argument("--headless")
        self.driver = webdriver.Chrome(service=service, options=options)
        self.driver.maximize_window()
        return self.driver

    def login(self):
        '''Log in to Apollo with email and password, then open the search page.

        Raises:
            Exception: Whatever the browser interaction raised. A screenshot
                is written to ``debug_login_error.png`` before re-raising.
        '''
        print("🔄 Navigating to Apollo login page...")

        # Go directly to login page first (like colleague's solution)
        self.driver.get("https://app.apollo.io/#/login")
        time.sleep(5)  # Wait for page to load

        print("📧 Entering credentials...")

        # Use colleague's simple approach - direct CSS selectors, no clear()
        try:
            # Enter email
            email_input = self.driver.find_element(By.CSS_SELECTOR, 'input[name="email"]')
            email_input.send_keys(EMAIL)
            print(f"✅ Email entered: {EMAIL}")

            # Enter password
            password_input = self.driver.find_element(By.CSS_SELECTOR, 'input[name="password"]')
            password_input.send_keys(PASSWORD)
            print("✅ Password entered")

            # Click login button
            login_button = self.driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
            login_button.click()
            print("✅ Login button clicked")

            print("⏳ Logging in, please wait for Apollo to load after login...")
            time.sleep(10)  # Wait for login to complete

            # Now navigate to the actual search page. Prefer the URL this
            # instance was constructed with; fall back to the .env value so
            # callers that passed an empty url keep working.
            print("🔄 Navigating to search results page...")
            self.driver.get(self.url or URL)
            time.sleep(7)  # Wait for results to load

            print("✅ Successfully logged in and navigated to search page!")

        except Exception as e:
            print(f"❌ Login failed: {e}")
            print("🔍 Taking screenshot for debugging...")
            self.driver.save_screenshot("debug_login_error.png")
            print("📸 Screenshot saved as 'debug_login_error.png'")
            raise

    def save_soup_content(self, soup, page_num, save_type="all"):
        """Save soup content to HTML files in the current working directory.

        Args:
            soup: Object to serialize. ``str(soup)`` is written for the
                "html" and "filtered" outputs, ``soup.prettify()`` for
                "pretty".
            page_num: Value embedded in each output file name.
            save_type: "html", "filtered", "pretty", or "all" (every output).
                Any other value writes nothing.
        """

        if save_type in ["all", "html"]:
            # Complete soup as HTML. The file name must use page_num so that
            # each page gets its own file and matches the message below.
            with open(f"soup_complete_page_{page_num}.html", "w", encoding="utf-8") as f:
                f.write(str(soup))
            print(f"💾 Saved complete soup: soup_complete_page_{page_num}.html")

        if save_type in ["all", "filtered"]:
            # Same serialization under the "filtered" name; the caller is
            # expected to pass an already-filtered soup for this output.
            with open(f"soup_filtered_page_{page_num}.html", "w", encoding="utf-8") as f:
                f.write(str(soup))
            print(f"💾 Saved filtered soup: soup_filtered_page_{page_num}.html")

        if save_type in ["all", "pretty"]:
            # Pretty-printed version for debugging
            with open(f"soup_pretty_page_{page_num}.html", "w", encoding="utf-8") as f:
                f.write(soup.prettify())
            print(f"💾 Saved pretty soup: soup_pretty_page_{page_num}.html")

    def get_page_soup(self):
        """Get BeautifulSoup object from current page source"""
        return BeautifulSoup(self.driver.page_source, 'html.parser')

    def quit(self):
        '''Close the WebDriver session.'''
        self.driver.quit()

if __name__ == "__main__":
    # Entry point: log in, dump the resulting page's HTML, then close the browser.
    user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"]

    scraper = ApolloScraper(user_agents, URL)

    try:
        scraper.login()

        # Get soup from current page and save it
        soup = scraper.get_page_soup()
        scraper.save_soup_content(soup, "1", "html")
    finally:
        # login() re-raises on failure; always shut the browser down so a
        # failed run does not leave Chrome/chromedriver processes behind.
        scraper.quit()


The chromedriver version (137.0.7151.122) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (138.0.7204.169); currently, chromedriver 138.0.7204.168 is recommended for chrome 138.*, so it is advised to delete the driver in PATH and retry


🔄 Navigating to Apollo login page...
📧 Entering credentials...
✅ Email entered: tom.weijers@xomnia.com
✅ Password entered
✅ Login button clicked
⏳ Logging in, please wait for Apollo to load after login...
🔄 Navigating to search results page...
✅ Successfully logged in and navigated to search page!
💾 Saved complete soup: soup_complete_page_1.html


In [None]:


def filter_soup_to_first_three_cells(self, soup):
    """Filter HTML soup to keep only cells 2, 3, 4 per person row (Name, Job Title, Company).

    Every ``div[role=row]`` that contains at least four ``div[role=cell]``
    descendants is rebuilt as a new ``div`` carrying the row's attributes
    and copies of the cells at index 1-3 (the cell at index 0 is skipped).
    Rows with fewer than four cells are dropped.

    Args:
        self: Unused; kept so the function can be attached to a scraper class.
        soup: Parsed page to read rows from. It is not modified.

    Returns:
        A new BeautifulSoup document whose single ``div.filtered-content``
        container holds the rebuilt rows in document order.
    """
    # Local import keeps this notebook cell self-contained.
    import copy

    print("🔧 Filtering HTML to keep only cells 2, 3, 4 per person (Name, Job Title, Company)...")

    # Find all person rows
    person_rows = soup.find_all('div', {'role': 'row'})
    print(f"🔍 Found {len(person_rows)} person rows")

    # Create new soup that will hold only the filtered rows
    filtered_soup = BeautifulSoup('<div class="filtered-content"></div>', 'html.parser')
    container = filtered_soup.find('div', class_='filtered-content')

    kept_rows = 0
    for row in person_rows:
        # Find all cells in this row
        cells = row.find_all('div', {'role': 'cell'})
        if len(cells) < 4:
            continue

        # Pass attributes through attrs= rather than **kwargs: an HTML
        # attribute called e.g. "name" or "string" would otherwise collide
        # with new_tag()'s own parameters. List values (such as class) are
        # copied so the new row does not share them with the source row.
        row_attrs = {
            key: list(value) if isinstance(value, list) else value
            for key, value in row.attrs.items()
        }
        new_row = filtered_soup.new_tag('div', attrs=row_attrs)

        # Keep cells 2, 3, 4 (index 1-3). Append copies: appending the
        # original tag would detach it from the caller's soup.
        for cell in cells[1:4]:
            new_row.append(copy.copy(cell))

        container.append(new_row)
        kept_rows += 1

    print(f"✅ Filtered to {kept_rows} rows with cells 2,3,4 each (Name, Job Title, Company)")
    return filtered_soup