In [5]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

In [105]:
import time, random
import pandas as pd
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.remote.webelement import WebElement

# CSS selectors tried, in order, to locate the "next page" control.
_NEXT_PAGE_SELECTORS = (
    "button[aria-label='Next Page']",
    "button[aria-label='next']",
    "a[aria-label='Next Page']",
    "[data-test='pagination-next']",
)

# Columns (and their order) of the DataFrame returned by get_jobs.
_JOB_COLUMNS = ["title", "company", "location", "salary", "rating", "Posted", "Description"]


def _find_next_button(driver):
    """Return the first enabled element matching a next-page selector, or None."""
    # Imported locally so this helper is self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    for selector in _NEXT_PAGE_SELECTORS:
        try:
            candidate = driver.find_element(By.CSS_SELECTOR, selector)
            if candidate.is_enabled():
                return candidate
        except WebDriverException:
            # Selector absent (or element went stale): try the next one.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the scrape can be interrupted.
            continue
    return None


def get_jobs(keyword: str, num_jobs: int = 40, verbose: bool = False) -> pd.DataFrame:
    """Scrape job listings from Glassdoor by attaching to an already open Chrome session.

    Chrome must already be running with ``--remote-debugging-port=9222``; the
    driver attaches to it at ``127.0.0.1:9222`` and is not closed afterwards.

    Args:
        keyword: Search phrase; it is URL-encoded before being put in the query.
        num_jobs: Stop once this many jobs have been collected.
        verbose: When true, print per-page and per-job progress plus the
            reason pagination stopped.

    Returns:
        A DataFrame with the columns title, company, location, salary, rating,
        Posted and Description (in that order). Columns that the scraped
        records do not contain are filled with empty strings. Fewer than
        ``num_jobs`` rows are returned if loading or pagination stops early.
    """
    # Attach to an already running Chrome session (start Chrome with --remote-debugging-port=9222)
    options = Options()
    options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # Open Glassdoor job search page
    encoded_keyword = urllib.parse.quote_plus(keyword)
    driver.get(f"https://www.glassdoor.com/Job/jobs.htm?sc.keyword={encoded_keyword}")
    time.sleep(5)

    jobs = []
    page_num = 1

    while len(jobs) < num_jobs:
        print("Attempting to fetch new joblisting")
        try:
            # Wait for job cards to load
            job_cards = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.jobCard"))
            )
        except TimeoutException:
            print(" Timeout: No jobs loaded or page blocked.")
            break
        except Exception as e:
            # Any other failure ends the scrape but still returns what was
            # collected so far.
            print(f" Error loading jobs: {e}")
            break

        if verbose:
            print(f" Found {len(job_cards)} job cards on page {page_num}.")

        for i, card in enumerate(job_cards):
            if len(jobs) >= num_jobs:
                break

            try:
                job = extract_job_data(card)
            except Exception as e:
                # One bad card must not abort the whole page.
                if verbose:
                    print(f" Error scraping card {i + 1}: {e}")
                continue

            # Only keep the job if we got at least the title or company
            if job["title"] or job["company"]:
                jobs.append(job)
                if verbose:
                    print(f" Scraped {len(jobs)}/{num_jobs}: {job['title']} @ {job['company']}")

        if len(jobs) >= num_jobs:
            break

        # Go to next page
        try:
            next_btn = _find_next_button(driver)
            if next_btn is None:
                if verbose:
                    print(" No more pages available.")
                break
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", next_btn)
        except Exception as e:
            if verbose:
                print(f" No more pages or error navigating: {e}")
            break

        page_num += 1
        # Randomised pause before reading the next page.
        time.sleep(random.uniform(3, 5))

    # Fix the column set and order; columns missing from the scraped records
    # are created and filled with "".
    df = pd.DataFrame(jobs).reindex(columns=_JOB_COLUMNS, fill_value="")

    print(f"\n Total jobs scraped: {len(df)}")
    return df

def extract_job_data(job_card_element: WebElement) -> dict:
    """Pull title, company, location, salary and rating out of one job card.

    Every field is looked up with its own CSS selector relative to
    ``job_card_element``. When an element is missing:

    * title, company and location stay ``None`` and a message is printed;
    * salary becomes ``"N/A"``;
    * rating becomes ``"0"``.

    Returns:
        A dict with the keys ``title``, ``company``, ``location``, ``salary``
        and ``rating``, holding the matched element's text or the fallback
        described above.
    """
    # (result key, CSS selector, message printed when absent, value stored when absent)
    field_specs = (
        ("title", 'a[data-test="job-title"]', "Job title not found.", None),
        ("company", 'span[class^="EmployerProfile_compactEmployerName__"]', "Company not found.", None),
        ("location", '*[data-test="emp-location"]', "Location not found.", None),
        ("salary", '*[data-test="detailSalary"]', None, "N/A"),
        ("rating", 'span[class^="rating-single-star_RatingText__"]', None, "0"),
    )

    job_data = {}
    for key, selector, missing_message, fallback in field_specs:
        try:
            job_data[key] = job_card_element.find_element(By.CSS_SELECTOR, selector).text
        except NoSuchElementException:
            job_data[key] = fallback
            # Salary and rating are optional, so their absence is not reported.
            if missing_message is not None:
                print(missing_message)

    return job_data


# Example usage: scrape 20 "software engineer" listings and print them.
if __name__ == "__main__":
    # Prerequisite: Chrome must already be running with remote debugging on,
    # e.g. chrome.exe --remote-debugging-port=9222
    print("Starting job scrapper")

    df = get_jobs(keyword="software engineer", num_jobs=20, verbose=True)

    # One call emits the heading line followed by the DataFrame.
    print("\nFirst few jobs:", df, sep="\n")

    # To persist the results, save to CSV:
    # df.to_csv("glassdoor_jobs.csv", index=False)

Starting job scrapper
Attempting to fetch new joblisting
✅ Found 30 job cards on page 1.
✅ Scraped 1/20: Front End Software Engineer @ Sparksuite
✅ Scraped 2/20: Senior Backend Software Engineer @ CompScience
✅ Scraped 3/20: Software Engineer (Junior) @ Terran Robotics Inc.
✅ Scraped 4/20: Associate Software Engineer @ Protolabs
✅ Scraped 5/20: Software Engineer II @ Payscale
✅ Scraped 6/20: Full Stack Engineer @ Campfire
✅ Scraped 7/20: Software Engineer, Frontend - SevenRooms @ DoorDash USA
✅ Scraped 8/20: Assoc Engineer, Software @ T-Mobile USA, Inc.
✅ Scraped 9/20: Junior Full Stack Developer @ Vivacity Tech PBC
✅ Scraped 10/20: PHP Software Engineer @ INSHUR
✅ Scraped 11/20: Software Engineer @ Barco
✅ Scraped 12/20: Software Developer (Full Stack) @ Intelerad
✅ Scraped 13/20: Software Engineer - Backend @ Navan
✅ Scraped 14/20: Software Engineer @ FTS Inc
✅ Scraped 15/20: Full Stack Developer @ Minerva Coach
✅ Scraped 16/20: Software Engineer II @ Synapse Health
✅ Scraped 17/20: 