In [5]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

In [123]:
import time, random
import pandas as pd
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.remote.webelement import WebElement

def get_jobs(keyword, num_jobs=40, verbose=False):
    """Scrape job listings from Glassdoor by attaching to an already open Chrome session.

    Chrome must already be running with ``--remote-debugging-port=9222``; the
    driver attaches to that session at ``127.0.0.1:9222`` rather than starting
    a new browser. Result pages are walked via the "next page" control until
    ``num_jobs`` listings have been collected, no further page is available,
    or loading/navigation fails.

    Args:
        keyword: Search phrase; URL-encoded before being put in the query string.
        num_jobs: Maximum number of listings to collect.
        verbose: When true, print per-card and pagination progress.

    Returns:
        A pandas DataFrame with the columns ``title``, ``company``,
        ``location``, ``salary``, ``rating``, ``posted`` and ``description``
        (in that order) and at most ``num_jobs`` rows. The frame is empty,
        but still has these columns, when nothing could be scraped.
    """
    # Attach to an already running Chrome session (start Chrome with --remote-debugging-port=9222)
    options = Options()
    options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # Open Glassdoor job search page
    encoded_keyword = urllib.parse.quote_plus(keyword)
    url = f"https://www.glassdoor.com/Job/jobs.htm?sc.keyword={encoded_keyword}"
    driver.get(url)
    time.sleep(5)

    jobs = []
    page_num = 1

    while len(jobs) < num_jobs:
        print(f"Attempting to fetch new job listings on page {page_num}")
        try:
            # Wait for job cards to load
            job_cards = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.jobCard"))
            )
        except TimeoutException:
            print("⚠️ Timeout: No jobs loaded or page blocked.")
            break
        except Exception as e:
            print(f"⚠️ Error loading jobs: {e}")
            break

        if verbose:
            print(f"✅ Found {len(job_cards)} job cards on page {page_num}.")

        # Loop through job cards and extract information
        for i, card in enumerate(job_cards):
            if len(jobs) >= num_jobs:
                break

            try:
                job = extract_job_data(card, verbose)
            except Exception as e:
                # Best effort: one broken card must not abort the whole page.
                if verbose:
                    print(f"⚠️ Error scraping card {i + 1}: {e}")
                continue

            # Only keep the card if it yielded a real title or company.
            # extract_job_data reports missing fields as the truthy
            # placeholder "N/A", so a plain truthiness test would accept
            # every card.
            if not (_has_value(job.get("title")) or _has_value(job.get("company"))):
                continue

            jobs.append(job)
            if verbose:
                print(f"✅ Scraped {len(jobs)}/{num_jobs}: {job['title']} @ {job['company']}")

        # Check if we have enough jobs
        if len(jobs) >= num_jobs:
            break

        # Go to next page
        try:
            next_btn = _find_next_button(driver)
            if next_btn is None:
                if verbose:
                    print("ℹ️ No more pages available.")
                break

            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", next_btn)
            page_num += 1
            # Randomised pause before looking for the next page's cards.
            time.sleep(random.uniform(3, 5))
        except Exception as e:
            if verbose:
                print(f"ℹ️ No more pages or error navigating: {e}")
            break

    # Build the frame with every expected column present and in a fixed
    # order; columns that are absent (e.g. when nothing was scraped) are
    # created and filled with "".
    expected_columns = ["title", "company", "location", "salary", "rating", "posted", "description"]
    df = pd.DataFrame(jobs).reindex(columns=expected_columns, fill_value="")

    print(f"\n📊 Total jobs scraped: {len(df)}")
    return df


# CSS selectors tried, in order, to locate the "next page" control.
_NEXT_PAGE_SELECTORS = (
    "button[aria-label='Next Page']",
    "button[aria-label='next']",
    "a[aria-label='Next Page']",
    "[data-test='pagination-next']",
)


def _has_value(value):
    """Return True when *value* is real scraped text, not empty or the "N/A" placeholder."""
    return bool(value) and value.strip() not in ("", "N/A")


def _find_next_button(driver):
    """Return the first enabled "next page" element, or None when there is none."""
    for selector in _NEXT_PAGE_SELECTORS:
        try:
            candidate = driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            # This selector does not match the current page; try the next one.
            continue
        if candidate.is_enabled():
            return candidate
    return None

def extract_job_data(job_card_element: WebElement, verbose=False) -> dict:
    """Collect the listing fields from a single job card element.

    Every field is looked up with a CSS selector scoped to the card. A field
    whose element cannot be found is recorded as the string "N/A".

    Args:
        job_card_element: The card element to search within.
        verbose: When true, print the extracted dictionary.

    Returns:
        A dict with the keys title, company, location, salary, rating,
        posted and description, in that order.
    """
    # (result key, CSS selector) pairs, listed in the order the keys appear
    # in the returned dict.
    field_selectors = (
        ("title", 'a[data-test="job-title"]'),
        ("company", 'span[class^="EmployerProfile_compactEmployerName__"]'),
        ("location", '*[data-test="emp-location"]'),
        ("salary", '*[data-test="detailSalary"]'),
        ("rating", 'span[class^="rating-single-star_RatingText__"]'),
        ("posted", 'span[class*="JobCard_postedDate"]'),
        ("description", 'div[class*="jobDescription"]'),
    )

    record = {}
    for field, css in field_selectors:
        try:
            record[field] = job_card_element.find_element(By.CSS_SELECTOR, css).text
        except NoSuchElementException:
            # The card has no element for this field.
            record[field] = "N/A"

    if verbose:
        print(f"Job Data Extracted: {record}")

    return record


# Example usage:
if __name__ == "__main__":
    # Chrome must already be running with remote debugging enabled, e.g.:
    #   chrome.exe --remote-debugging-port=9222
    # get_jobs attaches to that session instead of launching a new browser.
    print("Starting job scrapper")

    df = get_jobs("software engineer", num_jobs=100, verbose=True)
    print("\nFirst few jobs:")
    # Show only the leading rows so the output matches the heading above.
    print(df.head())

Starting job scrapper
Attempting to fetch new job listings on page 1
✅ Found 30 job cards on page 1.
Job Data Extracted: {'title': '.NET Full Stack Software Engineer II', 'company': 'The Fresh Market Inc', 'location': 'Greensboro, NC', 'salary': '$74K - $105K (Glassdoor est.)', 'rating': '3.1', 'posted': 'N/A', 'description': 'N/A'}
✅ Scraped 1/100: .NET Full Stack Software Engineer II @ The Fresh Market Inc
Job Data Extracted: {'title': 'Software Engineer II, Backend', 'company': 'Included Health', 'location': 'Remote', 'salary': 'N/A', 'rating': '3.1', 'posted': 'N/A', 'description': 'N/A'}
✅ Scraped 2/100: Software Engineer II, Backend @ Included Health
Job Data Extracted: {'title': 'Software Engineer', 'company': 'HOJ INNOVATIONS, INC', 'location': 'Salt Lake City, UT', 'salary': '$75K - $90K (Employer provided)', 'rating': '4.3', 'posted': 'N/A', 'description': 'N/A'}
✅ Scraped 3/100: Software Engineer @ HOJ INNOVATIONS, INC
Job Data Extracted: {'title': 'Software Engineer', 'comp

In [124]:
# Notebook cell: evaluate `df` so the scraped DataFrame is rendered as the cell output.
df

Unnamed: 0,title,company,location,salary,rating,posted,description
0,.NET Full Stack Software Engineer II,The Fresh Market Inc,"Greensboro, NC",$74K - $105K (Glassdoor est.),3.1,,
1,"Software Engineer II, Backend",Included Health,Remote,,3.1,,
2,Software Engineer,"HOJ INNOVATIONS, INC","Salt Lake City, UT",$75K - $90K (Employer provided),4.3,,
3,Software Engineer,Nixon web tech,"Fremont, CA",$105K - $130K (Employer provided),,,
4,Django Software Engineer,AWRE Sports,Remote,$65K - $150K (Employer provided),,,
5,Software Engineer (Junior),Terran Robotics Inc.,Remote,$70K - $110K (Employer provided),,,
6,Full Stack Engineer,Campfire,"San Francisco, CA",$122K - $200K (Glassdoor est.),3.7,,
7,Full Stack Developer,IMMEDIATE MAILING SERVICE INC,"Liverpool, NY",$90K - $110K (Employer provided),2.9,,
8,Associate Software Engineer,Protolabs,North Carolina,$73K - $97K (Employer provided),2.7,,
9,Front End Software Engineer,Sparksuite,"Spring, TX",$88K - $96K (Employer provided),,,
