In [1]:
pip install selenium webdriver-manager pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver-manager
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [webdriver-manager]
[1A[2KSuccessfully installed python-dotenv-1.1.1 webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Install dependencies first if needed
# !pip install selenium webdriver-manager pandas

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def get_jobs(keyword, num_jobs=10, verbose=True, slp_time=3):
    """
    Scrapes Glassdoor job listings into a pandas DataFrame.

    Launches Chrome (driver binary resolved through webdriver-manager), opens
    the Glassdoor job search page for ``keyword`` and walks the result pages,
    clicking each job card and reading its fields, until ``num_jobs`` rows are
    collected or no further page can be reached.

    Args:
        keyword: Search phrase. It is wrapped in double quotes and URL-encoded
            before being placed in the ``sc.keyword`` query parameter.
        num_jobs: Maximum number of postings to collect.
        verbose: When True, print popup status and per-card progress/errors.
        slp_time: Seconds to sleep after the initial page load, at the start
            of every page iteration, and after moving to the next page.

    Returns:
        pandas.DataFrame with the columns "Job Title", "Company Name",
        "Location", "Salary" and "Rating". "Salary" and "Rating" are None for
        cards that have no such element. The columns are present even when no
        rows were scraped.

    Raises:
        Any exception raised while starting the browser or loading the search
        page propagates to the caller; once the browser has been started it is
        closed before the exception leaves this function.

    NOTE(review): the CSS selectors below are tied to Glassdoor's page markup.
    The run recorded in this notebook found no ``li.react-job-listing``
    elements, so they may be out of date -- confirm against the live page.
    """
    # Function-scope import so this block does not depend on the import cell.
    from urllib.parse import quote_plus

    columns = ["Job Title", "Company Name", "Location", "Salary", "Rating"]

    options = Options()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")  # uncomment to run in background
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    jobs = []
    try:
        # Open Glassdoor job search page. The phrase keeps its surrounding
        # quotes, but is percent-encoded so spaces and quotes form a valid URL.
        url = "https://www.glassdoor.com/Job/jobs.htm?sc.keyword=" + quote_plus(f'"{keyword}"')
        print("Opening Glassdoor...")
        driver.get(url)
        time.sleep(slp_time)

        while len(jobs) < num_jobs:
            time.sleep(slp_time)

            _dismiss_signup_popup(driver, verbose)

            # Wait until job cards are present in the DOM
            try:
                job_cards = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li.react-job-listing'))
                )
            except TimeoutException:
                print("No job listings found on this page.")
                break

            for card in job_cards:
                if len(jobs) >= num_jobs:
                    break
                try:
                    job = _scrape_card(card)
                except Exception as e:
                    # Deliberate best-effort: one broken card must not abort
                    # the whole run, so report it and move on.
                    if verbose:
                        print("Error scraping a card:", e)
                    continue
                jobs.append(job)
                if verbose:
                    print(f"Scraped: {job['Job Title']} at {job['Company Name']}")

            # Enough rows collected: do not load another page just to discard it.
            if len(jobs) >= num_jobs:
                break

            # Go to next page
            try:
                driver.find_element(By.CSS_SELECTOR, 'li.next a').click()
            except NoSuchElementException:
                print("No more pages or reached scraping limit")
                break
            except ElementClickInterceptedException:
                # Something (e.g. an overlay) is covering the link; stop
                # instead of crashing and losing the rows gathered so far.
                print("Next-page link could not be clicked; stopping")
                break
            time.sleep(slp_time)
    finally:
        # Always release the browser, even when an exception escapes above.
        driver.quit()

    print(f"Scraping finished. Total jobs collected: {len(jobs)}")
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(jobs, columns=columns)


def _dismiss_signup_popup(driver, verbose):
    """Close the signup pop-up if its close button becomes clickable within 3 seconds."""
    try:
        close_button = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Close"]'))
        )
        close_button.click()
    except TimeoutException:
        if verbose:
            print("No signup popup found")
    except ElementClickInterceptedException:
        if verbose:
            print("Signup popup could not be closed")
    else:
        if verbose:
            print("Signup popup closed")


def _optional_text(element, css_selector):
    """Return the text of the first descendant matching ``css_selector``, or None if absent."""
    try:
        return element.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return None


def _scrape_card(card):
    """Click a job card and return its fields as a dict keyed by output column name."""
    card.click()
    time.sleep(1)  # short pause after the click before reading the card

    return {
        "Job Title": card.find_element(By.CSS_SELECTOR, 'a[data-test="job-link"]').text,
        "Company Name": card.find_element(By.CSS_SELECTOR, 'div[data-test="employer-name"]').text,
        "Location": card.find_element(By.CSS_SELECTOR, 'div[data-test="location"]').text,
        "Salary": _optional_text(card, 'span[data-test="detailSalary"]'),
        "Rating": _optional_text(card, 'span[data-test="detailRating"]'),
    }

In [3]:
# Scrape up to 5 "data scientist" postings, sleeping 5 seconds between page steps.
df = get_jobs(keyword="data scientist", num_jobs=5, verbose=True, slp_time=5)
# Last expression of the cell: preview the first rows.
df.head()

Scraping finished early. Got 0 jobs.


In [4]:
# Bracket the scrape with status messages so the cell output shows when it starts and ends.
print("Starting scraper...")
df = get_jobs(keyword="data scientist", num_jobs=5, verbose=True, slp_time=5)
print("Scraper finished.")
# Render the first rows of the result with the notebook's rich display.
display(df.head())

Starting scraper...
Scraping finished early. Got 0 jobs.
Scraper finished.


In [5]:
# Smaller, faster attempt: up to 3 postings with a 2-second sleep between page steps.
df = get_jobs(keyword="data scientist", num_jobs=3, verbose=True, slp_time=2)

Scraping finished early. Got 0 jobs.


In [12]:
import pandas as pd

# Collect up to 5 "data scientist" postings, sleeping 3 seconds between page steps
df = get_jobs(keyword="data scientist", num_jobs=5, verbose=True, slp_time=3)

# Preview the first rows of the scraped table
df.head()

Opening Glassdoor...
No signup popup found
No job listings found on this page.
Scraping finished. Total jobs collected: 0
