In [18]:
# Import all necessary libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium_stealth import stealth
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import pandas as pd
import time
import random
import csv
import os

I decided to scrape the Glassdoor job board to get insights into the demand for Power BI versus Tableau in the global analytics job market. The scraper works much like the myjobmag scraper. Upon inspection of the website's HTML structure: once you search for a job keyword, say 'Business Intelligence Analyst', you are redirected to a results page populated with several job cards. To access the full description of a job, you have to click on its job card, which dynamically loads a panel on the right with the full details.

In [19]:
#function to initialize the browser

def init_driver():
    """Create a Chrome WebDriver configured to look less like an automated browser.

    Builds a ChromeOptions object (maximized window, automation hints disabled),
    starts Chrome with a driver binary resolved by ``ChromeDriverManager``, and
    applies ``selenium_stealth`` overrides to the session.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--start-maximized')
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Chrome's switch is spelled with a hyphen ("enable-automation"); the
    # underscore spelling matches nothing, so the exclusion had no effect.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be the boolean False: the string 'False' is a non-empty (truthy)
    # value and is not a valid boolean capability.
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return driver



In [20]:
# Resume support: collect the links already stored in the CSV by earlier runs,
# then open the same file for appending so existing rows are preserved.

CSV_PATH = "glassdoor_data.csv"
CSV_FIELDS = ["title", "description", "link"]

scraped_links = set()

if os.path.exists(CSV_PATH):
    # newline="" is what the csv module expects for files it reads or writes;
    # it matters here because job descriptions contain embedded line breaks.
    with open(CSV_PATH, "r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            link = row["link"]
            # Truncated rows yield None for missing columns; don't record those.
            if link:
                scraped_links.add(link)

print(f"Resuming... {len(scraped_links)} jobs already scraped.")

# Append mode, so as not to overwrite data already in the CSV.
csv_file = open(CSV_PATH, "a", newline="", encoding="utf-8")
writer = csv.DictWriter(csv_file, fieldnames=CSV_FIELDS)

# Write the header only if the file is empty (i.e. it was just created).
if os.path.getsize(CSV_PATH) == 0:
    writer.writeheader()

Resuming... 351 jobs already scraped.


In [21]:
# Start the browser and open the Glassdoor search results page.
driver = init_driver()
base_url = 'https://www.glassdoor.com/Job/jobs.htm?sc.occupationParam=power+bi%2C+tableau&sc.locationSeoString=United+States&locId=1&locT=N'
driver.get(base_url)
time.sleep(random.uniform(4, 6))

# Pause for manual CAPTCHA solving
input("Please solve the CAPTCHA manually in the browser window, then press Enter to continue...")

MAX_LOAD_MORE_CLICKS = 15       # how many times to expand the result list
MAX_CONSECUTIVE_FAILURES = 3    # give up after this many failed attempts in a row
LOAD_MORE_SELECTOR = "button[data-test='load-more']"

click_count = 0
failed_attempts = 0
while click_count < MAX_LOAD_MORE_CLICKS and failed_attempts < MAX_CONSECUTIVE_FAILURES:
    try:
        show_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, LOAD_MORE_SELECTOR))
        )
        driver.execute_script("arguments[0].scrollIntoView();", show_more_button)
        time.sleep(random.uniform(4, 6))
        show_more_button.click()
    except Exception as e:
        # Retry through the loop instead of an unguarded click: the old handler
        # crashed when the button no longer existed and could spin forever when
        # the button stayed unclickable.
        failed_attempts += 1
        print(f'Reason for failure {e}')
        time.sleep(random.uniform(1, 2))
    else:
        click_count += 1
        failed_attempts = 0
        print(f'More jobs have been loaded {click_count} times')

# After expanding the list, collect the href of every job-title anchor.
# Reading attributes from the already-loaded page makes no request, so no
# per-link delay is needed here.
job_links = []
seen_hrefs = set()

link_elements = driver.find_elements(By.CSS_SELECTOR, "a[data-test='job-title']")
for element in link_elements:
    href = element.get_attribute('href')
    # Skip anchors without an href and links that were already collected.
    if href and href not in seen_hrefs:
        seen_hrefs.add(href)
        job_links.append(href)


More jobs have been loaded 1 times
More jobs have been loaded 2 times
More jobs have been loaded 3 times
More jobs have been loaded 4 times
More jobs have been loaded 5 times
More jobs have been loaded 6 times
More jobs have been loaded 7 times
More jobs have been loaded 8 times
More jobs have been loaded 9 times
More jobs have been loaded 10 times
More jobs have been loaded 11 times
More jobs have been loaded 12 times
More jobs have been loaded 13 times
More jobs have been loaded 14 times
More jobs have been loaded 15 times


In [22]:
# Visit every collected job link, extract title and description, and append one
# CSV row per job. The browser and the CSV file are released even if the loop
# is interrupted part-way.
try:
    for link in job_links:
        # Resume support: skip links that are already stored in the CSV (or
        # that were written earlier in this same run).
        if link in scraped_links:
            print(f"Skipping already scraped job: {link}")
            continue

        try:
            driver.get(link)
            time.sleep(random.uniform(3, 5))

            # Expand the description if the page offers a "show more" button.
            try:
                show_more = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-test='show-more-cta']"))
                )
                show_more.click()
                time.sleep(2)
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C / kernel
                # interrupt still stops the run.
                print(" Show more not found on page")

            # Get job title
            try:
                title = driver.find_element(By.CSS_SELECTOR, "h1.heading_Heading__BqX5J").text.strip()
            except Exception:
                title = "N/A"

            # Get job description - read the text of the whole description container
            try:
                container = driver.find_element(By.CLASS_NAME, "JobDetails_jobDescription__uW_fK")
                description = container.text.strip()
            except Exception as e:
                print(f"Scrapping failed: {e}")
                description = "N/A"

            writer.writerow({
                "title": title,
                "description": description,
                "link": link
            })
            # Flush after every row so progress survives a crash.
            csv_file.flush()
            scraped_links.add(link)

            print(f"Successfully Scraped: {title}")

        except Exception as e:
            print(f"There us and error scraping this link {link}: {e}")
            continue

        time.sleep(random.uniform(2, 4))
finally:
    driver.quit()
    csv_file.close()

print("Finished scraping Glassdoor jobs!")


Successfully Scraped: Power BI Crystal Reports
Successfully Scraped: Tableau Developer
Successfully Scraped: Power BI Developer
Successfully Scraped: Power BI Co-Op
Successfully Scraped: Power BI Designer
Successfully Scraped: Lead Power BI Developer
Successfully Scraped: Power BI Developer
Successfully Scraped: Power BI and Tableau Developer
Successfully Scraped: Power BI Admin
Successfully Scraped: Power BI Developer
 Show more not found on page
Successfully Scraped: Tableau Developer
Successfully Scraped: Data Analyst
Successfully Scraped: Power BI Developer/Analyst
Successfully Scraped: Sr. Power BI Developer
Successfully Scraped: Power Apps Developer
Successfully Scraped: Senior Power BI Developer
Successfully Scraped: Data Analyst - Power BI
Successfully Scraped: Power BI Developer- Clearance Required
Successfully Scraped: BI Engineer I
Successfully Scraped: Power BI Developer- Clearance Required
Successfully Scraped: Data Analyst
Successfully Scraped: Business Analyst (Entry Lev