## Glassdoor Scraper function

In [9]:
"""
author: Kenarapfaik
url: https://github.com/arapfaik/scraping-glassdoor-selenium
"""
import time
from urllib.parse import quote

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager


# Placeholder stored when a field cannot be found on the page.
_NOT_FOUND = -1

# How many times to look for the job detail pane before skipping a listing,
# and how long to wait between attempts (seconds).
_MAX_DETAIL_ATTEMPTS = 5
_DETAIL_RETRY_DELAY = 5

# (result-row key, label shown on the "Company" tab, caption printed in verbose mode)
_COMPANY_FIELDS = (
    ("Headquarters", "Headquarters", "Headquarters"),
    ("Size", "Size", "Size"),
    ("Founded", "Founded", "Founded"),
    ("Type of ownership", "Type", "Type of Ownership"),
    ("Industry", "Industry", "Industry"),
    ("Sector", "Sector", "Sector"),
    ("Revenue", "Revenue", "Revenue"),
    ("Competitors", "Competitors", "Competitors"),
)


def _build_search_url(keyword):
    """Return the Glassdoor job-search URL for *keyword*, percent-encoding it."""
    # Encode everything (safe="") so characters such as "&", "#" or "+" in the
    # keyword cannot break or truncate the query string.
    encoded = quote(keyword, safe="")
    return (
        "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
        "&clickSource=searchBtn&typedKeyword=" + encoded
        + "&sc.keyword=" + encoded
        + "&locT=&locId=&jobType=&includeNoSalaryJobs=false"
    )


def _text_or_default(driver, xpath, default=_NOT_FOUND):
    """Return the text of the first element matching *xpath*, or *default* if there is none."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default


def _dismiss_signup_prompt(driver):
    """Best-effort attempt to get rid of the "Sign Up" prompt."""
    try:
        driver.find_element_by_class_name("selected").click()
    except (NoSuchElementException, ElementClickInterceptedException):
        # Deliberately ignored: the prompt handling below still gets its chance.
        pass

    time.sleep(1)

    try:
        driver.find_element_by_css_selector('[alt="Close"]').click()  # clicking the X.
        print(' x out worked')
    except (NoSuchElementException, ElementClickInterceptedException):
        print(' x out failed')


def _read_job_details(driver):
    """Read the core fields of the currently opened job, or return None if they never appear."""
    for _ in range(_MAX_DETAIL_ATTEMPTS):
        try:
            return {
                "Company Name": driver.find_element_by_xpath('.//div[@class="employerName"]').text,
                "Location": driver.find_element_by_xpath('.//div[@class="location"]').text,
                "Job Title": driver.find_element_by_xpath('.//div[contains(@class, "title")]').text,
                "Job Description": driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text,
            }
        except WebDriverException:
            # The detail pane may still be loading; wait and try again.
            time.sleep(_DETAIL_RETRY_DELAY)
    return None


def _read_company_info(driver):
    """Read every field of the "Company" tab, using the placeholder for missing ones."""
    # Example of the markup being matched:
    # <div class="infoEntity">
    #     <label>Headquarters</label>
    #     <span class="value">San Francisco, CA</span>
    # </div>
    xpath = './/div[@class="infoEntity"]//label[text()="{}"]//following-sibling::*'
    return {key: _text_or_default(driver, xpath.format(label)) for key, label, _ in _COMPANY_FIELDS}


def get_jobs(keyword, num_jobs, verbose, slp_time):
    """Gather job postings scraped from Glassdoor as a dataframe.

    Opens a Chrome window, searches Glassdoor for ``keyword`` and clicks
    through the listings page by page until ``num_jobs`` postings have been
    collected or there is no further page. The browser is always closed
    before returning, including when an error is raised.

    Args:
        keyword: Search phrase; it is percent-encoded before being placed in the URL.
        num_jobs: Number of postings to collect before stopping.
        verbose: When true, print every field of each posting as it is scraped.
        slp_time: Seconds to wait after each results page is requested, to let it load.

    Returns:
        A pandas DataFrame with one row per posting and the columns
        Job Title, Salary Estimate, Job Description, Rating, Company Name,
        Location, Headquarters, Size, Founded, Type of ownership, Industry,
        Sector, Revenue and Competitors. Fields that could not be found on
        the page hold ``-1``.
    """
    # Initializing the webdriver
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1480')
    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # The options must be handed to the driver, otherwise none of them take effect.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    jobs = []

    try:
        driver.set_window_size(1920, 1480)
        driver.get(_build_search_url(keyword))

        while len(jobs) < num_jobs:  # Still looking for new jobs.

            # Let the page load. Change this number based on your internet speed.
            time.sleep(slp_time)

            _dismiss_signup_prompt(driver)

            # Going through each job in this page.
            # "jl" stands for Job Listing; these are the buttons we are going to click.
            for job_button in driver.find_elements_by_class_name("jl"):

                print("Progress: {}/{}".format(len(jobs), num_jobs))
                if len(jobs) >= num_jobs:
                    break

                time.sleep(1)
                try:
                    job_button.click()
                except WebDriverException:
                    # Listing could not be opened (stale, covered, ...): skip it.
                    continue
                time.sleep(1)

                details = _read_job_details(driver)
                if details is None:
                    # The detail pane never rendered; skip instead of waiting forever.
                    continue

                # NOTE(review): these XPaths are evaluated against the whole page,
                # so they may return the first matching listing's value rather
                # than the clicked job's - verify against the live markup.
                salary_estimate = _text_or_default(driver, './/span[@class="gray salary"]')
                rating = _text_or_default(driver, './/span[@class="rating"]')

                # Printing for debugging
                if verbose:
                    print("Job Title: {}".format(details["Job Title"]))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(details["Job Description"][:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(details["Company Name"]))
                    print("Location: {}".format(details["Location"]))

                # Going to the Company tab, i.e. clicking on:
                # <div class="tab" data-tab-type="overview"><span>Company</span></div>
                try:
                    driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()
                    company_info = _read_company_info(driver)
                except NoSuchElementException:
                    # Rarely, some job postings do not have the "Company" tab.
                    company_info = {key: _NOT_FOUND for key, _, _ in _COMPANY_FIELDS}
                except WebDriverException:
                    continue

                if verbose:
                    for key, _, caption in _COMPANY_FIELDS:
                        print("{}: {}".format(caption, company_info[key]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                # Key order here defines the column order of the resulting dataframe.
                record = {
                    "Job Title": details["Job Title"],
                    "Salary Estimate": salary_estimate,
                    "Job Description": details["Job Description"],
                    "Rating": rating,
                    "Company Name": details["Company Name"],
                    "Location": details["Location"],
                }
                record.update(company_info)
                jobs.append(record)

            if len(jobs) >= num_jobs:
                # Target reached; no need to load another page.
                break

            # Clicking on the "next page" button
            try:
                driver.find_element_by_xpath('.//li[@class="next"]//a').click()
            except WebDriverException:
                # No (clickable) next page: return what has been collected so far.
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()

    return pd.DataFrame(jobs)  # Converts the list of row dicts into a pandas DataFrame.

In [10]:
# Scrape up to 2500 "data science" postings quietly, waiting 14 s per results page.
df = get_jobs(keyword="data science", num_jobs=2500, verbose=False, slp_time=14)

[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183


 


[WDM] - Driver [C:\Users\Shaon\.wdm\drivers\chromedriver\win32\85.0.4183.87\chromedriver.exe] found in cache


 x out worked
Progress: 0/2500
Progress: 1/2500
Progress: 2/2500
Progress: 3/2500
Progress: 4/2500
Progress: 5/2500
Progress: 6/2500
Progress: 7/2500
Progress: 8/2500
Progress: 9/2500
Progress: 10/2500
Progress: 11/2500
Progress: 12/2500
Progress: 13/2500
Progress: 14/2500
Progress: 15/2500
Progress: 16/2500
Progress: 17/2500
Progress: 18/2500
Progress: 19/2500
Progress: 20/2500
Progress: 21/2500
Progress: 22/2500
Progress: 23/2500
Progress: 24/2500
Progress: 25/2500
Progress: 26/2500
Progress: 27/2500
Progress: 28/2500
Progress: 29/2500
 x out failed
Progress: 30/2500
Progress: 31/2500
Progress: 32/2500
Progress: 33/2500
Progress: 34/2500
Progress: 35/2500
Progress: 36/2500
Progress: 37/2500
Progress: 38/2500
Progress: 39/2500
Progress: 40/2500
Progress: 41/2500
Progress: 42/2500
Progress: 43/2500
Progress: 44/2500
Progress: 45/2500
Progress: 46/2500
Progress: 47/2500
Progress: 48/2500
Progress: 49/2500
Progress: 50/2500
Progress: 51/2500
Progress: 52/2500
Progress: 53/2500
Progress: 

Progress: 400/2500
Progress: 401/2500
Progress: 402/2500
Progress: 403/2500
Progress: 404/2500
Progress: 405/2500
Progress: 406/2500
Progress: 407/2500
Progress: 408/2500
Progress: 409/2500
Progress: 410/2500
Progress: 411/2500
Progress: 412/2500
Progress: 413/2500
Progress: 414/2500
Progress: 415/2500
Progress: 416/2500
Progress: 417/2500
Progress: 418/2500
Progress: 419/2500
Progress: 420/2500
Progress: 421/2500
Progress: 422/2500
Progress: 423/2500
Progress: 424/2500
Progress: 425/2500
Progress: 426/2500
Progress: 427/2500
Progress: 428/2500
 x out failed
Progress: 429/2500
Progress: 430/2500
Progress: 431/2500
Progress: 432/2500
Progress: 433/2500
Progress: 434/2500
Progress: 435/2500
Progress: 436/2500
Progress: 437/2500
Progress: 438/2500
Progress: 439/2500
Progress: 440/2500
Progress: 441/2500
Progress: 442/2500
Progress: 443/2500
Progress: 444/2500
Progress: 445/2500
Progress: 446/2500
Progress: 447/2500
Progress: 448/2500
Progress: 449/2500
Progress: 450/2500
Progress: 451/250

Progress: 768/2500
Progress: 769/2500
Progress: 770/2500
Progress: 771/2500
Progress: 772/2500
Progress: 773/2500
Progress: 774/2500
Progress: 775/2500
Progress: 776/2500
Progress: 776/2500
Progress: 777/2500
Progress: 777/2500
Progress: 778/2500
Progress: 779/2500
Progress: 780/2500
Progress: 781/2500
Progress: 782/2500
Progress: 783/2500
Progress: 784/2500
Progress: 784/2500
Progress: 785/2500
Progress: 785/2500
Progress: 786/2500
Progress: 786/2500
Progress: 787/2500
Progress: 788/2500
Progress: 789/2500
Progress: 790/2500
 x out failed
Progress: 791/2500
Progress: 792/2500
Progress: 793/2500
Progress: 794/2500
Progress: 795/2500
Progress: 796/2500
Progress: 797/2500
Progress: 798/2500
Progress: 799/2500
Progress: 800/2500
Progress: 801/2500
Progress: 801/2500
Progress: 802/2500
Progress: 802/2500
Progress: 803/2500
Progress: 804/2500
Progress: 805/2500
Progress: 806/2500
Progress: 807/2500
Progress: 808/2500
Progress: 809/2500
Progress: 809/2500
Progress: 810/2500
Progress: 810/250

In [12]:
# Preview the last rows of the scraped data.
df.tail()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
1029,Data Scientist,$229K-$350K (Glassdoor est.),Aireon has deployed the world’s first and glob...,4.7,Aireon\n4.7,"McLean, VA",-1,51 to 200 Employees,2011,Company - Private,Aerospace & Defense,Aerospace & Defense,Unknown / Non-Applicable,-1
1030,Machine Learning Engineer,$229K-$350K (Glassdoor est.),ABOUT BARK\n\nBARK is a company building produ...,2.9,BARK\n2.9,"New York, NY",-1,201 to 500 Employees,2011,Company - Private,Pet & Pet Supplies Stores,Retail,Unknown / Non-Applicable,-1
1031,Data Scientist - Medicare Advantage Generalist,$229K-$350K (Glassdoor est.),A bit about us:\n\nWe’re on a mission to chang...,4.3,Devoted Health\n4.3,Remote,-1,201 to 500 Employees,2017,Company - Private,Health Care Services & Hospitals,Health Care,Unknown / Non-Applicable,-1
1032,"Data Scientist 2 - Cross Function (Assortment,...",$229K-$350K (Glassdoor est.),Job Description\n\n\nData Scientist 2\nOff-Pri...,3.7,Nordstrom\n3.7,"Los Angeles, CA",-1,10000+ Employees,1901,Company - Public,"Department, Clothing, & Shoe Stores",Retail,$10+ billion (USD),-1
1033,Data Scientist,$229K-$350K (Glassdoor est.),"Colorado Springs, Colorado\nSkills : SQL,R,C++...",4.1,Collabera\n4.1,"Colorado Springs, CO",-1,10000+ Employees,1991,Company - Private,IT Services,Information Technology,$500 million to $1 billion (USD),-1


In [14]:
# Remove the two columns in place; in the previewed rows they hold only the
# -1 "not found" placeholder.
unused_columns = ['Headquarters', 'Competitors']
df.drop(columns=unused_columns, inplace=True)

In [15]:
# Preview the first rows after dropping the unused columns.
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Software Engineer - Data Science,$60K-$127K (Glassdoor est.),Join ClearEdge and be a part of the team of me...,4.1,ClearEdge\n4.1,"Annapolis Junction, MD",51 to 200 Employees,2002,Company - Private,Computer Hardware & Software,Information Technology,$5 to $10 million (USD)
1,Quality Systems Data Science Engineer,$60K-$127K (Glassdoor est.),It is our people behind life’s passions who wi...,3.7,Mercury Marine\n3.7,"Fond du Lac, WI",1001 to 5000 Employees,1939,Company - Public,Consumer Products Manufacturing,Manufacturing,$2 to $5 billion (USD)
2,Inside Sales Representative (Headquarters),$60K-$127K (Glassdoor est.),Becoming a Sales Professional at Red Ventures ...,2.5,Red Ventures - Sales\n2.5,"Charlotte, NC",1001 to 5000 Employees,2000,Company - Private,Advertising & Marketing,Business Services,$1 to $2 billion (USD)
3,Data Scientist,$60K-$127K (Glassdoor est.),Braxton Technologies is looking for a data sci...,1.6,Braxton Science & Technology Group\n1.6,"Colorado Springs, CO",51 to 200 Employees,-1,Company - Private,-1,-1,Less than $1 million (USD)
4,Data Science Engineer Full Stack - AI,$60K-$127K (Glassdoor est.),Data Science Engineer Full Stack - AI\n\nOverv...,2.8,BryterCX\n2.8,United States,51 to 200 Employees,2000,Company - Private,Enterprise Software & Network Solutions,Information Technology,$25 to $50 million (USD)


In [16]:
# Persist the cleaned dataframe as UTF-8 CSV in the working directory.
# NOTE(review): the file name says "indeed" although the data is scraped from
# Glassdoor - confirm whether downstream code relies on this name.
output_path = "indeed_ds_data.csv"
df.to_csv(output_path, encoding='utf-8')

In [17]:
# Drop the notebook's reference to the dataframe now that it has been saved.
del df

In [18]:
# Invoke the garbage collector to free RAM; as the last expression of the
# cell, the value returned by gc.collect() is what the notebook displays.
import gc
gc.collect()

1946