In [15]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from sys import stdout

In [18]:
def get_jobs(keyword, num_jobs, verbose, slp_time):
    
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''
    
    try: 
        #Initializing the webdriver
        options = webdriver.ChromeOptions()

        #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
        #options.add_argument('headless')

        #Change the path to where chromedriver is in your home folder.
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.set_window_size(1120, 1000)

        url =  'https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="' + keyword + '"bioinformatics&sc.keyword=bioinformatics&locT=&locId=&jobType='

        driver.get(url)
        jobs = []

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Let the page load. Change this number based on your internet speed.
            #Or, wait until the webpage is loaded, instead of hardcoding it.
            time.sleep(slp_time)

            #Test for the "Sign Up" prompt and get rid of it.
            try:
                driver.find_element_by_class_name("selected").click()
            except ElementClickInterceptedException:
                pass
            time.sleep(.1)

            try:
                driver.find_element_by_css_selector('[alt="Close"]').click()
               #driver.find_element_by_class_name("ModalStyle__xBtn___29PT9").click()  #clicking to the X.
            except NoSuchElementException:
                pass


            #Going through each job in this page
            job_buttons = driver.find_elements_by_class_name("jl")  #jl for Job Listing. These are the buttons we're going to click.
            for job_button in job_buttons: 

                progress = "Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs))
                stdout.write("\r%s" % progress)
                stdout.flush()
                #print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
                if len(jobs) >= num_jobs:
                    break

                driver.execute_script("arguments[0].click();", job_button)
                #job_button.click()  #You might 
                time.sleep(1)
                collected_successfully = False

                while not collected_successfully:
                    try:
                        company_name = driver.find_element_by_xpath('.//div[@class="employerName"]').text
                        location = driver.find_element_by_xpath('.//div[@class="location"]').text
                        job_title = driver.find_element_by_xpath('.//div[contains(@class, "title")]').text
                        job_description = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text
                        collected_successfully = True
                    except:
                        time.sleep(5)

                try:
                    salary_estimate = driver.find_element_by_xpath('.//span[@class="gray salary"]').text
                except NoSuchElementException:
                    salary_estimate = -1 #You need to set a "not found value. It's important."

                try:
                    rating = driver.find_element_by_xpath('.//span[@class="rating"]').text
                except NoSuchElementException:
                    rating = -1 #You need to set a "not found value. It's important."

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                #Going to the Company tab...
                #clicking on this:
                #<div class="tab" data-tab-type="overview"><span>Company</span></div>
                try:
                    #driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()
                    element = driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]')
                    driver.execute_script("arguments[0].click();", element)
                    try:
                        #<div class="infoEntity">
                        #    <label>Headquarters</label>
                        #    <span class="value">San Francisco, CA</span>
                        #</div>
                        headquarters = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
                    except NoSuchElementException:
                        headquarters = -1

                    try:
                        size = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
                    except NoSuchElementException:
                        size = -1

                    try:
                        founded = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
                    except NoSuchElementException:
                        founded = -1

                    try:
                        type_of_ownership = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                    except NoSuchElementException:
                        type_of_ownership = -1

                    try:
                        industry = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
                    except NoSuchElementException:
                        industry = -1

                    try:
                        sector = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
                    except NoSuchElementException:
                        sector = -1

                    try:
                        revenue = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
                    except NoSuchElementException:
                        revenue = -1

                    try:
                        competitors = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
                    except NoSuchElementException:
                        competitors = -1

                except NoSuchElementException:  #Rarely, some job postings do not have the "Company" tab.
                    headquarters = -1
                    size = -1
                    founded = -1
                    type_of_ownership = -1
                    industry = -1
                    sector = -1
                    revenue = -1
                    competitors = -1


                if verbose:
                    print("Headquarters: {}".format(headquarters))
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("Competitors: {}".format(competitors))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                jobs.append({"Job Title" : job_title,
                "Salary Estimate" : salary_estimate,
                "Job Description" : job_description,
                "Rating" : rating,
                "Company Name" : company_name,
                "Location" : location,
                "Headquarters" : headquarters,
                "Size" : size,
                "Founded" : founded,
                "Type of ownership" : type_of_ownership,
                "Industry" : industry,
                "Sector" : sector,
                "Revenue" : revenue,
                "Competitors" : competitors})
                #add job to jobs

            #Clicking on the "next page" button
            try:
                driver.find_element_by_xpath('.//li[@class="next"]//a').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break

        return pd.DataFrame(jobs)  #This line converts the dictionary object into a pandas DataFrame.
    except KeyboardInterrupt:
        print("Script was interrupted. Returning current gathered data...")
        return pd.DataFrame(jobs)

In [19]:
#This line will open a new chrome window and start the scraping.
df = get_jobs("data scientist", 100, False, 15)
df.to_csv("bioinf-100jobs-dataset.csv", index=False)
print("\n DONE!")
df.head()

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [/home/nikicajea/.wdm/drivers/chromedriver/linux64/83.0.4103.39/chromedriver] found in cache


 
Progress: 100/100
 DONE!


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Clinical Bioinformatics Engineer,$65K-$118K (Glassdoor est.),Posting Title\nClinical Bioinformatics Enginee...,3.9,Novartis\n3.9,"Cambridge, MA","Basel, Switzerland",10000+ employees,1996,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),-1
1,"Development Scientist I, Bioinformatics",$65K-$118K (Glassdoor est.),New England Biolabs is seeking a computational...,4.9,New England Biolabs\n4.9,"Ipswich, MA","Ipswich, MA",201 to 500 employees,1974,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable,"Thermo Fisher Scientific, Enzymatics, Illumina"
2,Senior Data Scientist,$65K-$118K (Glassdoor est.),"SkySync is a dynamic, fast-paced, venture-back...",5.0,SkySync\n5.0,"Ann Arbor, MI","Ann Arbor, MI",51 to 200 employees,2011,Company - Private,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,-1
3,Cardiopulmonary Specialist - PAH,$65K-$118K (Glassdoor est.),What we do\n\nUnited Therapeutics Corporation ...,3.2,United Therapeutics\n3.2,"Detroit, MI","Silver Spring, MD",501 to 1000 employees,1996,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$1 to $2 billion (USD),-1
4,Data Analyst,$65K-$118K (Glassdoor est.),PerBlue is looking for an aspiring Data Analys...,4.3,PerBlue\n4.3,"Madison, WI","Madison, WI",51 to 200 employees,2010,Company - Private,Video Games,Media,Unknown / Non-Applicable,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"Bioinformatics Scientist, Human Genetics",$139K-$166K (Glassdoor est.),The Position\n\n\nGenentech is inviting applic...,4.0,Genentech\n4.0,"South San Francisco, CA","South San Francisco, CA",10000+ employees,1976,Subsidiary or Business Segment,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),-1
96,Bioinformatics Scientist - NIH- Research Trian...,$139K-$166K (Glassdoor est.),Bioinformatics Scientist - NIH - Research Tria...,3.4,Kelly\n3.4,"Durham, NC","Troy, MI",5001 to 10000 employees,1946,Company - Public,Staffing & Outsourcing,Business Services,$5 to $10 billion (USD),"Adecco, ManpowerGroup, Allegis Corporation"
97,Research Associate,$139K-$166K (Glassdoor est.),The Department of Biostatistics and Informatic...,-1,University of Colorado Anschutz Medical Campus,"Aurora, CO",-1,-1,-1,-1,-1,-1,-1,-1
98,Data Analyst,$139K-$166K (Glassdoor est.),As we navigate through these unprecedented tim...,3.4,Midland Credit Management\n3.4,"San Diego, CA","San Diego, CA",1001 to 5000 employees,1953,Subsidiary or Business Segment,Banks & Credit Unions,Finance,$1 to $2 billion (USD),PRA Group
