# Extracting Job URLs and Job Titles

In [26]:
#importing the necessary packages for web scraping
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException

In [27]:
# Chrome launch flags: run without a visible window, without the sandbox,
# and without relying on /dev/shm
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

In [28]:
# Start a Chrome session and load the company's job posting site.
# NOTE: this session is launched with default settings; it does not receive
# the chrome_options configured in the previous cell.
driver = webdriver.Chrome()
url = 'https://vizient.wd1.myworkdayjobs.com/Vizient_Careers'
driver.get(url)
# fixed one-second pause before the page source is read by the next cell
time.sleep(1)

In [29]:
# Walk the paginated job listing: collect the title and link of every posting on
# the current page, then click "next" until there is no further page.
job_title = []
job_link = []

# hrefs on the listing page are site-relative, so each one is prefixed with the host
base_url = 'https://vizient.wd1.myworkdayjobs.com'

try:
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # parse the anchors once and build both lists from the same result so
        # titles and links always stay aligned
        anchors = soup.find_all("a", {"data-automation-id": "jobTitle"})
        page_links = [base_url + anchor['href'] for anchor in anchors]
        job_title.extend(anchor.text for anchor in anchors)
        job_link.extend(page_links)

        try:
            # Re-locate the button on every page: an element found before a page
            # change can go stale, and a single-page listing has no button at all.
            next_button = driver.find_element('xpath', '//*[@aria-label="next"]')
            if not next_button.is_enabled():
                # a disabled button would otherwise be clicked forever
                break
            next_button.click()
        except NoSuchElementException:
            # no "next" control on this page -> last page reached
            break
        except WebDriverException as exc:
            # any other browser-side failure: keep what was collected so far,
            # but say why paging stopped instead of hiding it
            print("Stopped paging early: {}".format(exc))
            break

        time.sleep(1)
finally:
    # the listing browser is not needed after this cell; close it so the Chrome
    # process is not left running (re-run the driver cell before re-running this one)
    driver.quit()
    

In [30]:
# Pair each scraped title with its link (one row per posting) and discard
# rows that are exact repeats
title_link_pairs = zip(job_title, job_link)
df_title_link = pd.DataFrame(title_link_pairs, columns=['JOB_TITLE', 'JOB_LINK']).drop_duplicates()

In [31]:
# preview the first rows of the de-duplicated title/link table
df_title_link.head()

Unnamed: 0,JOB_TITLE,JOB_LINK
0,Analyst - Pharmacy Analytics,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...
1,Lead Analyst - Dedicated support Oklahoma/Arka...,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...
2,Price Management Analyst,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...
3,Networks Manager,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...
4,Senior Software Quality Engineer,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...


In [32]:
# (rows, columns) of the title/link table, i.e. number of unique postings found
df_title_link.shape

(100, 2)

# Extracting Job Descriptions and Qualifications

In [33]:
# retrieve job qualifications and descriptions
def _first_list_text(soup, heading_pattern):
    """Return the text of the first <ul> after a <b> heading matching the pattern, or ''."""
    heading = soup.find("b", string=heading_pattern)
    if heading is None:
        return ''
    bullet_list = heading.find_next("ul")
    return bullet_list.text if bullet_list is not None else ''


def job_description(title, link):
    """Visit every job posting and pull out its qualifications and responsibilities.

    For each URL in ``link`` the page is loaded in a Chrome session configured
    with the notebook-level ``chrome_options``. The text of the first ``<ul>``
    following a bold "Qualifications:" heading and the first ``<ul>`` following
    a bold "Responsibilities:" heading is collected (case-insensitive match).
    A posting that lacks a section contributes an empty string, so all result
    sequences keep one entry per job.

    Parameters
    ----------
    title : sequence of str
        Job titles, aligned element-by-element with ``link``.
    link : sequence of str
        Absolute URLs of the job postings.

    Returns
    -------
    tuple
        ``(jobtitle, link, qualifications, description)`` where ``jobtitle``,
        ``qualifications`` and ``description`` are lists and ``link`` is the
        input passed through unchanged.
    """
    # compile once; the patterns do not change between postings
    qualifications_pattern = re.compile("Qualifications:", re.IGNORECASE)
    responsibilities_pattern = re.compile("Responsibilities:", re.IGNORECASE)

    qualifications = []
    description = []
    jobtitle = []

    # keyword-only construction: the positional driver-path argument is not
    # accepted by current Selenium releases
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # zip pairs the inputs by position, so this also works when the
        # arguments are pandas Series whose index is not 0..n-1
        for job_name, job_url in zip(title, link):
            driver.get(job_url)
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            qualifications.append(_first_list_text(soup, qualifications_pattern))
            description.append(_first_list_text(soup, responsibilities_pattern))
            jobtitle.append(job_name)
    finally:
        # always release the browser, even if a page load raises
        driver.quit()

    return jobtitle, link, qualifications, description

In [34]:
# Scrape every posting's detail page, then print the lengths of the two result lists
titles_in = df_title_link['JOB_TITLE'].values
links_in = df_title_link['JOB_LINK'].values
title, link, qual, descrp = job_description(titles_in, links_in)
print(len(qual), len(descrp))

100 100


In [35]:
# Assemble the final table: one row per job, with the company name as the first column.
# Column names are passed to the constructor so an empty scrape still yields a
# correctly-labelled (empty) frame instead of failing on the column assignment.
df = pd.DataFrame(
    list(zip(title, qual, link, descrp)),
    columns=['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION'],
)
# insert at position 0 rather than appending and re-ordering by numeric position
df.insert(0, 'COMPANY', 'Vizient')

In [36]:
# preview the first rows of the assembled job table
df.head()

Unnamed: 0,COMPANY,TITLE,QUALIFICATIONS,LINK,DESCRIPTION
0,Vizient,Analyst - Pharmacy Analytics,Relevant degree preferred; graduate degree des...,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...,Provide operational support for pharmacy portf...
1,Vizient,Lead Analyst - Dedicated support Oklahoma/Arka...,,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...,"Serve as lead for analytics requests, includin..."
2,Vizient,Price Management Analyst,Relevant degree preferred. 2 or more years’ re...,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...,"Interact with members, contracted vendors and ..."
3,Vizient,Networks Manager,Relevant degree preferred.2 or more years of r...,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...,Maintain knowledge of network and Vizient offe...
4,Vizient,Senior Software Quality Engineer,Relevant degree preferred. Degree in Computer ...,https://vizient.wd1.myworkdayjobs.com/en-US/Vi...,Evaluate and implement software test automatio...


In [37]:
# Drop rows that are exact repeats and report how many jobs remain
df_nodup = df.drop_duplicates()
job_count = len(df_nodup)
print("There are {} jobs from vizient.".format(job_count))

# Write the de-duplicated table to CSV (the row index is written as the first column)
df_nodup.to_csv('vizient_cleaned_jobs_usa.csv')

There are 100 jobs from vizient.
