In [46]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

In [47]:
# Chrome options object carrying the command-line switches listed below
# (headless, no sandbox, no /dev/shm usage).
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [48]:
# URL of the company's job posting website (the listing page that is paginated below)
url = 'https://vst.wd5.myworkdayjobs.com/en-US/vistra_careers'
# NOTE(review): this session is created with default options; the
# `chrome_options` object configured in the previous cell is not passed here,
# so its flags do not apply to this browser — confirm that is intended.
driver = webdriver.Chrome()
# Allow element lookups to wait up to 1 second before giving up.
driver.implicitly_wait(1)
# Load the job listing page in the browser session.
driver.get(url)

In [49]:
# Walk the paginated job listing: collect the job titles and links shown on the
# current page, then click the "next" control until it is no longer available.
job_title = []
job_link = []

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Query the title anchors once so titles and links always stay paired.
    title_anchors = soup.find_all("a", {"data-automation-id": "jobTitle"})
    job_title.extend(anchor.text for anchor in title_anchors)
    job_link.extend('https://vst.wd5.myworkdayjobs.com' + anchor['href'] for anchor in title_anchors)

    # Look the button up again on every page: find_elements returns an empty
    # list (instead of raising) when there is no "next" control, so a
    # single-page listing is still scraped and the last page ends the loop.
    next_buttons = driver.find_elements('xpath', '//*[@aria-label="next"]')
    if not next_buttons:
        break
    try:
        next_buttons[0].click()
    except Exception:
        # Best effort: a button that cannot be clicked is treated as the end of
        # the listing. Catching Exception (not a bare except) keeps Ctrl-C working.
        break
    # Fixed 1-second pause before the next page's source is read.
    time.sleep(1)
    

In [50]:
# Build one row per scraped (title, link) pair and discard rows that are exact
# repeats of an earlier row.
df_title_link = (
    pd.DataFrame(zip(job_title, job_link), columns=['JOB_TITLE', 'JOB_LINK'])
    .drop_duplicates()
)

In [51]:
# Preview the first rows of the de-duplicated title/link table.
df_title_link.head()

Unnamed: 0,JOB_TITLE,JOB_LINK
0,"Account Manager, Sr",https://vst.wd5.myworkdayjobs.com/en-US/vistra...
1,Lead Developer,https://vst.wd5.myworkdayjobs.com/en-US/vistra...
2,Lead Microsoft 365 Security Engineer,https://vst.wd5.myworkdayjobs.com/en-US/vistra...
3,Retail Contract Analyst IV,https://vst.wd5.myworkdayjobs.com/en-US/vistra...
4,Contractor Coordinator,https://vst.wd5.myworkdayjobs.com/en-US/vistra...


In [52]:
# Spot-check one scraped link (index 14 when available). Falls back to the last
# link, or to a message, so a short or empty scrape does not raise IndexError.
if job_link:
    print(job_link[min(14, len(job_link) - 1)])
else:
    print('No job links were scraped.')

https://vst.wd5.myworkdayjobs.com/en-US/vistra_careers/job/Irving-Texas/Sr-Financial-Analyst_40008992-2


# Extract Job Description and Qualifications

In [53]:
# retrieve job qualifications and descriptions

# Section headings whose following <ul> is collected as qualifications, in the
# order their text is concatenated. The headings are escaped so characters such
# as "(" and ")" are matched literally instead of being read as regex syntax.
_QUALIFICATION_PATTERNS = [
    re.compile(re.escape(heading), re.IGNORECASE)
    for heading in (
        "Education, Experience, & Skill Requirements",
        "Key Metrics",
        "Key Accountabilities (directly or through others)",
    )
]

# Section heading whose following <ul> is collected as the description.
_DESCRIPTION_PATTERN = re.compile("Job Description", re.IGNORECASE)


def _list_text_after(soup, heading_pattern):
    """Return the text of the first <ul> after the first string matching the pattern, or ''."""
    heading = soup.find(string=heading_pattern)
    if heading is None:
        return ''
    bullet_list = heading.find_next("ul")
    return bullet_list.text if bullet_list is not None else ''


def job_description(title, link):
    """Visit every job posting and extract its qualification and description lists.

    A Chrome session is started with the module-level ``chrome_options`` and is
    always closed again, even if loading one of the pages raises.

    Args:
        title: Sequence of job titles, indexed in parallel with ``link``.
        link: Sequence of job posting URLs to visit, in order.

    Returns:
        A tuple ``(jobtitle, link, qualifications, description)``:
        ``jobtitle`` is a list with one title per visited link, ``link`` is the
        argument as passed in, and ``qualifications`` / ``description`` are lists
        of strings with one entry per link. An entry is ``''`` when none of the
        expected section headings (or no ``<ul>`` after them) was found.

    Raises:
        IndexError: If ``title`` has fewer entries than ``link``.
    """
    qualifications = []
    description = []
    jobtitle = []

    # Keyword-only construction: the driver executable is resolved by Selenium
    # itself, matching how the listing session is created elsewhere in this notebook.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for i, posting_url in enumerate(link):
            driver.get(posting_url)
            # Fixed 1-second pause before the page source is read
            # (presumably to let the page finish rendering).
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Concatenate the list text found under every qualification heading.
            qualifications.append(
                ''.join(_list_text_after(soup, pattern) for pattern in _QUALIFICATION_PATTERNS)
            )
            description.append(_list_text_after(soup, _DESCRIPTION_PATTERN))
            jobtitle.append(title[i])
    finally:
        # Always release the browser, including when a page load fails.
        driver.quit()

    return jobtitle, link, qualifications, description

In [54]:
# Scrape every posting in the de-duplicated title/link table, then report how
# many qualification and description entries came back.
title, link, qual, descrp = job_description(
    title=df_title_link['JOB_TITLE'].values,
    link=df_title_link['JOB_LINK'].values,
)
print(len(qual), len(descrp))

126 126


In [55]:

# Create a dataframe from the scraped titles, qualifications, links and
# descriptions. Column names are given at construction time so an empty scrape
# still yields a well-formed (empty) frame, and the final column order is
# selected by name rather than by position.
df = (
    pd.DataFrame(
        list(zip(title, qual, link, descrp)),
        columns=['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION'],
    )
    .assign(COMPANY='Vistra')
    .loc[:, ['COMPANY', 'TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION']]
)

In [56]:

# Drop rows that are exact repeats and report how many postings remain.
df_nodup = df.drop_duplicates()
job_count = len(df_nodup.index)
print("There are {} jobs from Vistra.".format(job_count))

# Persist the cleaned table as CSV in the working directory.
df_nodup.to_csv('Vistra_jobs_cleaned.csv')


There are 126 jobs from Vistra.


In [58]:
# Display the final de-duplicated job table.
df_nodup

Unnamed: 0,COMPANY,TITLE,QUALIFICATIONS,LINK,DESCRIPTION
0,Vistra,"Account Manager, Sr",High School Diploma or equivalency requiredExp...,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,"Support Large Business, Door-to-Door, Apartmen..."
1,Vistra,Lead Developer,Responsible for designing solutions and workin...,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Responsible for designing solutions and workin...
2,Vistra,Lead Microsoft 365 Security Engineer,Experienced gained through college degree prog...,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Takes ownership over execution of the roadmap ...
3,Vistra,Retail Contract Analyst IV,2-4+ years in contract related work experience...,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Create non-standard contract documents which w...
4,Vistra,Contractor Coordinator,High School Diploma or Equivalent.5-7 years of...,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Requires understanding of the functional opera...
...,...,...,...,...,...
121,Vistra,Business Sales Executive - Dynegy,,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Contact commercial customer from a defined pro...
122,Vistra,Principle Developer - Wholesale Applications,,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,
123,Vistra,Plant Technician 1-4 (I&E) - Odessa,"Maintain breakers, switch gear, switch yards, ...",https://vst.wd5.myworkdayjobs.com/en-US/vistra...,"Maintain breakers, switch gear, switch yards, ..."
124,Vistra,Senior Full Stack Java/Angular Developer,,https://vst.wd5.myworkdayjobs.com/en-US/vistra...,Experienced gained through college degree prog...
