In [167]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

In [168]:
import pandas as pd
import numpy as np

In [169]:
# Chrome launch options for the per-job scraping browser:
# run without a visible window and with the sandbox / shared-memory workarounds.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [170]:
# Scrape Oracle job postings located in the United States (location filter is in the URL).
url = 'https://eeho.fa.us2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/CX_1/requisitions?location=United+States&locationId=300000000149325&locationLevel=country&mode=location'
driver = webdriver.Chrome()
driver.implicitly_wait(2)
driver.get(url)

# Scroll towards the bottom until the "load more" button has been rendered.
# find_elements returns an empty list (instead of raising) when nothing matches,
# so no blanket `except` is needed, and the attempt cap keeps this cell from
# spinning forever if the button never shows up.
load_more_xpath = '//*[@class="search-results-load-more-btn"]'
max_scroll_attempts = 60

found = False
for _ in range(max_scroll_attempts):
    load_more_buttons = driver.find_elements('xpath', load_more_xpath)
    if load_more_buttons:
        next_button = load_more_buttons[0]
        found = True
        break
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

if not found:
    print(f'"Load more" button not found after {max_scroll_attempts} scroll attempts')


In [171]:
# Nudge the page so lazily rendered content appears: five scrolls to the bottom,
# one back to the top, then two more to the bottom, pausing one second after each.
to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
to_top = "window.scrollTo(0, 0);"

for scroll_script in [to_bottom] * 5 + [to_top] + [to_bottom] * 2:
    driver.execute_script(scroll_script)
    time.sleep(1)

In [172]:
# Keep clicking "load more" until the button is no longer on the page, i.e. until
# every posting has been appended to the listing.
# find_elements returns [] when nothing matches, which gives an explicit exit
# condition instead of relying on a bare `except` (that also hid unrelated errors)
# and on `while next_button:` (a WebElement is always truthy).
load_more_xpath = '//*[@class="search-results-load-more-btn"]'
while True:
    load_more_buttons = driver.find_elements('xpath', load_more_xpath)
    if not load_more_buttons:
        break
    next_button = load_more_buttons[0]
    next_button.click()
    # give the newly requested batch of postings time to load
    time.sleep(5)

In [173]:
# Parse the fully expanded listing page and collect every job title and every job link.
# Titles come from <h3 class="job-title">, links from the href of <a class="joblist-tile">.
# NOTE(review): the two lists are gathered independently; they are assumed to line up
# one-to-one on the page - confirm with the length check that follows.
soup = BeautifulSoup(driver.page_source, 'html.parser')
job_title = [heading.text for heading in soup.find_all("h3", {"class": "job-title"})]
job_link = [anchor['href'] for anchor in soup.find_all("a", {"class": "joblist-tile"})]

In [174]:
# sanity check: both counts should match, because the two lists are zipped pairwise
# into a DataFrame and zip silently truncates to the shorter list
len(job_title), len(job_link)

(1676, 1676)

In [175]:
# pair every scraped title with its link: one row per posting, two named columns
df_title_link = pd.DataFrame(
    list(zip(job_title, job_link)),
    columns=['JOB_TITLE', 'JOB_LINK'],
)

In [176]:
# preview the first rows of the title/link table
df_title_link.head()

Unnamed: 0,JOB_TITLE,JOB_LINK
0,Senior Developer - Ab Initio,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
1,Senior Software Developer,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
2,Oracle Hospitality | Software Developer - Java...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
3,International Tax Manager - 1 Year Contract Po...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
4,Principal Software Developer,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...


In [177]:
# drop the duplicates: a row is removed only when both JOB_TITLE and JOB_LINK repeat
# (drop_duplicates compares all columns by default); the surviving rows keep their
# original index labels
df_title_link = df_title_link.drop_duplicates()

In [178]:
# (rows, columns) remaining after de-duplication
df_title_link.shape

(1676, 2)

In [179]:
# save the titles and links to csv; the DataFrame index is written as the first
# (unnamed) column, which is the to_csv default
df_title_link.to_csv('./oracle_jobs_titles_links_all.csv')

## Extract job descriptions and qualifications for each job

In [180]:
# load in the job titles and links from the csv file;
# index_col=0 uses the first csv column as the index instead of as a data column
df_title_link = pd.read_csv('./oracle_jobs_titles_links_all.csv', index_col=0)
df_title_link.head()

Unnamed: 0,JOB_TITLE,JOB_LINK
0,Senior Developer - Ab Initio,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
1,Senior Software Developer,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
2,Oracle Hospitality | Software Developer - Java...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
3,International Tax Manager - 1 Year Contract Po...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...
4,Principal Software Developer,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...


In [182]:
# retrieve job qualifications and descriptions
def job_description(title, link, page_load_wait=2):
    """Visit every job posting and scrape its qualifications and description text.

    Relies on the notebook-level ``chrome_options`` for the browser configuration.

    Parameters
    ----------
    title : sequence of str
        Job titles, aligned index-for-index with ``link``.
    link : sequence of str
        URLs of the job postings to open.
    page_load_wait : float, optional
        Seconds to sleep after requesting each page before reading its source.
        Defaults to 2.

    Returns
    -------
    tuple
        ``(jobtitle, link, qualifications, description)``:
        ``jobtitle`` is a list copy of the titles, ``link`` is the argument
        handed back unchanged, and ``qualifications`` / ``description`` are
        lists of strings with one entry per link ('' when the corresponding
        section is missing from the page).
    """
    qualifications = []
    description = []
    jobtitle = []

    # Pass the options by keyword only: newer Selenium 4 releases no longer accept
    # the driver path as a first positional argument, and 'chromedriver' on PATH
    # was the default lookup anyway.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for i in range(len(link)):
            driver.get(link[i])
            time.sleep(page_load_wait)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            s = ''  # qualifications: text of the <ul> lists in the responsibilities section
            d = ''  # description (+ responsibilities) text
            r = ''  # full responsibilities text

            # job qualifications are inside of the responsibilities section
            tag1 = soup.find("div", {'data-bind': "html: job().responsibilities"})
            if tag1 is not None:
                # for responsibilities we keep the entire text under tag1
                r = tag1.text
                # for qualifications we keep the text of the 'ul' elements only
                s = ''.join(' ' + ul.text for ul in tag1.find_all('ul'))

            # a lot of the job description actually sits in the responsibilities
            # section, so the two texts are combined; as before, nothing is
            # recorded when the description section itself is missing
            tag2 = soup.find("div", {'data-bind': "html: job().description"})
            if tag2 is not None:
                d = tag2.text + " " + r

            qualifications.append(s)
            description.append(d)
            jobtitle.append(title[i])
    finally:
        # always release the browser, even if a page load raised
        driver.quit()

    return jobtitle, link, qualifications, description

In [187]:
# retrieve the qualifications and description for each job.
# job_description loads every link in a browser and sleeps after each page load,
# so the run time of this cell grows with the number of rows in df_title_link
title, link, qual, descrp = job_description(df_title_link['JOB_TITLE'].values, df_title_link['JOB_LINK'].values)

In [188]:
# spot-check a handful of the scraped descriptions (raw, before whitespace cleanup)
descrp[100:105]

['',
 "Do you want to get a foot in the door and perhaps even get the chance to interview early for one of the best technology jobs in the world? \xa0If so, you may be an ideal candidate for our internship program.Gain invaluable experience in what it is like to work at a leading global hardware and software systems innovator - Oracle. \xa0Your colleagues are industry experts and knowledgeable veterans. \xa0Your contributions will enhance real Oracle products and services. \xa0By the end of your internship you will be connected to a powerful network of professionals, managers, and executives.Completed first year of undergraduate degree in computer science, computer engineering, electrical engineering, or related area of study. \xa0Strong academic achievement. \xa0Strong interest in technical area of study. The Program:\nOur future success depends on hiring great people like you: elite, early in career talent who are looking to power next-generation services and solutions and are commit

In [189]:
def remove_char(s):
    """Flatten newlines / non-breaking spaces to spaces and collapse runs of spaces.

    Every '\\n' and '\\xa0' in ``s`` becomes a single space, then any run of two
    or more spaces is squeezed down to one. Leading and trailing spaces are
    not stripped (a run at either end is reduced to one space).
    """
    previous = s.translate({ord('\n'): ' ', ord('\xa0'): ' '})
    squeezed = previous.replace('  ', ' ')
    # repeat the pairwise squeeze until it stops changing the text
    while squeezed != previous:
        previous, squeezed = squeezed, squeezed.replace('  ', ' ')
    return squeezed

In [190]:
# normalise whitespace in every scraped qualification and description string
qual_cleaned = list(map(remove_char, qual))
descrp_cleaned = list(map(remove_char, descrp))

In [200]:
# assemble the scraped fields into one DataFrame (one row per job posting),
# tag every row with the company name and show COMPANY as the first column
df = pd.DataFrame(zip(title, qual_cleaned, link, descrp_cleaned))
df.columns = ['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION']
df = df.assign(COMPANY='Oracle')
df = df[['COMPANY', 'TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION']]
df.head()

Unnamed: 0,COMPANY,TITLE,QUALIFICATIONS,LINK,DESCRIPTION
0,Oracle,Senior Developer - Ab Initio,,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,Intermediate consulting position operating in...
1,Oracle,Senior Software Developer,Bachelor’s degree in Computer Science or equi...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,"Design, develop, troubleshoot and debug softw..."
2,Oracle,Oracle Hospitality | Software Developer - Java...,Perform software development tasks: designing...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,"Design, develop, troubleshoot and debug softw..."
3,Oracle,International Tax Manager - 1 Year Contract Po...,,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,
4,Oracle,Principal Software Developer,Bachelor’s degree in Computer Science or equi...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,"Design, develop, troubleshoot and debug softw..."


In [201]:
# If the QUALIFICATIONS text is (nearly) empty - fewer than 2 characters - fall back
# to the DESCRIPTION text of the same row.
# One .loc assignment replaces the row-by-row chained assignment (df[col][i] = ...),
# which triggers SettingWithCopyWarning and does not write through to df when
# pandas Copy-on-Write is enabled.
needs_fallback = df['QUALIFICATIONS'].str.len() < 2
df.loc[needs_fallback, 'QUALIFICATIONS'] = df.loc[needs_fallback, 'DESCRIPTION']

In [202]:
# look for empty QUALIFICATION entries
def get_empty(df, colname, max_len=10):
    """Return the index labels of rows whose ``colname`` value is too short to be useful.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    colname : str
        Column holding sized values (e.g. strings).
    max_len : int, optional
        A value counts as empty when ``len(value) <= max_len``. Defaults to 10.

    Returns
    -------
    list
        Index labels (not positions) of the matching rows, in row order, so the
        result can be passed straight to ``df.drop``. For a default RangeIndex the
        labels coincide with the row positions.
    """
    # iterate (label, value) pairs so the function also works after rows have been
    # dropped and the index is no longer 0..n-1
    return [label for label, value in df[colname].items() if len(value) <= max_len]

# flag the rows whose QUALIFICATIONS text is too short, and report how many there are
empty_mini_q = get_empty(df, 'QUALIFICATIONS')
print(len(empty_mini_q))

# remove the flagged rows from df in place, then show the remaining (rows, columns)
df.drop(index=empty_mini_q, inplace=True)
df.shape

170


(1506, 5)

In [204]:
# remove the duplicated jobs: rows identical across all columns are collapsed to one
# (drop_duplicates default), then show the resulting (rows, columns)
df_nodup = df.drop_duplicates()
df_nodup.shape

(1506, 5)

In [205]:
# write the cleaned, de-duplicated job table to csv (the index is included as the first column)
df_nodup.to_csv('oracle_all_jobs_cleaned.csv')