In [1]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
)

In [2]:
# Chrome switches stored for later driver construction
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

In [3]:
# we will scrape jobs from IBM in US and Canada
# (the `filters` query string selects primary_country CA and US)
url = 'https://www.ibm.com/careers/us-en/search/?filters=primary_country:CA,primary_country:US'
# This listing browser is created with Chrome's defaults: no options object
# is passed to the constructor here.
driver = webdriver.Chrome()
# Implicit wait applied to element lookups made through this driver.
driver.implicitly_wait(2)
driver.get(url)

In [4]:
# we first get the total number of jobs and number of jobs per page
soup = BeautifulSoup(driver.page_source, 'html.parser')
quantity_tag = soup.find("div", {"class": "UpperList_quantityJobs__eDIK8"})
if quantity_tag is None:
    # Fail with a readable message instead of "'NoneType' object has no attribute 'text'".
    raise RuntimeError(
        "Job counter element not found: the page may not have finished "
        "loading or its markup/class name has changed."
    )
result = quantity_tag.text

# The counter text is split on whitespace once; token 1 is read as the page
# size and token 3 as the overall job count.
counter_tokens = result.split()
jobs_per_page = int(counter_tokens[1])
total_jobs = int(counter_tokens[3])

# now calculate the total number of pages
# Ceiling division: `total_jobs // jobs_per_page + 1` reports one page too many
# whenever total_jobs is an exact multiple of jobs_per_page.
total_pages = -(-total_jobs // jobs_per_page)
total_pages

45

In [5]:
# accumulators filled page by page: one title and one link per job card
job_title, job_link = [], []

# the first results page is already loaded, so parse it as-is
soup = BeautifulSoup(driver.page_source, 'html.parser')
tags = soup.find_all("a", {"class": "cds--link bx--card__footer undefined"})
for t in tags:
    card = t.parent  # the heading is looked up inside the link's parent element
    job_link.append(t['href'])
    job_title.append(card.find("h3", {"class": "bx--card__heading"}).text)

In [6]:
time.sleep(2)
# handle the cookies message window
# Best effort: the banner may be absent or not clickable, in which case we
# simply carry on. Only Selenium's lookup/click failures are ignored, so a
# typo, a dead kernel variable, or Ctrl-C is no longer silently swallowed.
try:
    consent_button = driver.find_element('xpath', '//*[@id="truste-consent-button"]')
    consent_button.click()
except (
    NoSuchElementException,
    ElementNotInteractableException,
    ElementClickInterceptedException,
):
    pass

In [7]:
# now loop through all the pages by clicking 'next' button on each page
# Page 1 was scraped already, so the loop covers pages 2..total_pages.
current_page = 1

for current_page in range(2, total_pages + 1):
    next_button = driver.find_element('xpath', '//*[@aria-labelledby="tooltip-6"]')
    # Click through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", next_button)
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tags = soup.find_all("a", {"class": "cds--link bx--card__footer undefined"})
    for t in tags:
        # Resolve both values before appending either one, so a card that
        # fails to parse can never leave job_link and job_title misaligned.
        href = t['href']
        heading_text = t.parent.find("h3", {"class": "bx--card__heading"}).text
        job_link.append(href)
        job_title.append(heading_text)

# Every page has been collected: close the listing browser so the Chrome
# process is not left running. Re-create `driver` before re-running the
# listing cells.
driver.quit()

In [8]:
# sanity check: titles and links are appended in pairs, so both counts should match
len(job_title), len(job_link)

(666, 666)

In [9]:
# pair every scraped title with its link, then discard repeated rows
df_title_link = (
    pd.DataFrame(zip(job_title, job_link), columns=['JOB_TITLE', 'JOB_LINK'])
    .drop_duplicates()
)

## Extract job descriptions and qualifications for each job

In [10]:
# retrieve job qualifications and descriptions
def _section_list_text(soup, heading):
    """Return the text of the first <ul>/<ol> after the <span> whose text is `heading`, or ''."""
    header = soup.find("span", text=heading)
    if header is None:
        return ''
    listing = header.find_next(['ul', 'ol'])
    return '' if listing is None else listing.text


def job_description(title, link):
    """Visit every job page and collect its qualifications and responsibilities.

    Parameters
    ----------
    title : sequence of str
        Job titles, indexed by position and aligned with `link`.
    link : sequence of str
        Job posting URLs to open, indexed by position.

    Returns
    -------
    tuple
        `(jobtitle, link, qualifications, description)`: `jobtitle`,
        `qualifications` and `description` are lists with one entry per URL;
        `link` is the argument handed back unchanged. A section missing from
        a page yields an empty string.
    """
    qualifications = []
    description = []
    jobtitle = []

    # NOTE(review): the positional 'chromedriver' argument is kept exactly as
    # in the original call; confirm it matches the installed Selenium version.
    driver = webdriver.Chrome('chromedriver', options=chrome_options)
    try:
        for i in range(len(link)):
            driver.get(link[i])
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # get the job qualifications
            required = _section_list_text(
                soup, "Required Technical and Professional Expertise")
            # The preferred section is read from its own heading. Previously
            # the lookup reused the "Required" tag, which appended the
            # required list a second time and never read the preferred one.
            preferred = _section_list_text(
                soup, "Preferred Technical and Professional Expertise")
            qualifications.append(
                ". ".join(part for part in (required, preferred) if part))

            # get the job responsibilities
            description.append(
                _section_list_text(soup, "Your Role and Responsibilities"))
            jobtitle.append(title[i])
    finally:
        # Always release the browser, even when a page load raises midway.
        driver.quit()

    return jobtitle, link, qualifications, description

In [11]:
# retrieve the qualifications and the description for each job.
# `.values` passes the two columns as plain arrays, which the function reads by position.
title, link, qual, descrp = job_description(df_title_link['JOB_TITLE'].values, df_title_link['JOB_LINK'].values)

In [12]:
def remove_char(s):
    """Turn newlines and non-breaking spaces into spaces and squeeze space runs to one."""
    pieces = []
    previous_was_space = False
    for ch in s:
        if ch == '\n' or ch == '\xa0':
            ch = ' '
        is_space = ch == ' '
        # keep a space only when it does not directly follow another space
        if not (is_space and previous_was_space):
            pieces.append(ch)
        previous_was_space = is_space
    return ''.join(pieces)

In [13]:
# normalise whitespace in both scraped text lists
qual_cleaned = list(map(remove_char, qual))
descrp_cleaned = list(map(remove_char, descrp))

In [14]:
# assemble the final table: one row per job, with the company name as the first column
df = pd.DataFrame(
    zip(title, qual_cleaned, link, descrp_cleaned),
    columns=['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION'],
)
df.insert(0, 'COMPANY', 'IBM')

In [15]:
# look for empty QUALIFICATION entries
def get_empty(df, colname, max_len=10):
    """Return the index labels of rows whose `colname` text is at most `max_len` long.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    colname : str
        Column whose values support `len()`.
    max_len : int, default 10
        Entries of this length or shorter are treated as empty.

    Returns
    -------
    list
        Index labels, suitable for `df.drop(...)`. With the default 0..n-1
        index these equal the row positions.
    """
    # Walk (label, value) pairs rather than df[colname][i] for i in range(n):
    # the latter is a label lookup and raises KeyError as soon as the index
    # is not exactly 0..n-1, e.g. after rows have been dropped.
    return [label for label, value in df[colname].items() if len(value) <= max_len]

empty_mini_q = get_empty(df, 'QUALIFICATIONS')

# drop the empty Qualification entries
# (in place: `df` itself is modified, and the index is not reset afterwards)
df.drop(empty_mini_q, inplace=True)

In [17]:
# de-duplicate, report how many jobs remain, then write them to disk
df_nodup = df.drop_duplicates()
n_jobs = df_nodup.shape[0]
print("There are {} jobs from IBM.".format(n_jobs))

df_nodup.to_csv('ibm_all_jobs_cleaned.csv')

There are 624 jobs from IBM.
