In [1]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException

In [2]:
# Chrome launch options handed to the driver that job_description() creates
# for the per-job detail pages (the listing driver below does not use them).
chrome_options = webdriver.ChromeOptions()
# Run Chrome without opening a visible browser window
chrome_options.add_argument('--headless')
# NOTE(review): the next two flags are presumably here for container/Colab-style
# environments (sandbox and /dev/shm restrictions) — confirm they are needed.
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [3]:
# Entry page of the job search; the pagination cell walks on from here
url='https://jobs.intel.com/en/search-jobs'
# NOTE(review): this driver is created without chrome_options, so it is not
# launched headless like the one in job_description() — confirm this is intended.
driver = webdriver.Chrome()
# Implicit wait of 2 (seconds, per Selenium's API) applied to element lookups
driver.implicitly_wait(2)
driver.get(url)

In [4]:
# Dismiss the cookie-consent pop-up if the page shows one.
# This is deliberately best-effort: a missing or non-clickable button must not
# stop the scrape. Only Selenium errors are ignored, so KeyboardInterrupt and
# genuine programming errors still surface instead of being silently swallowed.
try:
    driver.find_element('xpath', '//*[@id="igdpr-button"]').click()
except WebDriverException:
    pass

In [5]:
# Collect the title and absolute link of every job on each results page,
# then click the "next" control; stop when no element whose class attribute
# is exactly "next" can be found.
# NOTE(review): the listing `driver` is not quit in any visible cell — consider
# calling driver.quit() once nothing else needs it.

job_title = []
job_link = []

while True:
    # Fixed pause before reading the page source (presumably to let the newly
    # requested results render — the wait length is not otherwise justified here)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tags = soup.find_all('a', {'class': 'job-title-link'})

    # Build both per-page lists before extending either global list, so an
    # anchor without an href or <h2> cannot leave job_title and job_link with
    # different lengths.
    page_links = ['https://jobs.intel.com' + t['href'] for t in tags]
    page_titles = [t.find('h2').text for t in tags]
    job_link.extend(page_links)
    job_title.extend(page_titles)

    try:
        next_button = driver.find_element('xpath', '//*[@class="next"]')
        # Click through JavaScript instead of WebElement.click()
        driver.execute_script("arguments[0].click();", next_button)
    except NoSuchElementException:
        # No "next" control on this page: treat it as the last page
        break
    except WebDriverException as exc:
        # Any other Selenium failure still ends pagination (as before), but it
        # is reported so a truncated result set does not go unnoticed.
        print('Stopping pagination early: {!r}'.format(exc))
        break

In [6]:
# Number of links gathered across all result pages; duplicates are still
# included here and are only removed when the DataFrame is deduplicated.
len(job_link)

424

In [7]:
def replace_char(s):
    """Normalize whitespace in scraped text.

    Newlines and non-breaking spaces (U+00A0) become ordinary spaces, and
    every run of consecutive spaces is then squeezed down to a single space.
    Leading/trailing spaces are not stripped, and tabs are left untouched.

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    str
        The text with newlines/NBSPs replaced and space runs collapsed.
    """
    text = s.translate(str.maketrans('\n\xa0', '  '))
    # Keep halving double spaces until a pass changes nothing
    while True:
        squeezed = text.replace('  ', ' ')
        if squeezed == text:
            return text
        text = squeezed

In [8]:
# Normalize the whitespace of every scraped title
job_title = list(map(replace_char, job_title))

In [9]:
# Pair each scraped title with its link (one row per listing) and discard
# rows that are exact duplicates of an earlier row.
df_title_link = pd.DataFrame(
    zip(job_title, job_link), columns=['JOB_TITLE', 'JOB_LINK']
).drop_duplicates()
print(len(df_title_link))

409


In [11]:
# save the titles and links to csv
#df_title_link.to_csv('intel_jobs_titles_links_all.csv')

## Extract job descriptions and qualifications for each job

In [12]:
# load in the job titles and links from the csv file
#df_title_link = pd.read_csv('intel_jobs_titles_links_all.csv', index_col=0)
#df_title_link.head()

In [13]:
def _section_text(soup, heading, max_lists=None):
    """Return the text found under the <h2> whose string is exactly `heading`.

    The text of each following sibling tag is appended (each preceded by a
    single space) until the next <h2> is reached, the siblings run out, or
    `max_lists` <ul> blocks have been consumed. If none of those siblings
    carried any text, the node that directly follows the first <br> sibling
    after the heading is used instead, provided it is a text node.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed job page.
    heading : str
        Exact string of the <h2> that opens the section.
    max_lists : int or None
        Stop once this many <ul> siblings have been collected; None = no limit.

    Returns
    -------
    str
        The collected text, or '' when the heading is absent or has no content.
    """
    header = soup.find('h2', string=heading)
    if header is None:
        return ''

    text = ''
    block_count = 0  # every collected block contributes exactly one separator space
    list_count = 0
    tag = header.find_next_sibling()
    # Explicit None check: running out of siblings ends the walk normally
    # rather than through a swallowed AttributeError, so the <br> fallback
    # below is still evaluated in that case.
    while tag is not None and tag.name != 'h2':
        if max_lists is not None and list_count >= max_lists:
            break
        if tag.name == 'ul':
            list_count += 1
        text = text + " " + tag.text
        block_count += 1
        tag = tag.find_next_sibling()

    # Only separator spaces were collected: no sibling held any text, so look
    # for plain text placed right after a <br>.
    if len(text) <= block_count:
        line_break = header.find_next_sibling('br')
        if line_break is not None and isinstance(line_break.next_element, str):
            text = text + " " + line_break.next_element
    return text


def job_description(title, link):
    """Scrape the qualifications and description text of each job page.

    Opens one Chrome session configured with the module-level `chrome_options`,
    loads every URL in `link`, and extracts the "Qualifications" section
    (limited to its first two <ul> blocks) and the "Job Description" section.
    A section that cannot be found yields an empty string.

    Parameters
    ----------
    title : sequence of str
        Job titles, index-aligned with `link`.
    link : sequence of str
        Job page URLs.

    Returns
    -------
    tuple
        (jobtitle, link, qualifications, description): `link` is returned as
        passed in; the other three are lists with one entry per URL.
    """
    qualifications = []
    description = []
    jobtitle = []

    # The driver path is no longer passed positionally: recent Selenium
    # releases reject it, and older ones default to 'chromedriver' anyway.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for i in range(len(link)):
            driver.get(link[i])
            time.sleep(1)  # short pause before reading the page source
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            qualifications.append(_section_text(soup, 'Qualifications', max_lists=2))
            description.append(_section_text(soup, 'Job Description'))
            jobtitle.append(title[i])
    finally:
        # Always close the browser, even if a page load raises part-way through
        driver.quit()

    return jobtitle, link, qualifications, description

In [14]:
# Scrape qualifications and description for every deduplicated job link
title, link, qual, descrp = job_description(
    df_title_link['JOB_TITLE'].values,
    df_title_link['JOB_LINK'].values,
)

In [16]:
# Normalize whitespace in both scraped text fields
qual_cleaned = list(map(replace_char, qual))
descrp_cleaned = list(map(replace_char, descrp))

In [17]:
# Assemble one row per job from the title, cleaned qualifications, link and
# cleaned description, then put a constant COMPANY column in front.
df = pd.DataFrame(zip(title, qual_cleaned, link, descrp_cleaned))
df.columns = ['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION']
df.insert(0, 'COMPANY', 'Intel')

In [18]:
# look for empty QUALIFICATION entries
def get_empty(df, colname, max_len=10):
    """Return the index labels of rows whose `colname` value is (nearly) empty.

    A value counts as empty when its length is at most `max_len`. Index
    labels are returned (not positions), so the result can be passed straight
    to `df.drop(...)` whatever the frame's index looks like.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    colname : str
        Column holding sized values (e.g. strings).
    max_len : int, default 10
        Longest length still treated as empty.

    Returns
    -------
    list
        Index labels of the matching rows, in row order.
    """
    return [label for label, value in df[colname].items() if len(value) <= max_len]

# Find the rows whose QUALIFICATIONS text is effectively empty and report how many
empty_mini_q = get_empty(df, 'QUALIFICATIONS')
print(len(empty_mini_q))

# Remove those rows from df (modifies df in place)
df.drop(index=empty_mini_q, inplace=True)

0


In [20]:
# Drop rows that are exact duplicates, report the final count, and export
df_nodup = df.drop_duplicates()
print("There are {} jobs from Intel.".format(len(df_nodup)))

df_nodup.to_csv('intel_all_jobs_cleaned.csv')

There are 409 jobs from Intel.
