In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import pandas as pd

In [4]:
# Chrome command-line switches shared by every browser session that this
# notebook starts through webdriver.Chrome(options=chrome_options).
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [5]:
# Search-results URL of the company's job posting website (technology jobs),
# split around the page number: a results page is addressed as
# url1 + <page number> + url2.
url1, url2 = 'https://careers.ti.com/search-jobs/?keyword=technology&pg=', ''

In [26]:
def ti_get_jobs(url1, url2):
    """
    Retrieve job titles and job links from every search-results page.

    Pages are requested as ``url1 + str(page_num) + url2``, starting at page 0
    and counting upwards until a page contains no ``<div class="jobTitle">``
    elements.

    Parameters
    ----------
    url1 : str
        Part of the results URL that precedes the page number.
    url2 : str
        Part of the results URL that follows the page number (may be empty).

    Returns
    -------
    job_title : list of str
        Text of each ``jobTitle`` div, in the order encountered.
    job_link : list of list of str
        One single-element list per job holding the absolute posting URL.
        The nesting is kept because callers unwrap it with ``s[0]``.
    """
    job_title = []
    job_link = []
    page_num = 0

    # A single browser session serves every page. Starting and quitting a new
    # Chrome process per page was pure overhead, and without the try/finally
    # the process was left running whenever a page raised an exception.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        while True:
            driver.get(url1 + str(page_num) + url2)
            # Fixed pause before reading page_source, presumably to let the
            # page finish rendering (the first page used to wait only 1s).
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            title_divs = soup.find_all("div", {"class": "jobTitle"})
            # find_all returns an empty list (not None) when nothing matches;
            # an empty page marks the end of the results.
            if not title_divs:
                break

            for td in title_divs:
                job_title.append(td.text)
                job_link.append(['https://careers.ti.com' + td.find_next('a')['href']])

            page_num += 1
    finally:
        driver.quit()

    return job_title, job_link

In [27]:
# Scrape every results page. job_title is a flat list of title strings, while
# job_link holds one single-element list per job (unwrapped further below).
job_title, job_link = ti_get_jobs(url1, url2)

In [75]:
def remove_char(s):
    """Return ``s`` with newlines and non-breaking spaces turned into plain
    spaces and every run of consecutive spaces squeezed down to a single one.

    Leading and trailing spaces are squeezed as well, not stripped.
    """
    to_space = str.maketrans({'\n': ' ', '\xa0': ' '})
    return re.sub(' {2,}', ' ', s.translate(to_space))

In [40]:
# ti_get_jobs wraps every link in a single-element list; unwrap to plain strings
# (the closing bracket of this comprehension was missing, a SyntaxError)
job_link_cleaned = [s[0] for s in job_link]

# create a dataframe that contains job titles and links for all job categories
df_title_link = pd.DataFrame(list(zip(job_title, job_link_cleaned)), columns=['JOB_TITLE', 'JOB_LINK'])

# drop the duplicates
df_title_link = df_title_link.drop_duplicates()

## Extract job descriptions and qualifications for each job

In [144]:
# retrieve job qualifications and descriptions
def _ul_text_after(soup, tag_names, string):
    """Return the text of the first <ul> following the first ``tag_names``
    element whose string matches ``string``; None if either is missing."""
    heading = soup.find(tag_names, string=string)
    if heading is None:
        return None
    bullet_list = heading.find_next("ul")
    if bullet_list is None:
        return None
    return bullet_list.text


def job_description(title, link):
    """
    Visit every job posting and extract its qualifications and description.

    For each URL in ``link`` the page is loaded in a headless Chrome session
    and parsed. Qualifications are the text of the bullet list (``<ul>``)
    following a ``<b>``/``<strong>`` heading containing "Minimum Requirements"
    and the one following a heading containing "Preferred" or "Required",
    joined with a leading space each. The description is the text of the
    first ``<ul>`` after a ``<span>`` whose text is exactly "Apply online".
    Anything that cannot be found is left as an empty string, so extraction
    only works for postings that use bullet lists.

    Parameters
    ----------
    title : sequence of str
        Job titles, aligned with ``link``.
    link : sequence of str
        Absolute URLs of the job postings.

    Returns
    -------
    jobtitle : list of str
        ``title[i]`` for every visited link, in order.
    link : sequence of str
        The ``link`` argument, returned unchanged.
    qualifications : list of str
        Qualification text per posting ('' when none was found).
    description : list of str
        Description text per posting ('' when none was found).
    """
    # Exact tag names: the former regex "(b|strong)" was unanchored and also
    # matched tags such as <body>, <table> or <button>.
    heading_tags = ["b", "strong"]
    # Accepts the correct spelling as well as the misspelled "Reqirements"
    # that the original pattern looked for exclusively.
    minimum_heading = re.compile("Minimum [Rr]equ?irements")
    preferred_heading = re.compile("(Preferred|Required)")

    qualifications = []
    description = []
    jobtitle = []

    # Same constructor call as ti_get_jobs; passing 'chromedriver' positionally
    # is not accepted by current Selenium releases.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for i in range(len(link)):
            driver.get(link[i])
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            s = ''
            for heading in (minimum_heading, preferred_heading):
                ul_text = _ul_text_after(soup, heading_tags, heading)
                if ul_text is not None:
                    s = s + " " + ul_text

            # retrieve job descriptions. This will only work if the
            # descriptions are listed as bullet points under tag "ul"
            d = _ul_text_after(soup, "span", "Apply online") or ''

            qualifications.append(s)
            description.append(d)
            jobtitle.append(title[i])
    finally:
        # always release the browser, even if a page load fails
        driver.quit()

    return jobtitle, link, qualifications, description

In [148]:
# Visit every de-duplicated posting and collect its qualification and
# description text; the four returned sequences are aligned by position.
title, link, qual, descrp = job_description(df_title_link['JOB_TITLE'].values, df_title_link['JOB_LINK'].values)

In [149]:
# Normalise whitespace in the scraped qualification and description text.
qual_cleaned = list(map(remove_char, qual))
descrp_cleaned = list(map(remove_char, descrp))

In [150]:
# Assemble the final table, one row per posting, with a constant COMPANY
# column placed in front of the scraped columns.
df = pd.DataFrame(zip(title, qual_cleaned, link, descrp_cleaned))
df.columns = ['TITLE', 'QUALIFICATIONS', 'LINK', 'DESCRIPTION']
df.insert(0, 'COMPANY', 'TI')

Unnamed: 0,COMPANY,TITLE,QUALIFICATIONS,LINK,DESCRIPTION
0,TI,Systems Engineering Intern (m/f/d) - Automotive,"Understanding of power supply topologies, con...",https://careers.ti.com/job/17665412/systems-en...,Develop power supply solutions for strategic ...
1,TI,Facilities Professional Mechanical Engineer,Good exposure and knowledge of semiconductor ...,https://careers.ti.com/job/17665009/facilities...,Responsible to lead and manage all Mechanical...
2,TI,Systems Engineering Intern (m/f/d) - Industrial,Experience with lab equipment such as oscillo...,https://careers.ti.com/job/17665008/systems-en...,"Working with product lines, customers, sales ..."
3,TI,MEMS / BEOL Integration Engineer,Expert knowledge of MEMS / BEOL Integration :...,https://careers.ti.com/job/17664067/mems-beol-...,"Work in cross-functional teams, with circuit ..."
4,TI,Fire Protection Specialist,Strong verbal and written communication skill...,https://careers.ti.com/job/17658114/fire-prote...,"Interpretation of building code, life safety ..."


In [151]:
# look for empty QUALIFICATION entries
def get_empty(df, colname, max_len=10):
    """
    Find the rows whose entry in ``colname`` is empty or nearly empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; it is not modified.
    colname : str
        Name of the column whose entries are measured with ``len``.
    max_len : int, optional
        Entries of this length or shorter count as empty (default 10).

    Returns
    -------
    list
        Index labels of the empty rows, suitable for ``df.drop``. For a
        default 0..n-1 index these equal the row positions.
    """
    empty_idx = []
    # Iterate over (label, value) pairs: looking values up with a positional
    # counter raised KeyError as soon as the index had gaps (e.g. after rows
    # were dropped).
    for label, value in df[colname].items():
        try:
            count = len(value)
        except TypeError:
            # NaN/None and other unsized values carry no text at all
            count = 0
        if count <= max_len:
            empty_idx.append(label)
    return empty_idx

# Rows flagged by get_empty for the QUALIFICATIONS column; report how many.
empty_mini_q = get_empty(df, colname='QUALIFICATIONS')
print(len(empty_mini_q))

# Remove the flagged rows from df itself (in place).
df.drop(index=empty_mini_q, inplace=True)

66


(408, 5)

In [155]:
# Discard rows that are exact duplicates, report how many jobs remain, and
# write the result (including the index column) to a CSV file.
df_nodup = df.drop_duplicates()
n_jobs = len(df_nodup)
print("There are {} jobs from Texas Instruments.".format(n_jobs))

df_nodup.to_csv('ti_technology_jobs_cleaned.csv')