In [40]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from time import sleep
from random import uniform
from datetime import datetime
import re

# Define .csv

In [41]:
def new_jobs_df():
    columns = [
        'url',
        'company_name', 
        'job_title', 
        'location', 
        'employment_type',
        'job_level', 
        'job_function', 
        'salary_lower', 
        'salary_upper', 
        'salary_period',
        'num_applications', 
        'posted_date',
        'address',
        'experience',
        'closing_date',
        'job_description'
    ]
    return pd.DataFrame(columns=columns)

# Functions

In [42]:
# Define dictionaries/lists for categories
locations = ['Central', 'East', 'North', 'South', 'West', 'Islandwide']
employment_types = ['Permanent', 'Full Time', 'Part Time', 'Contract', 'Flexi-work', 'Temporary', 'Freelance', 'Internship/Attachment']
job_levels = ['Senior Management', 'Middle Management', 'Manager', 'Professional', 'Senior Executive', 'Executive', 'Junior Executive', 'Non-executive', 'Fresh/Entry Level']

# Example strings
example_strings = [
    "CentralContractJunior ExecutiveEngineering ...",
    "IslandwideContract ...Fresh/Entry LevelEducation And Training ...",
    "SouthInternship/AttachmentFresh/Entry LevelHealthcare / Pharmaceutical",
    "IslandwideContract ...ProfessionalBanking And Finance ...",
    "Contract ...ProfessionalBanking And Finance ..."
]

# Function to split the string and determine values for each category
def parse_contract_type(string):
    clean_str = string.replace(" ...", "")
    res_location, res_employment_type, res_job_level, res_job_function = None, None, None, None
    for location in locations:
        if location in clean_str:
            res_location = location
            clean_str = clean_str.replace(location, '')
            break
    for employment_type in employment_types:
        if employment_type in clean_str:
            res_employment_type = employment_type
            clean_str = clean_str.replace(employment_type, '')
            break
    for job_level in job_levels:
        if job_level in clean_str:
            res_job_level = job_level
            clean_str = clean_str.replace(job_level, '')
            break
    res_job_function = clean_str.strip()

    return res_location, res_employment_type, res_job_level, res_job_function  

# Print test
for i in example_strings:
    print(list(parse_contract_type(i)))

['Central', 'Contract', 'Executive', 'Junior Engineering']
['Islandwide', 'Contract', 'Fresh/Entry Level', 'Education And Training']
['South', 'Internship/Attachment', 'Fresh/Entry Level', 'Healthcare / Pharmaceutical']
['Islandwide', 'Contract', 'Professional', 'Banking And Finance']
[None, 'Contract', 'Professional', 'Banking And Finance']


In [43]:
def parse_salary_range(salary_range):
    # Extract upper and lower salary values
    salary_values = re.findall(r'(\d+(?:,\d+)?)', salary_range)
    salary_lower = int(salary_values[0].replace(',', '')) if salary_values else None
    salary_upper = int(salary_values[1].replace(',', '')) if len(salary_values) > 1 else None
    
    return salary_lower, salary_upper

In [44]:
def parse_applications(applications):
    if applications:
        # Extract only the integer number
        match = re.search(r'\d+', applications)
        num_applications = int(match.group()) if match else None
    else:
        num_applications = None
    return num_applications

In [45]:
def parse_posting_info(posting_info):
    # Define the pattern to match "applicationsPosted" and "Closing on" as separators
    pattern = r'( applicationPosted | applicationsPosted |Closing on )'

    applications, filler_1, posted_date, filler_2, closing_date = re.split(pattern, posting_info)

    return applications, posted_date, closing_date

posting_info_eg = [
    "4 applicationsPosted 30 Mar 2024Closing on 29 Apr 2024",
    "30 applicationsPosted 29 Mar 2024Closing on 05 Apr 2024",
    "240 applicationsPosted 29 Mar 2024Closing on 28 Apr 2024",
    "0 applicationPosted 31 Mar 2024Closing on 30 Apr 2024",
    "1 applicationPosted 31 Mar 2024Closing on 30 Apr 2024"
]

# Split each string and print the results
for string in posting_info_eg:
    print(parse_posting_info(string))

('4', '30 Mar 2024', '29 Apr 2024')
('30', '29 Mar 2024', '05 Apr 2024')
('240', '29 Mar 2024', '28 Apr 2024')
('0', '31 Mar 2024', '30 Apr 2024')
('1', '31 Mar 2024', '30 Apr 2024')


In [46]:
def parse_card_info(df, card, row_no):
    lines = card.split('\n')
    res = df.copy()

    company_name, job_title, contract_type, salary_range, salary_period, applications, posted_date = (None, None, None, None, None, None, None)
    
    if(len(lines) > 8): 
        '''
        for cases such as
        PETROS-CONSULTING PTE. LTD.RECRUITER
        for ST ENGINEERING IHQ PTE. LTD.
        '''
        company_name = lines[1].replace("for ", "")
        job_title = lines[2]
        contract_type = lines[3]
        salary_range = lines[5]
        salary_period = lines[6]
    else:
        company_name = lines[0]
        job_title = lines[1]
        contract_type = lines[2]
        salary_range = lines[4]
        salary_period = lines[5]
    
    res.at[row_no, 'company_name'] = company_name
    res.at[row_no, 'job_title'] = job_title
    res.at[row_no, 'salary_period'] = salary_period

    # Parse contract_type
    location, employment_type, job_level, job_function = parse_contract_type(contract_type)
    res.at[row_no, 'location'] = location
    res.at[row_no, 'employment_type'] = employment_type
    res.at[row_no, 'job_level'] = job_level
    res.at[row_no, 'job_function'] = job_function
    
    # Parse salary_range
    salary_lower, salary_upper = parse_salary_range(salary_range)
    res.at[row_no, 'salary_lower'] = salary_lower
    res.at[row_no, 'salary_upper'] = salary_upper
    
    # Parse applications
    num_applications = applications #parse_applications(applications)
    res.at[row_no, 'num_applications'] = num_applications

    return res

# Web Scrape function

In [47]:
df = new_jobs_df()
df.loc[0] = None 
df

Unnamed: 0,url,company_name,job_title,location,employment_type,job_level,job_function,salary_lower,salary_upper,salary_period,num_applications,posted_date,address,experience,closing_date,job_description
0,,,,,,,,,,,,,,,,


In [48]:
def job_page_scrape():
    # res = df.copy()
    global df
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 1)
    for row_no in range(df.shape[0]):
        url = df.at[row_no, 'url']
        print(f"ROW NO: {row_no}, URL: {url}")
        if url is np.nan:
            continue
        driver.get(url)
        try:
            job_description = wait.until(EC.presence_of_element_located((By.ID, "job_description"))).text
            df.at[row_no, 'job_description'] = job_description
            # print(job_description)
        except:
            print("no job description")

        try:
            posting_info = driver.find_element(By.XPATH, "//section[@id='job-details']/div[@class='w-70-l w-60-ms w-100 pr2-l pr2-ms relative']/div[@class='bg-white pa4'][1]/div[@class='jobInfo w-100 dib v-top relative']/section[2]").text
            # print(posting_info)
            applications, posted_date, closing_date = parse_posting_info(posting_info)

            cur_date = datetime.strptime(posted_date, '%d %b %Y')

            df.at[row_no, 'num_applications'] = applications
            df.at[row_no, 'posted_date'] = posted_date
            df.at[row_no, 'closing_date'] = closing_date
            # print(f"{posted_date}, {closing_date}, {applications}")
        except:
            print("no posted info")

        # try: # <p data-testid="job-details-info-min-experience" class="black-80 f6 fw4 mv1 dib pr3 mr1 icon-bw-period">2 years exp</p>
        #     experience = driver.find_element(By.XPATH, "//p[@data-testid='job-details-info-min-experience']").text.replace(" years exp", "").replace(" year exp", "")
        #     df.at[row_no, 'experience'] = experience
        #     # print(experience)
        # except:
        #     print("no experience")

        # try:            
        #     address = driver.find_element(By.XPATH, "//div[@class='db mt3 mb4']/section[@class='w-100 pb3 pb0-ns lh-copy']/p[@class='black-80 f6 fw4 mv1 dib pr3 mr1 icon-bw-location']/a[@class='link brand-sec underline-hover']").text
        #     df.at[row_no, 'address'] = address
        #     # print(address)
        # except:
        #     print("no address")
    # return res

In [49]:
def scrape_mycareersfuture_all(search_list, furthest_date='01 Jan 2023'):
    '''
    search_list: list of string search terms
    stopping at specific furthest date not enabled
    '''
    global df
    
    row_no = 0
    
    furthest_date = datetime.strptime(furthest_date, '%d %b %Y')
    cur_date = datetime.today()

    for search_term in search_list:
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 1)
        page_no = 0
        driver.get(f"https://www.mycareersfuture.gov.sg/search?search={search_term}&sortBy=new_posting_date&page={page_no}")
        proceed_to_next_page = True

        while proceed_to_next_page and cur_date > furthest_date:
            for card_id in range(22):
                print(f"ROW NO: {row_no}, CARD NO: {card_id}, PAGE NO: {page_no}")
                try: 
                    card_element = wait.until(EC.element_to_be_clickable((By.ID, f"job-card-{card_id}")))
                    card = card_element.text
                    # print(card)
                    df.loc[row_no] = None
                    df = parse_card_info(df, card, row_no)
                except TimeoutException as e:
                    print(f"no card error: {e.msg}")
                    break
                try:
                    url = card_element.find_element(By.TAG_NAME, "a").get_attribute("href")
                    # print(f"url: {url}")
                    df.at[row_no, 'url'] = url
                except TimeoutException as e:
                    print(f"url error: {e.msg}")
                row_no += 1
            
            try:
                wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='pagination-button--❯']"))).click()
                print('...next page')
                page_no += 1
            except TimeoutException as e:
                '''no more pages left'''
                print("...no pages left")
                proceed_to_next_page = False
    driver.quit()

    # df = job_page_scrape(df)
    job_page_scrape()
    # return df

In [50]:
# all_data = scrape_mycareersfuture_all(["ai", "data analytics", "data science", "machine learning"])
scrape_mycareersfuture_all(["ai", "data analytics", "data science", "machine learning"])

ROW NO: 0, CARD NO: 0, PAGE NO: 0


  res.at[row_no, 'company_name'] = company_name
  res.at[row_no, 'job_title'] = job_title
  res.at[row_no, 'salary_period'] = salary_period
  res.at[row_no, 'location'] = location
  res.at[row_no, 'employment_type'] = employment_type
  res.at[row_no, 'job_level'] = job_level
  res.at[row_no, 'job_function'] = job_function
  df.at[row_no, 'url'] = url


ROW NO: 1, CARD NO: 1, PAGE NO: 0
ROW NO: 2, CARD NO: 2, PAGE NO: 0
ROW NO: 3, CARD NO: 3, PAGE NO: 0
ROW NO: 4, CARD NO: 4, PAGE NO: 0
ROW NO: 5, CARD NO: 5, PAGE NO: 0
ROW NO: 6, CARD NO: 6, PAGE NO: 0
ROW NO: 7, CARD NO: 7, PAGE NO: 0
ROW NO: 8, CARD NO: 8, PAGE NO: 0
ROW NO: 9, CARD NO: 9, PAGE NO: 0
ROW NO: 10, CARD NO: 10, PAGE NO: 0
ROW NO: 11, CARD NO: 11, PAGE NO: 0
ROW NO: 12, CARD NO: 12, PAGE NO: 0
ROW NO: 13, CARD NO: 13, PAGE NO: 0
ROW NO: 14, CARD NO: 14, PAGE NO: 0
ROW NO: 15, CARD NO: 15, PAGE NO: 0
ROW NO: 16, CARD NO: 16, PAGE NO: 0
ROW NO: 17, CARD NO: 17, PAGE NO: 0
ROW NO: 18, CARD NO: 18, PAGE NO: 0
ROW NO: 19, CARD NO: 19, PAGE NO: 0
ROW NO: 20, CARD NO: 20, PAGE NO: 0
ROW NO: 21, CARD NO: 21, PAGE NO: 0
...next page
ROW NO: 22, CARD NO: 0, PAGE NO: 1
ROW NO: 23, CARD NO: 1, PAGE NO: 1
ROW NO: 24, CARD NO: 2, PAGE NO: 1
ROW NO: 25, CARD NO: 3, PAGE NO: 1
ROW NO: 26, CARD NO: 4, PAGE NO: 1
ROW NO: 27, CARD NO: 5, PAGE NO: 1
ROW NO: 28, CARD NO: 6, PAGE NO: 1
ROW 

COMPANY DESCRIPTION
Beyondsoft International (Singapore) Pte. Ltd. was set up in 2007 and established as the regional headquarters for the Southeast Asia (SEA) and European markets in September 2015. Based on our vision of "Using technology to promote social progress, economic development and become a global customer preferred partner" and our concept of "Beyond your expectations", Beyondsoft is committed to provide our customers in countries along the "Belt and Road" with comprehensive solutions and products and creating commercial value for customers to realizing continuous businesses development.
Our core business includes:
IT development services providing customers with IT consulting, software research and development, software and hardware testing, system integration and operation and maintenance, data analysis and other services;
New retail solutions and products through intelligent products, helping small and medium-sized enterprises (SMEs) realize the digital transformation of

ROW NO: 1, URL: https://www.mycareersfuture.gov.sg/job/information-technology/software-engineer-dbsg-beyondsoft-international-d297e04a21fa7e851db99d783b019dfc?source=MCF&event=Search
ROW NO: 2, URL: https://www.mycareersfuture.gov.sg/job/engineering/ai-engineer-prestige-media-management-804b032c1841d817dca5b822c0a82d6e?source=MCF&event=SuggestedJob
ROW NO: 3, URL: https://www.mycareersfuture.gov.sg/job/information-technology/generative-ai-strategist-amazon-web-services-singapore-28764f36a4ee91c2265ca43be96f7dc1?source=MCF&event=Search
ROW NO: 4, URL: https://www.mycareersfuture.gov.sg/job/information-technology/product-manager-singapore-telecommunications-fd2441172d2363b678bcf4f418ab337d?source=MCF&event=Search
ROW NO: 5, URL: https://www.mycareersfuture.gov.sg/job/engineering/ai-asic-engineer-ic-design-entry-level-expedera-rd-1b0b0d9922df36fd285b04baf2be2864?source=MCF&event=Search
no posted info
ROW NO: 6, URL: https://www.mycareersfuture.gov.sg/job/information-technology/ai-rpa-robo

In [51]:
df

Unnamed: 0,url,company_name,job_title,location,employment_type,job_level,job_function,salary_lower,salary_upper,salary_period,num_applications,posted_date,address,experience,closing_date,job_description
0,https://www.mycareersfuture.gov.sg/job/informa...,BEYONDSOFT INTERNATIONAL (SINGAPORE) PTE. LTD.,Artificial Intelligence (AI) Application Devel...,East,Full Time,Executive,Information Technology,6000.0,8000.0,Monthly,1,25 Apr 2024,,,25 May 2024,Roles & Responsibilities\nCOMPANY DESCRIPTION\...
1,https://www.mycareersfuture.gov.sg/job/informa...,BEYONDSOFT INTERNATIONAL (SINGAPORE) PTE. LTD.,Software Engineer (AI / ML) - DBSG,East,Full Time,Executive,Information Technology,7000.0,8500.0,Monthly,2,25 Apr 2024,,,25 May 2024,Roles & Responsibilities\nCOMPANY DESCRIPTION\...
2,https://www.mycareersfuture.gov.sg/job/enginee...,PRESTIGE MEDIA MANAGEMENT PTE. LTD.,AI ENGINEER,Islandwide,Full Time,Professional,Engineering5 Years Exp,5000.0,11000.0,Monthly,15,01 Apr 2024,,,01 May 2024,Roles & Responsibilities\nHee Labs is a Silico...
3,https://www.mycareersfuture.gov.sg/job/informa...,AMAZON WEB SERVICES SINGAPORE PRIVATE LIMITED,Generative AI Strategist,Central,Permanent,Professional,Information Technology,13000.0,25000.0,Monthly,67,25 Apr 2024,,,25 May 2024,Roles & Responsibilities\nAmazon Web Services ...
4,https://www.mycareersfuture.gov.sg/job/informa...,SINGAPORE TELECOMMUNICATIONS LIMITED,Product Manager (AI & Data),East,Full Time,Manager,Information Technology,5000.0,10000.0,Monthly,4,25 Apr 2024,,,09 May 2024,Roles & Responsibilities\nBe a part of somethi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1258,https://www.mycareersfuture.gov.sg/job/educati...,NANYANG TECHNOLOGICAL UNIVERSITY,Senior Lecturer in Information Engineering and...,West,Full Time,Professional,Education And Training,6000.0,12000.0,Monthly,16,26 Mar 2024,,,25 Apr 2024,"Roles & Responsibilities\nEstablished in 1981,..."
1259,https://www.mycareersfuture.gov.sg/job/educati...,NANYANG TECHNOLOGICAL UNIVERSITY,Assistant Professor for Intelligent Systems,West,Full Time,Professional,Education And Training,7000.0,12000.0,Monthly,2,26 Mar 2024,,,25 Apr 2024,Roles & Responsibilities\nTenure-track Assista...
1260,https://www.mycareersfuture.gov.sg/job/healthc...,CHUGAI PHARMABODY RESEARCH PTE. LTD.,Senior Scientist (Lead Optimization),South,Permanent,Professional,Healthcare / Pharmaceutical,5500.0,7000.0,Monthly,19,26 Mar 2024,,,25 Apr 2024,Roles & Responsibilities\nCPR is seeking for a...
1261,https://www.mycareersfuture.gov.sg/job/enginee...,LEXBUILD INTERNATIONAL PTE. LTD.,Automotive Engineer,West,Permanent,Executive,Non-Engineering,5000.0,7000.0,Monthly,8,26 Mar 2024,,,25 Apr 2024,Roles & Responsibilities\nJob Description\nDev...


In [52]:
df.to_csv("combined_careersfuture_jobs.csv", index = False)

In [53]:
# driver = webdriver.Chrome()
# wait = WebDriverWait(driver, 3)
# search_term = "machine learning" 
# page_no = 0
# driver.get(f"https://www.mycareersfuture.gov.sg/search?search={search_term}&sortBy=new_posting_date&page={page_no}")
# try:
#     card_element = wait.until(EC.element_to_be_clickable((By.XPATH, f"//div[@id='job-card-{0}']/div[@class='card relative']/a[@class='bg-white mb3 w-100 dib v-top pa3 no-underline flex-ns flex-wrap JobCard__card___22xP3']")))
#     print(card_element.get_attribute("href"))
#     # url = wait.until(EC.element_to_be_clickable((By.XPATH, f"//div[@id='job-card-{card_id}']/div[@class='card relative']/a[@class='bg-white mb3 w-100 dib v-top pa3 no-underline flex-ns flex-wrap JobCard__card___22xP3']"))).get_attribute("href")
#     # print(f"url: {url}")
# except TimeoutException as err:
#     print(err.msg)
# driver.quit()