In [1]:
# Install if you have never used these: unblock the lines below to install if needed

#!pip install webdriver-manager
#!pip3 install lxml
#!pip3 install selenium
#!pip3 install webdriver_manager
#!pip install --upgrade pip
#!pip install -U selenium

In [1]:
# --------- import necessary modules -------

# For webscraping
from bs4 import BeautifulSoup

# Parsing and creating xml data
from lxml import etree as et

# Store data as a csv file written out
from csv import writer

# In general to use with timing our function calls to Indeed
import time

# Assist with creating incremental timing for our scraping to seem more human
from time import sleep

# Dataframe stuff
import pandas as pd

# Random integer for more realistic timing for clicks, buttons and searches during scraping
from random import randint

# Multi Threading
import threading

# Threading:
from concurrent.futures import ThreadPoolExecutor, wait

In [2]:
import selenium

# Check version I am running
selenium.__version__

'4.19.0'

In [3]:
# Selenium 4:

from selenium import webdriver

# Starting/Stopping Driver: can specify ports or location but not remote access
from selenium.webdriver.chrome.service import Service as ChromeService

# Manages Binaries needed for WebDriver without installing anything directly
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Allows searchs similar to beautiful soup: find_all
from selenium.webdriver.common.by import By

# Try to establish wait times for the page to load
from selenium.webdriver.support.ui import WebDriverWait

# Wait for specific condition based on defined task: web elements, boolean are examples
from selenium.webdriver.support import expected_conditions as EC

# Used for keyboard movements, up/down, left/right,delete, etc
from selenium.webdriver.common.keys import Keys

# Locate elements on page and throw error if they do not exist
from selenium.common.exceptions import NoSuchElementException

In [5]:
import csv
from time import sleep
from random import randint
from datetime import datetime

job_ = ['business+analyst','data+analyst','software+engineer','machine+learning']
loc = 'Singapore'

paginaton_url_ = 'https://sg.indeed.com/jobs?q={}&l={}&sort=date&start={}'

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
p_ = []
salary_list_ = []
company_list_ = []
location_list_ = []
job_description_list_ = []
posted_date_list_ = []
url_list_ = []

# Loop through each job and location combination
for job in job_:
    for i in range(0, 15):  # Scrape 15 pages for each job for latest job postings
        driver.get(paginaton_url_.format(job, loc, i * 10))
        sleep(randint(2, 5))

        # Scroll through the page and obtain job listings one by one
        job_page = driver.find_element(By.ID, "mosaic-jobResults")
        jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

        for idx, jj in enumerate(jobs):
                    try:
                        # Scroll to the current job listing
                        driver.execute_script("arguments[0].scrollIntoView(true);", jj)
                        sleep(1)

                        # Click on the job listing
                        jj.click()
                        sleep(2)  # Add a short pause after clicking

                    except:
                        close_button = driver.find_element(By.CSS_SELECTOR, "button[aria-label='close']")
                        close_button.click()
                        sleep(1)  # Add a short pause after closing the pop-up

                    # job title
                    try:
                        job_title = driver.find_element(By.CLASS_NAME, "jobsearch-JobInfoHeader-title-container").text
                        p_.append(job_title)
                    except:
                        p_.append(None)
                        
                    # salary
                    try:
                        salary = driver.find_element(By.CSS_SELECTOR, "div[data-testid='jobsearch-OtherJobDetailsContainer']").text
                        salary_list_.append(salary)
                    except:
                        salary_list_.append(None)
            

                    # company name
                    try:
                        company = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyName']").text
                        company_list_.append(company)
                    except:
                        company_list_.append(None)    

                    # company location
                    try:
                        location = driver.find_element(By.CSS_SELECTOR, "div[data-testid='inlineHeader-companyLocation']").text
                        location_list_.append(location)
                    except:
                        location_list_.append(None)

                    # job description
                    try:
                        job_description = driver.find_element(By.ID, "jobDescriptionText").text
                        job_description_list_.append(job_description)
                    except:
                        job_description_list_.append(None)
                    
                    # posting date
                    try:
                        posted_date_element = driver.find_element(By.CSS_SELECTOR, "span[data-testid='myJobsStateDate']")
                        posted_date = posted_date_element.get_attribute("textContent")
                        posted_date_list_.append(posted_date)
                    except:
                        posted_date_list_.append(None)
                    
                    # URL
                    try:
                        job_url = driver.current_url
                        url_list_.append(job_url)
                    except:
                        url_list_.append(None)
                    
            
driver.quit()

# Write data to CSV file
with open('indeed_jobs_raw.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Job Title', 'Company', 'Salary', 'Job Type', 'Location', 'Job Description', 'Posted Date Raw', 'Today Date', 'Job Posting Date','URL']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    today_date = datetime.now()
    today_date = today_date.strftime("%d/%m/%Y")
    
    writer.writeheader()
    for job, company, salary, location, job_description, posted_date, url in zip(p_, company_list_, salary_list_, location_list_, job_description_list_, posted_date_list_, url_list_):
        writer.writerow({'Job Title': job, 'Company': company, 'Salary': salary, 'Location': location, 'Job Description': job_description, 'Posted Date Raw': posted_date, 'Today Date': today_date, 'URL': url})


## PREPROCESSING FOR INDEED

In [6]:
data = pd.read_csv('indeed_jobs_raw.csv')
data

Unnamed: 0,Job Title,Company,Salary,Job Type,Location,Job Description,Posted Date Raw,Today Date,Job Posting Date,URL
0,Commercial Analyst Intern\n- job post,foodpanda,Contract,,Singapore 069542,Company Description\n\n“To be the most loved e...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
1,Country Real Estate Services (RES) Analyst\n- ...,Citi,Full-time,,Singapore,Whether you’re at the start of your career or ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
2,"Senior Data Analyst, O&T Quality Assurance - I...",Citi,Full-time,,Singapore,The O&T Chief Operating Office (O&T COO) is a ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
3,,,,,,,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
4,IT Business Analyst\n- job post,PRUDENTIAL ASSURANCE COMPANY SINGAPORE (PTE) L...,"$5,000 - $7,000 a month - Permanent, Full-time",,Singapore,"Job Summary:\nIn this role, you will have an i...",PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
...,...,...,...,...,...,...,...,...,...,...
895,Mid-Market Account Executive\n- job post,Rubrik Job Board,,,Singapore,The Account Executive will take extreme owners...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
896,"Big Data Engineer, Data Platform\n- job post",NodeFlair,"$6,500 - $13,000 a month",,Singapore,"Job Summary\n\nSalary\nS$6,500 - S$13,000 / Mo...",PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
897,Head of Machine Learning\n- job post,Robert Walters,,,Singapore,Our client is seeking a passionate and experie...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
898,Information Security Engineer\n- job post,DSO National Laboratories,,,Singapore 118225,Responsibilities\nDSO National Laboratories (D...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...


In [9]:
# Remove null rows
data.dropna(subset=['Job Title'], inplace=True)
data

Unnamed: 0,Job Title,Company,Salary,Job Type,Location,Job Description,Posted Date Raw,Today Date,Job Posting Date,URL
0,Commercial Analyst Intern\n,foodpanda,Contract,,Singapore 069542,Company Description\n\n“To be the most loved e...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
1,Country Real Estate Services (RES) Analyst\n,Citi,Full-time,,Singapore,Whether you’re at the start of your career or ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
2,"Senior Data Analyst, O&T Quality Assurance",Citi,Full-time,,Singapore,The O&T Chief Operating Office (O&T COO) is a ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
4,IT Business Analyst\n- job post,PRUDENTIAL ASSURANCE COMPANY SINGAPORE (PTE) L...,"$5,000 - $7,000 a month - Permanent, Full-time",,Singapore,"Job Summary:\nIn this role, you will have an i...",PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
5,#SGunited Jobs Business Analyst\n- job post,ITCAN PTE. LIMITED,"$5,500 - $6,500 a month - Full-time",,Singapore,Responsibilities :\nRe-engineer the end-to-end...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
...,...,...,...,...,...,...,...,...,...,...
895,Mid-Market Account Executive\n- job post,Rubrik Job Board,,,Singapore,The Account Executive will take extreme owners...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
896,"Big Data Engineer, Data Platform\n- job post",NodeFlair,"$6,500 - $13,000 a month",,Singapore,"Job Summary\n\nSalary\nS$6,500 - S$13,000 / Mo...",PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
897,Head of Machine Learning\n- job post,Robert Walters,,,Singapore,Our client is seeking a passionate and experie...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
898,Information Security Engineer\n- job post,DSO National Laboratories,,,Singapore 118225,Responsibilities\nDSO National Laboratories (D...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...


In [10]:
# Remove additional text of -job post in Job Title Col
for index, row in data.iterrows():
    parts = row['Job Title'].split('-', 1)
    data.loc[index, 'Job Title'] = parts [0]

data

Unnamed: 0,Job Title,Company,Salary,Job Type,Location,Job Description,Posted Date Raw,Today Date,Job Posting Date,URL
0,Commercial Analyst Intern\n,foodpanda,Contract,,Singapore 069542,Company Description\n\n“To be the most loved e...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
1,Country Real Estate Services (RES) Analyst\n,Citi,Full-time,,Singapore,Whether you’re at the start of your career or ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
2,"Senior Data Analyst, O&T Quality Assurance",Citi,Full-time,,Singapore,The O&T Chief Operating Office (O&T COO) is a ...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
4,IT Business Analyst\n,PRUDENTIAL ASSURANCE COMPANY SINGAPORE (PTE) L...,"$5,000 - $7,000 a month - Permanent, Full-time",,Singapore,"Job Summary:\nIn this role, you will have an i...",PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
5,#SGunited Jobs Business Analyst\n,ITCAN PTE. LIMITED,"$5,500 - $6,500 a month - Full-time",,Singapore,Responsibilities :\nRe-engineer the end-to-end...,PostedJust posted,24/04/2024,,https://sg.indeed.com/jobs?q=business+analyst&...
...,...,...,...,...,...,...,...,...,...,...
895,Mid,Rubrik Job Board,,,Singapore,The Account Executive will take extreme owners...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
896,"Big Data Engineer, Data Platform\n",NodeFlair,"$6,500 - $13,000 a month",,Singapore,"Job Summary\n\nSalary\nS$6,500 - S$13,000 / Mo...",PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
897,Head of Machine Learning\n,Robert Walters,,,Singapore,Our client is seeking a passionate and experie...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...
898,Information Security Engineer\n,DSO National Laboratories,,,Singapore 118225,Responsibilities\nDSO National Laboratories (D...,PostedPosted 13 days ago,24/04/2024,,https://sg.indeed.com/jobs?q=machine+learning&...


In [15]:
# To seperate the salary and Job Type during web scraping
for index, row in data.iterrows():
    if row['Salary'] == "nan":
        pass
    elif '$' not in str(row['Salary']):
        data.loc[index, 'Job Type'] = row['Salary']
        data.loc[index, 'Salary'] = None
    elif ("Up to" in row['Salary'] or "From" in row['Salary']) and row['Salary'].count('-') >= 1:
        parts = row['Salary'].split('-', 1)
        data.loc[index, 'Salary'] = parts[0] 
        data.loc[index, 'Job Type'] = parts[1]
    elif row['Salary'].count('-') >= 2:
        parts = row['Salary'].split('-', 2)
        data.loc[index, 'Salary'] = parts[0] + '-' + parts[1]
        data.loc[index, 'Job Type'] = parts[2]

data

Unnamed: 0,Job Title,Company,Salary,Job Type,Location,Job Description,Posted Date Raw,Today Date,Job Posting Date,URL
0,Commercial Analyst Intern\n,foodpanda,,,Singapore 069542,Company Description\n\n“To be the most loved e...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
1,Country Real Estate Services (RES) Analyst\n,Citi,,,Singapore,Whether you’re at the start of your career or ...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
2,"Senior Data Analyst, O&T Quality Assurance",Citi,,,Singapore,The O&T Chief Operating Office (O&T COO) is a ...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
4,IT Business Analyst\n,PRUDENTIAL ASSURANCE COMPANY SINGAPORE (PTE) L...,"$5,000 - $7,000 a month","Permanent, Full-time",Singapore,"Job Summary:\nIn this role, you will have an i...",PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
5,#SGunited Jobs Business Analyst\n,ITCAN PTE. LIMITED,"$5,500 - $6,500 a month",Full-time,Singapore,Responsibilities :\nRe-engineer the end-to-end...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
...,...,...,...,...,...,...,...,...,...,...
895,Mid,Rubrik Job Board,,,Singapore,The Account Executive will take extreme owners...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
896,"Big Data Engineer, Data Platform\n",NodeFlair,"$6,500 - $13,000 a month",,Singapore,"Job Summary\n\nSalary\nS$6,500 - S$13,000 / Mo...",PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
897,Head of Machine Learning\n,Robert Walters,,,Singapore,Our client is seeking a passionate and experie...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
898,Information Security Engineer\n,DSO National Laboratories,,,Singapore 118225,Responsibilities\nDSO National Laboratories (D...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...


In [13]:
# To indicate the listing date from posted date information
import re
from datetime import datetime, timedelta

# Convert text to datetime for manipulation 
data['Today Date'] = pd.to_datetime(data['Today Date'], format='%d/%m/%Y')

# Convert 'Today Date' to string with 'day/month/year' format
data['Today Date'] = data['Today Date'].dt.strftime('%d/%m/%Y')

for index, row in data.iterrows():
    if not re.search(r'\d', row['Posted Date Raw']):
        data.loc[index, 'Job Posting Date'] = row['Today Date']
    else:
        days = re.findall(r'\d+', row['Posted Date Raw'])
        days_int = sum(map(int, days))
        listing_date = pd.to_datetime(row['Today Date'], format='%d/%m/%Y') - pd.DateOffset(days=days_int)
        data.loc[index, 'Job Posting Date'] = listing_date.strftime('%d/%m/%Y')  # Change the format here

# Convert 'Job Posting Date' column to datetime dtype
data['Job Posting Date'] = pd.to_datetime(data['Job Posting Date'], format='%d/%m/%Y')

data

Unnamed: 0,Job Title,Company,Salary,Job Type,Location,Job Description,Posted Date Raw,Today Date,Job Posting Date,URL
0,Commercial Analyst Intern\n,foodpanda,,Contract,Singapore 069542,Company Description\n\n“To be the most loved e...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
1,Country Real Estate Services (RES) Analyst\n,Citi,,Full-time,Singapore,Whether you’re at the start of your career or ...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
2,"Senior Data Analyst, O&T Quality Assurance",Citi,,Full-time,Singapore,The O&T Chief Operating Office (O&T COO) is a ...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
4,IT Business Analyst\n,PRUDENTIAL ASSURANCE COMPANY SINGAPORE (PTE) L...,"$5,000 - $7,000 a month","Permanent, Full-time",Singapore,"Job Summary:\nIn this role, you will have an i...",PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
5,#SGunited Jobs Business Analyst\n,ITCAN PTE. LIMITED,"$5,500 - $6,500 a month",Full-time,Singapore,Responsibilities :\nRe-engineer the end-to-end...,PostedJust posted,24/04/2024,2024-04-24,https://sg.indeed.com/jobs?q=business+analyst&...
...,...,...,...,...,...,...,...,...,...,...
895,Mid,Rubrik Job Board,,,Singapore,The Account Executive will take extreme owners...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
896,"Big Data Engineer, Data Platform\n",NodeFlair,"$6,500 - $13,000 a month",,Singapore,"Job Summary\n\nSalary\nS$6,500 - S$13,000 / Mo...",PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
897,Head of Machine Learning\n,Robert Walters,,,Singapore,Our client is seeking a passionate and experie...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...
898,Information Security Engineer\n,DSO National Laboratories,,,Singapore 118225,Responsibilities\nDSO National Laboratories (D...,PostedPosted 13 days ago,24/04/2024,2024-04-11,https://sg.indeed.com/jobs?q=machine+learning&...


In [16]:
data.to_csv('indeed_jobs_modified.csv', index=False)