In [1]:
import random
import time
from datetime import datetime
from os import listdir, getcwd, makedirs
from os.path import isfile, join, exists
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



class JobPost:
    """Simple record bundling the fields of one job posting.

    Attributes are stored exactly as passed in: ``title``, ``poster``,
    ``link``, ``date``, ``desc`` and ``tags``.
    """

    def __init__(self, title, poster, date, link, desc, tags):
        # Identification of the posting.
        self.title, self.poster, self.link = title, poster, link
        # Posting date, description text and associated tags.
        self.date, self.desc, self.tags = date, desc, tags

def scrape_html(res: str, verbose = False):
    """Extract the job posts of one search-results page into a DataFrame.

    Parameters
    ----------
    res : str
        HTML text of a results page.
    verbose : bool, default False
        When true, print the title of every job post that is found.

    Returns
    -------
    pandas.DataFrame
        One row per job post with the columns ``Title``, ``Author``, ``Date``,
        ``Description``, ``Link`` and ``Tags`` (tag texts joined with commas).
        A field whose element is absent from a post is an empty string, so a
        value never leaks over from the previously parsed post. Elements
        without a title node are skipped.
    """
    columns = ["Title", "Author", "Date", "Description", "Link", "Tags"]
    xpath = "/html/body/section[1]/div/div[2]/div[2]/div[1]/div" # should return list of job post elements
    date_prefix = "Posted on "

    def first_text(nodes):
        # Stripped text of the first matched node; "" when nothing matched or the node has no text.
        if not nodes or nodes[0].text is None:
            return ""
        return nodes[0].text.strip()

    # res contains html text
    soup = BeautifulSoup(res, "html.parser")
    dom = etree.HTML(str(soup))
    if dom is None:
        # Guard: nothing to query if the parser produced no tree.
        return pd.DataFrame(columns=columns)

    records = []
    for subdom in dom.xpath(xpath):
        title_nodes = subdom.xpath("./a/dl/dt/h4") # zooming into title
        if not title_nodes:
            continue
        title = first_text(title_nodes)
        if verbose:
            print("Found job post: " + title)

        link_nodes = subdom.xpath("./a")
        link = (link_nodes[0].get("href") or "").strip() if link_nodes else ""

        # Drop the trailing separator characters (spaces / bullets) after the poster name.
        poster = first_text(subdom.xpath("./a/p")).rstrip(" •").strip()

        date = first_text(subdom.xpath("./a/p/em"))
        # Remove the literal prefix only; str.lstrip would strip a *set of characters*.
        if date.startswith(date_prefix):
            date = date[len(date_prefix):]
        date = date.strip()

        desc = first_text(subdom.xpath("./div/a"))

        # Tags are rebuilt for every post; anchors without text are ignored.
        tags = [node.text for node in subdom.xpath("./div[2]/a") if node.text is not None]

        records.append({
            "Title": title,
            "Author": poster,
            "Date": date,
            "Description": desc,
            "Link": link,
            "Tags": ",".join(tags),
        })

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records, columns=columns)

def scrape_olj_web(pages = 2, search_key = None, verbose = False):
    """Load job-search result pages in Firefox, save them, and parse them.

    For every page the rendered HTML is written to
    ``<timestamp>[_<search_key>]/olscrape<i>.html`` (relative to the current
    working directory) and then handed to ``scrape_html``.

    Parameters
    ----------
    pages : int, default 2
        Number of result pages to fetch. Page ``i`` is requested with the path
        index ``max(i * 30, 1)``.
    search_key : str or None, default None
        Keyword for the ``jobkeyword`` query parameter; it is URL-encoded
        before being placed in the query string. ``None`` fetches the listing
        without a query string.
    verbose : bool, default False
        Forwarded to ``scrape_html``.

    Returns
    -------
    pandas.DataFrame
        All parsed job posts, with a fresh 0..n-1 index. Empty (columns only)
        when no page was fetched.
    """
    columns = ["Title", "Author", "Date", "Description", "Link", "Tags"]

    date = datetime.now().strftime("%m-%d-%Y-%H-%M")
    folder = date + "_" + search_key if search_key else date
    makedirs(folder, exist_ok=True)

    base_url = "https://www.onlinejobs.ph/jobseekers/jobsearch/"
    if search_key is not None:
        add_url = (
            "?jobkeyword=%s&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1"
            % quote_plus(str(search_key))
        )
    else:
        add_url = ""

    frames = []
    driver = webdriver.Firefox()
    try:
        for i in range(pages):
            # NOTE(review): the first page uses index 1 rather than 0 — confirm against the site.
            ind = max(i * 30, 1)
            new_url = base_url + str(ind) + add_url
            print("Fetching url: ", new_url)
            driver.get(new_url)
            res = driver.page_source

            # Persist the raw page first so it is kept even if parsing fails.
            # Explicit UTF-8 avoids encode errors under non-UTF-8 default locales.
            with open(join(folder, "olscrape" + str(i) + ".html"), "w", encoding="utf-8") as file:
                file.write(res)

            frames.append(scrape_html(res, verbose=verbose))

            # Random pause between page loads; nothing to wait for after the last one.
            if i < pages - 1:
                sleep_time = random.randint(23,64)
                print("Sleeping for: %s sec(s)..." % (sleep_time))
                time.sleep(sleep_time)
    finally:
        # Always close the browser, including when a page load or parse raises.
        driver.quit()

    if not frames:
        return pd.DataFrame(columns=columns)
    # Single concat; ignore_index gives unique row labels across pages.
    return pd.concat(frames, ignore_index=True)



In [2]:
# Scrape ten result pages for the "developer" keyword, then preview the first rows.
jobs = scrape_olj_web(
    pages=10,
    search_key="developer",
    verbose=False,
)
jobs.head()

Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/1?jobkeyword=developer&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1
Sleeping for: 23 sec(s)...
Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/30?jobkeyword=developer&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1
Sleeping for: 62 sec(s)...
Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/60?jobkeyword=developer&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1
Sleeping for: 38 sec(s)...
Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/90?jobkeyword=developer&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1
Sleeping for: 62 sec(s)...
Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/120?jobkeyword=developer&skill_tags=&gig=on&partTime=on&fullTime=on&isFromJobsearchForm=1
Sleeping for: 48 sec(s)...
Fetching url:  https://www.onlinejobs.ph/jobseekers/jobsearch/150?jobkeyword=developer&skill_tags=&gig=on&part

Unnamed: 0,Title,Author,Date,Description,Link,Tags
0,Website Developer,Carlos Garza,"Dec 10, 2024",We are seeking a creative and skilled Website ...,/jobseekers/job/Website-Developer-1277963,"Wordpress,Website Builder"
1,Developer Needed for AI-Powered Educational Ap...,John-Michael Kuczynski,"Dec 10, 2024",We're seeking a developer to create a prototyp...,/jobseekers/job/Developer-Needed-for-AI-Powere...,"Fullstack,Mobile Development,API Development"
2,Full Stack Developer,denis estimon,"Dec 10, 2024",a talented Full Stack Developer proficient in ...,/jobseekers/job/Full-Stack-Developer-1277870,"React JS,Next JS,PostgreSQL"
3,Web Developer,Arash Law - PH,"Dec 10, 2024",Job Overview:,/jobseekers/job/Web-Developer-1259554,"Web Programming,Javascript,Fullstack"
4,SENIOR AUTOMATION DEVELOPER,Nick Cornelius,"Dec 10, 2024",We're looking for an exceptional Senior Develo...,/jobseekers/job/SENIOR-AUTOMATION-DEVELOPER-12...,"Marketing Automation,API Development,Team Lead..."


In [3]:
# Link of the job post at row position 11.
jobs["Link"].iloc[11]

'/jobseekers/job/Full-Stack-Developer-1226483'

In [None]:
# Export the scraped posts to a CSV named after the current timestamp (month-day-year-hour-minute).
jobs.to_csv(
    f"{datetime.now():%m-%d-%Y-%H-%M}.csv",
    sep=',',
    index=False,
    encoding='utf-8',
)