In [311]:
import pandas as pd
import selenium.common.exceptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection

# Public LinkedIn job search page that the browser is pointed at.
JOBS_URL = "https://www.linkedin.com/jobs/search"

# When True, connect to a remote Scraping Browser instead of launching local Chrome.
USE_PROXY = False

# CSV of previously scraped jobs.
SCRAP_FILE = "./scrapped_data/job_listings1.csv"

if USE_PROXY:
    import os

    # The proxy credentials were never defined in this notebook (NameError on a
    # fresh kernel). Read them from the environment so the cell is reproducible
    # and no secret is stored in the file. A missing variable raises KeyError.
    USER = os.environ["SBR_USER"]
    PASS = os.environ["SBR_PASS"]

    AUTH = f'{USER}:{PASS}'
    SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'

    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    driver = Remote(command_executor=sbr_connection, options=ChromeOptions())
else:
    driver = webdriver.Chrome()

# Open the search page; every later cell works against this `driver` session.
driver.get(JOBS_URL)


Connecting to Scraping Browser...


In [242]:
# Save a screenshot of the current browser page to destination.png (the cell shows the call's return value).
driver.get_screenshot_as_file("destination.png")

True

In [312]:
# from container import parse_job_listing
import time
import re
import pprint
# from constants import BUTTONS

# Locator table: name -> [strategy, selector]. Entries are unpacked straight into
# `driver.find_element(*BUTTONS[name])`.
BUTTONS = {
    # A class-name locator accepts a single class only; the previous value was a
    # multi-line, space-separated class list (with stray leading quotes) that could
    # never match. Express the compound class requirement as a CSS selector instead.
    "SHOW_MORE": [By.CSS_SELECTOR, ".show-more-less-html__button.show-more-less-html__button--more"],
    "JOB_PAGE": [By.CSS_SELECTOR, "body > div.base-serp-page > div > section"],
    "DESCRIPTION": [By.CSS_SELECTOR, "body > div.base-serp-page > div > section > div.details-pane__content.details-pane__content--show > div > section.core-section-container.my-3.description > div > div > section > div"],
    # Raw string: `\:` is the CSS escape for the colon in the class name; in a normal
    # string literal it is an invalid Python escape sequence (same value, but warns).
    "COMPANY_ID": [By.CSS_SELECTOR, r"body > div.base-serp-page > div > section > div.details-pane__content.details-pane__content--show > section > div > div.top-card-layout__entity-info-container.flex.flex-wrap.papabear\:flex-nowrap > div > h4 > div.face-pile.flex.see-who-was-hired > a"],
    "TOP_BAR": [By.CSS_SELECTOR, "body > div.base-serp-page > div > section > div.details-pane__content.details-pane__content--show > section > div"],
    "BOTTOM_BAR": [By.CSS_SELECTOR, "body > div.base-serp-page > div > section > div.details-pane__content.details-pane__content--show > div > section.core-section-container.my-3.description > div > ul"],
    "LOAD_MORE": [By.CSS_SELECTOR, "#main-content > section > button"]
}

# Column names of the scraped-jobs table, in output order.
COLS = [
    'company_id',
    'description',
    'formatted_experience_level',
    'formatted_work_type',
    'industries',
    'job_function',
    'job_id',
    'location',
    'title',
    'work_type',
]

In [313]:

def fetch_jobs(limit, max_polls=20):
    """Yield job-card elements from the search results list, loading more as needed.

    Reads the module-level `driver` and `BUTTONS`. After each batch of cards is
    consumed, the page is scrolled to the bottom and the "load more" button is
    clicked (best effort) so that further cards are appended to the list.

    Args:
        limit: Maximum number of job cards to yield.
        max_polls: How many times (0.5 s apart) to re-read the list while waiting
            for new cards before concluding that no more results will load.

    Yields:
        The `<li>` elements of the results list, each exactly once, in page order.
    """
    def fetch_jobs_list():
        return driver.find_element(By.CSS_SELECTOR, '#main-content > section > ul').find_elements(By.TAG_NAME, "li")

    batch = fetch_jobs_list()
    seen = len(batch)   # total number of cards handed out (or queued) so far
    yielded = 0

    while True:
        for item in batch:
            if yielded >= limit:
                print("Finished Scraping")
                return
            yielded += 1
            yield item
            # Throttle between cards so the site is not hit too quickly.
            time.sleep(1.5)

        # Do not trigger another page load when the quota is already met.
        if yielded >= limit:
            print("Finished Scraping")
            return

        time.sleep(0.5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            driver.find_element(*BUTTONS["LOAD_MORE"]).click()
        except selenium.common.exceptions.WebDriverException:
            # Best effort: the button may be absent or not clickable; scrolling
            # alone may be enough to load more cards.
            pass
        time.sleep(1)

        # Wait (bounded) until the list has grown past what was already seen.
        for _ in range(max_polls):
            updated = fetch_jobs_list()
            if len(updated) > seen:
                break
            time.sleep(0.5)
        else:
            print("No more job listings loaded, stopping")
            return

        # Slice from the running total so earlier cards are never yielded twice.
        batch = updated[seen:]
        seen = len(updated)


In [None]:
# Number of jobs successfully scraped during this run.
jobs_scrapped = 0
# Regexes that extract the numeric ids from URLs (company link href / current page URL).
pattern_companyId = r"facetCurrentCompany%3D(\d+)"
pattern_jobId = r"currentJobId=(\d+)"

# Rows collected during this run.
new_data = pd.DataFrame(columns=COLS)
# Rows scraped in earlier runs; used to skip jobs that are already stored.
try:
    data = pd.read_csv(SCRAP_FILE)
except FileNotFoundError:
    # First run: there is no earlier file yet, so start from an empty table
    # instead of crashing.
    data = pd.DataFrame(columns=COLS)

def _parse_job_criteria(lines):
    """Convert alternating label/value lines of the criteria list into job fields.

    Returns:
        (fields, ok): `ok` is False when a label is not recognised or a label has
        no value line after it; `fields` holds whatever was parsed up to that point.
    """
    fields = {}
    for idx in range(0, len(lines), 2):
        label = lines[idx]
        if idx + 1 >= len(lines):
            # Dangling label without a value (also covers an empty criteria list).
            return fields, False
        value = lines[idx + 1]
        if label == "Seniority level":
            fields["formatted_experience_level"] = "" if value == "Not Applicable" else value
        elif label == "Employment type":
            fields["work_type"] = value.upper()
            fields["formatted_work_type"] = value
        elif label == "Industries":
            fields["industries"] = value
        elif label == "Job function":
            fields["job_function"] = value
        else:
            return fields, False
    return fields, True


def scrap_job_element(job_element):
    """Click a job card and read its details from the details pane.

    Reads the module-level `driver`, `BUTTONS`, `pattern_jobId`,
    `pattern_companyId`, `new_data` and `data`.

    Args:
        job_element: The job card element to click.

    Returns:
        (job_data, error): `job_data` is a dict of the fields that could be read
        and always contains the key "job_id" (None when the id could not be found
        in the current URL). `error` is True when the job must not be stored:
        it was already scraped, or part of the page could not be parsed.

    Raises:
        Whatever `driver.find_element` raises when a pane element is missing;
        the caller uses that to retry.
    """
    job_element.click()
    job_data = {}
    # Give the details pane and the URL time to update after the click.
    time.sleep(1)

    id_match = re.search(pattern_jobId, driver.current_url)
    if id_match is None:
        # Previously this crashed with AttributeError on `.group`.
        print("Something went wrong, won't scrape this!")
        job_data["job_id"] = None
        return job_data, True
    _job_id = job_data["job_id"] = int(id_match.group(1))

    if _job_id in new_data["job_id"].values or _job_id in data["job_id"].values:
        print("(already scrapped)", end="")
        return job_data, True

    job_data["description"] = driver.find_element(*BUTTONS["DESCRIPTION"]).text.strip()

    company_href = driver.find_element(*BUTTONS["COMPANY_ID"]).get_attribute("href")
    company_match = re.search(pattern_companyId, company_href or "")
    if company_match is None:
        print("Something went wrong, won't scrape this!")
        return job_data, True
    job_data["company_id"] = company_match.group(1)

    top_bar = driver.find_element(*BUTTONS["TOP_BAR"]).text.split("\n")
    if len(top_bar) < 2:
        print("Something went wrong, won't scrape this!")
        return job_data, True
    job_data["title"] = top_bar[0]
    # Drops the first space-separated token of the second line.
    # NOTE(review): presumably meant to strip the company name; multi-word
    # company names will leave their remaining words in `location` - confirm.
    job_data["location"] = " ".join(top_bar[1].split(" ")[1:])

    bottom_bar = driver.find_element(*BUTTONS["BOTTOM_BAR"]).text.split("\n")
    criteria, parsed_ok = _parse_job_criteria(bottom_bar)
    job_data.update(criteria)
    if not parsed_ok:
        print("Something went wrong, won't scrape this!")
        return job_data, True
    return job_data, False


# Give up on a job after this many consecutive WebDriver failures instead of
# retrying forever.
MAX_DRIVER_RETRIES = 5

# Card processed in the previous iteration; re-clicked when a details pane fails
# to load. None until the first job has been handled.
element_before = None

for i, job_element in enumerate(fetch_jobs(100)):
    SUCCESS = False
    LOAD_TRIES = 0
    DRIVER_TRIES = 0
    while not SUCCESS:
        try:
            s = time.time()
            job_element.click()
            job_data, error = scrap_job_element(job_element)
            job_id = job_data["job_id"]
            print(f'[{jobs_scrapped}/{i}] - Scraped job - {job_id} took: {time.time() - s:.2f} sec ', end="")
            if error:
                print("ABORT")
                SUCCESS = True
                break
            print()
            # pd.concat is the public replacement for the private DataFrame._append.
            new_data = pd.concat([new_data, pd.DataFrame([job_data])], ignore_index=True)
            SUCCESS = True
            jobs_scrapped += 1
            time.sleep(1)
            # Checkpoint after every job so an interrupted run loses nothing.
            new_data.to_csv("scrapped_data/job_listings2.csv", index=False, header=True)
            if jobs_scrapped % 10 == 0:
                print(f'\t--- SCRAPPED {jobs_scrapped} JOBS ---')
                time.sleep(2)

        except selenium.common.exceptions.NoSuchElementException:
            # The details pane did not contain an expected element (yet).
            LOAD_TRIES += 1
            time.sleep(0.1)
            if element_before is not None:
                try:
                    # Click the previous card before retrying the current one.
                    element_before.click()
                except selenium.common.exceptions.WebDriverException:
                    # Best effort only; the previous card may no longer be clickable.
                    pass
            if LOAD_TRIES == 5:
                time.sleep(15)
            elif LOAD_TRIES == 8:
                SUCCESS = True
                break
            time.sleep(1)

        except selenium.common.exceptions.WebDriverException:
            DRIVER_TRIES += 1
            if DRIVER_TRIES >= MAX_DRIVER_RETRIES:
                print("WebDriver Exception persisted, skipping this job")
                break
            print("WebDriver Exception waiting and trying again...")
            time.sleep(2)
    element_before = job_element
print("DONE SCRAPPING")
new_data

In [317]:
# df1: the accumulated listings file (same path as SCRAP_FILE);
# df2: the file the scraping loop writes its rows to.
df1 = pd.read_csv("scrapped_data/job_listings1.csv")
df2 = pd.read_csv("scrapped_data/job_listings2.csv")

In [316]:
# Merge the new rows into the accumulated file. De-duplicate on job_id so that
# re-running this cell (or merging an already-merged job_listings2.csv) does not
# store the same posting twice; the first occurrence of each job is kept.
pd.concat([df1, df2]).drop_duplicates(subset="job_id").to_csv("scrapped_data/job_listings1.csv", index=False)

In [318]:
# Display the listings frame loaded from scrapped_data/job_listings1.csv.
df1

Unnamed: 0,company_id,description,formatted_experience_level,formatted_work_type,industries,job_function,job_id,location,title,work_type
0,1584738,Company Description\n\nSmartRecruiters is a va...,Associate,Full-time,"Technology, Information and Internet and Softw...",Strategy/Planning,3878432283,United States,Events Manager,FULL-TIME
1,101701751,Job Title: Remote Travel Data Entry Specialist...,Entry level,Full-time,Internet Publishing,Administrative,3878442071,"Venture Planners Chicago, IL",Data Entry Clerk - Remote,FULL-TIME
2,22615,The Neonatal ICU at Huntsville Hospital for Wo...,Mid-Senior level,Other,Hospitals and Health Care,Health Care Provider,3878285098,"Hospital Huntsville, AL",RN Staff-Neonatal ICU-PT-3rd Shift,OTHER
3,101701751,Job Title: Administrative Assistant/Customer S...,Entry level,Full-time,Internet Publishing,Administrative,3878438366,Venture Planners United States,Administrative Assistant/Customer Service,FULL-TIME
4,3055,Company Description\n\nDiscover the essence of...,Associate,Full-time,Hospitality,Other,3878432792,"Wailea, HI",Revenue Analyst (Full-time),FULL-TIME
...,...,...,...,...,...,...,...,...,...,...
228,99218917,This is a remote position.\n\nJunior Virtual A...,Entry level,Full-time,IT Services and IT Consulting,Information Technology,3878774793,"Learning Career New York, NY",Junior Virtual Assistant,FULL-TIME
229,2998,Help Help Requirements Conditions of Employmen...,Entry level,Full-time,Government Administration,Other,3879000341,"Department of Homeland Security Tustin, CA",Asylum Officer,FULL-TIME
230,18348748,Description\n\nWho We Are - MANSCAPED® is a le...,Mid-Senior level,Full-time,Personal Care Product Manufacturing,Accounting/Auditing and Finance,3878784776,United States,"Manager, Accounts Payable",FULL-TIME
231,99218917,This is a remote position.\n\nJunior React Dev...,Entry level,Full-time,IT Services and IT Consulting,Information Technology,3878779047,"Learning Career New York, NY",Junior React Developer,FULL-TIME
