# Part I - getting companies with jobs

## Imports, classes and methods

In [51]:
import asyncio
import aiohttp
import random
import socket
from bs4 import BeautifulSoup
import requests
import json
import csv
from IPython.display import display, Markdown, Latex

# This code was provided by Bright Data and slightly modified for our purposes (modifications are commented)

super_proxy = socket.gethostbyname('brd.superproxy.io') # the Bright Data superproxy ip

class SingleSessionRetriever:
    """Fetch a single URL through one proxy session.

    A session is identified by a random session id embedded in the proxy
    user name; the resulting proxy URL is built once per instance (see
    ``_reset_session``).
    """

    # Proxy URL template: user name, session id, password, then the port.
    url = "http://%s-session-%s:%s@" + super_proxy + ":%d"
    port = 22225

    def __init__(self, username, password):
        self._username = username
        self._password = password
        self._reset_session()

    def _reset_session(self):
        """Pick a fresh random session id and rebuild the proxy URL."""
        session_token = str(random.random())
        proxy_fields = (self._username, session_token, self._password, SingleSessionRetriever.port)
        self._proxy = self.url % proxy_fields

    async def retrieve(self, url, timeout, retrieve_func):
        """Download ``url`` through the proxy and hand the body to ``retrieve_func``.

        Returns whatever ``retrieve_func(content)`` returns when the response
        is OK, the HTTP status code when it is not, and 0 when any exception
        is raised while requesting, reading or processing the response.
        """
        async with aiohttp.ClientSession() as http:
            try:
                async with http.get(url, proxy=self._proxy, timeout=timeout) as reply:
                    if not reply.ok:
                        return reply.status
                    body = await reply.content.read()
                    # retrieve_func was added to customize what is extracted
                    # from the page and keep this class generic
                    return retrieve_func(body)
            except Exception:
                return 0


class MultiSessionRetriever:
    """Retrieve many URLs concurrently, switching proxy sessions as it goes.

    A new SingleSessionRetriever is created when no session exists yet or when
    the shared request counter reaches ``session_requests_limit``; an integer
    result (error) forces a new session for the next attempt.
    """
    
    def __init__(self, username, password, session_requests_limit, lock):
        self._username = username
        self._password = password
        self.session_requests_limit = session_requests_limit
        # Only the last element is ever read; earlier sessions are never removed.
        self._sessions_stack = []
        # Request counter shared by all concurrently running _retrieve_single calls.
        self._requests = 0
        self.max_tries = 10
        self.lock = lock # added so callbacks can safely modify shared objects (like shared dictionaries)

    async def retrieve(self, urls, timeout, parallel_sessions_limit, callback, retrieve_func, data_container):
        """Retrieve all ``urls``, running at most ``parallel_sessions_limit`` at once.

        ``retrieve_func`` and ``callback`` are passed through to every single
        retrieval; ``data_container`` is passed to ``callback`` unchanged.
        """
        semaphore = asyncio.Semaphore(parallel_sessions_limit)
        tasks = [self._retrieve_single(url, timeout, semaphore, callback, retrieve_func, data_container) for url in urls]
        await asyncio.gather(*tasks)

    async def _retrieve_single(self, url, timeout, semaphore, callback, retrieve_func, data_container):
        """Retrieve one URL, retrying while the result is an int (error code)."""
        async with semaphore:
            result = 0
            tries = 0
            # An int result means "error"; tries starts at 0 and the test is <=,
            # so up to max_tries + 1 requests are made.
            while type(result) ==  type(0) and tries <= self.max_tries:
                tries += 1
                if not self._sessions_stack or self._requests >= self.session_requests_limit:
                    if self._sessions_stack:
                        self._requests = 0
                    session_retriever = SingleSessionRetriever(self._username, self._password)
                    self._sessions_stack.append(session_retriever)
                else:
                    session_retriever = self._sessions_stack[-1]
                self._requests += 1
                result = await session_retriever.retrieve(url, timeout, retrieve_func)
                # on an error, push the counter to the limit so that the next attempt starts a new session
                if type(result) == type(0):
                    self._requests = self.session_requests_limit
            if result is not None:
                # data_container is modified in the callback function.
                # The callback also receives int results (errors that survived all retries).
                await callback(url, result, data_container, self.lock)
            elif tries > self.max_tries:
                # NOTE(review): reached only when result is None, i.e. when
                # retrieve_func itself returned None - confirm this is intended.
                print(f'Failed: {url}')

In [10]:
# These methods are for checking the status of companies jobs (found, not found, some error)

# The scraping method, checks for occurence of "See jobs" button
def seek_comps_with_jobs(content):
    """Classify a downloaded company page by the presence of the "See jobs" button.

    Returns 0 (meaning error) when the page contains the join-form submit
    button, otherwise "Jobs found!" or "No jobs".
    """
    page = BeautifulSoup(content, 'html.parser')

    # sometimes the "join LinkedIn" page is served instead of the company page
    if page.find('a', class_="join-form__form-body-submit-button"):
        return 0

    see_jobs = page.find('a', class_="top-card-layout__cta mt-2 ml-1.5 h-auto babybear:flex-auto top-card-layout__cta--primary btn-md btn-primary")
    if see_jobs:
        return "Jobs found!"
    return "No jobs"

# This checks the scraping result and stores it in answers dictionary
async def get_answer(url, result, answers, lock):
    """Store ``result`` for ``url`` in ``answers`` and report progress on a hit.

    ``result`` is "Jobs found!", "No jobs" or an integer that indicates an
    error. A progress line is printed only when jobs were found.
    """
    async with lock:
        answers[url] = result

    if result != "Jobs found!":
        return

    found_so_far = sum(1 for status in answers.values() if status == "Jobs found!")
    checked = len(answers)
    print(f'{found_so_far}, checked: {checked},\t last URL: {url}')

# The main scraping method for getting the jobs status from companies pages (based on the method provided by Bright Data)
# This needs the urls from the above cell
async def get_answers(answers, urls, username=None, password=None, *,
                      req_timeout=30, n_parallel_exit_nodes=50, switch_ip_every_n_req=20):
    """Scrape the jobs status of every company page in ``urls`` into ``answers``.

    Args:
        answers: dict filled in place, mapping each URL to "Jobs found!",
            "No jobs" or an integer error code.
        urls: iterable of company page URLs.
        username, password: proxy credentials. When omitted they are read from
            the BRIGHTDATA_USERNAME / BRIGHTDATA_PASSWORD environment
            variables, falling back to the values this notebook used so far.
        req_timeout: per-request timeout passed to the retriever.
        n_parallel_exit_nodes: maximum number of concurrent requests.
        switch_ip_every_n_req: number of requests after which a new proxy
            session is started.
    """
    import os  # local import so the cell does not depend on the import cell

    # TODO(security): credentials committed to a notebook should be rotated;
    # once they are, drop these literal fallbacks and rely on the environment.
    if username is None:
        username = os.environ.get('BRIGHTDATA_USERNAME', 'brd-customer-hl_80709a30-zone-datacenter_proxy_vlad')
    if password is None:
        password = os.environ.get('BRIGHTDATA_PASSWORD', 'c423aw7kxrym')

    lock = asyncio.Lock()
    retriever = MultiSessionRetriever(username, password, switch_ip_every_n_req, lock)
    await retriever.retrieve(urls, req_timeout, n_parallel_exit_nodes, get_answer, seek_comps_with_jobs, answers)

## Getting companies with jobs

In [2]:
# These are all the tech companies we downloaded from Databricks (those with the meta_industry "Technologies"), filtered by next conditions:
# 1. Company size is not null and larger than 10 employees 2. The "about" section is not empty (not null)
# These conditions restrict the companies list to those companies that more likely (in our opinion) are hiring
# one company URL per row, taken from the first CSV column
with open('tech_comps.csv', newline='') as f:
    urls = [row[0] for row in csv.reader(f)]

urls[:10]

['https://www.linkedin.com/company/community-compassion-services-inc',
 'https://www.linkedin.com/company/vernet-group',
 'https://www.linkedin.com/company/cloudsnooze',
 'https://www.linkedin.com/company/asian-pacific-seafood-llc',
 'https://www.linkedin.com/company/dnvtec',
 'https://www.linkedin.com/company/dealer-image-pro',
 'https://www.linkedin.com/company/novel-commerce',
 'https://www.linkedin.com/company/honest-networks',
 'https://www.linkedin.com/company/geeksontime',
 'https://www.linkedin.com/company/10web']

In [7]:
# Get the jobs status from all Tech companies (those with "Technology" meta industry)
answers = {}
print("Companies with jobs found:")
await get_answers(answers, urls)

Companies with jobs found:
1, checked: 13,	 last URL: https://www.linkedin.com/company/honest-networks
2, checked: 19,	 last URL: https://www.linkedin.com/company/abotts-consulting
3, checked: 31,	 last URL: https://www.linkedin.com/company/datawave-technologies-inc
4, checked: 40,	 last URL: https://www.linkedin.com/company/bunnystudio
5, checked: 46,	 last URL: https://www.linkedin.com/company/promptemr
6, checked: 47,	 last URL: https://www.linkedin.com/company/focusbroadband
7, checked: 60,	 last URL: https://www.linkedin.com/company/scalableco
8, checked: 68,	 last URL: https://www.linkedin.com/company/asd-global
9, checked: 85,	 last URL: https://www.linkedin.com/company/adaptive-telehealth
10, checked: 94,	 last URL: https://www.linkedin.com/company/acuityinternational
11, checked: 119,	 last URL: https://www.linkedin.com/company/american-data
12, checked: 126,	 last URL: https://www.linkedin.com/company/amches
13, checked: 128,	 last URL: https://www.linkedin.com/company/americ

In [8]:
# This is the number of all the Tech companies from the companies set 
len(answers)

11705

In [9]:
# What is the % of companies with jobs, out of all Tech companies we have?
jobs_urls = [key for key, value in answers.items() if value == "Jobs found!"]

In [10]:
# Scale to percent BEFORE rounding: rounding first and multiplying by 100
# afterwards can print float artefacts such as 9.200000000000001%.
# An empty answers dict is reported as 0.0% instead of dividing by zero.
jobs_share = 100 * len(jobs_urls) / len(answers) if answers else 0.0
print(f"{round(jobs_share, 2)}%")

9.2%


In [11]:
# output answers counts (we need "Jobs found!" messages to know how many companies with jobs we have)
from collections import Counter

# single pass over the answers instead of one full scan per distinct value
for ans, count in Counter(answers.values()).items():
    print(f'{ans}: {count}')

No jobs: 10444
Jobs found!: 1077
404: 139
999: 45


In [12]:
# store the links of companies with jobs
# one URL per line
with open('tech_comps_with_jobs.csv', 'w', newline='') as file:
    file.writelines(url + "\n" for url in jobs_urls)

## Hiring / not hiring companies - additional scraping of all tech companies 
(all company sizes, including Null, and those with empty "about" sections)

This is for the first research question: <strong>How likely is a company of each industry and size to be hiring, and what is
the distribution of the number of jobs</strong>?

In [6]:
# These are all the tech companies we downloaded from Databricks (those with the meta_industry "Technologies")
# one company URL per row, taken from the first CSV column
with open('tech_comps_all.csv', newline='') as f:
    urls_all = [row[0] for row in csv.reader(f)]

urls_all[:10]

['https://www.linkedin.com/company/kern-web-design',
 'https://www.linkedin.com/company/5-and-2-studio',
 'https://www.linkedin.com/company/autech-x',
 'https://www.linkedin.com/company/t-t-network-solutions-llc',
 'https://www.linkedin.com/company/just-in-time-tech',
 'https://www.linkedin.com/company/potomacsedge',
 'https://www.linkedin.com/company/magatecno',
 'https://www.linkedin.com/company/integrisoft-llc',
 'https://www.linkedin.com/company/copland-software-inc',
 'https://www.linkedin.com/company/network-newsc']

In [11]:
# Get the jobs status from all Tech companies (those with "Technology" meta industry)
answers_all = {}
print("Companies with jobs found:")
await get_answers(answers_all, urls_all)

Companies with jobs found:
1, checked: 504,	 last URL: https://www.linkedin.com/company/sow-inc
2, checked: 534,	 last URL: https://www.linkedin.com/company/kern-and-kern-llc
3, checked: 621,	 last URL: https://www.linkedin.com/company/patriot-marketing
4, checked: 639,	 last URL: https://www.linkedin.com/company/landmark-resources
5, checked: 4077,	 last URL: https://www.linkedin.com/company/balkans-io
6, checked: 4572,	 last URL: https://www.linkedin.com/company/honest-networks
7, checked: 4668,	 last URL: https://www.linkedin.com/company/focusbroadband
8, checked: 4711,	 last URL: https://www.linkedin.com/company/promptemr
9, checked: 4733,	 last URL: https://www.linkedin.com/company/abotts-consulting
10, checked: 4746,	 last URL: https://www.linkedin.com/company/datawave-technologies-inc
11, checked: 4791,	 last URL: https://www.linkedin.com/company/scalableco
12, checked: 4816,	 last URL: https://www.linkedin.com/company/acuityinternational
13, checked: 4823,	 last URL: https://ww

In [16]:
from collections import Counter

# single pass over the answers instead of one full scan per distinct value
for ans, count in Counter(answers_all.values()).items():
    print(f'{ans}: {count}')

0: 4
451: 2
No jobs: 54576
999: 115
429: 1
Jobs found!: 1166
404: 614
502: 1


In [25]:
def write_answers(answers, append=False):
    """Write the hiring status of every company to comps_hiring_answers.csv.

    Each row is ``url, hiring`` where hiring is True for "Jobs found!", False
    for "No jobs" and empty (None) for anything else, e.g. an error code.

    Args:
        answers: dict mapping company URL to its scraping answer.
        append: when True, rows are added to the existing file; the header row
            is then written only if the file is missing or empty, so that
            repeated appends do not scatter header rows through the data.
    """
    import os  # local import so the cell does not depend on the import cell

    path = 'comps_hiring_answers.csv'
    hiring_by_answer = {"Jobs found!": True, "No jobs": False}

    has_content = os.path.isfile(path) and os.path.getsize(path) > 0
    write_header = not (append and has_content)

    with open(path, 'a' if append else 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        if write_header:
            writer.writerow(["url", "hiring"])
        for url, answer in answers.items():
            # anything that is not one of the two known answers maps to None
            writer.writerow([url, hiring_by_answer.get(answer)])

In [39]:
write_answers(answers_all)

In [29]:
urls_failed = [url for url in answers_all if answers_all[url] not in ["Jobs found!", "No jobs"]]

In [30]:
len(urls_failed)

737

In [31]:
# second attempt
answers_remaining = {}
print("Companies with jobs found (second attempt, on pages with errors only):")
await get_answers(answers_remaining, urls_failed)

Companies with jobs found (second attempt, on pages with errors only):
1, checked: 241,	 last URL: https://www.linkedin.com/company/deepengineai
2, checked: 336,	 last URL: https://www.linkedin.com/company/cars-com
3, checked: 465,	 last URL: https://www.linkedin.com/company/fern-api
4, checked: 671,	 last URL: https://www.linkedin.com/company/journalize-io


In [40]:
# results of the second attempt replace the earlier error entries
answers_all.update(answers_remaining)
write_answers(answers_all)

# Part II - scraping the jobs data

## Imports and methods

In [47]:
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
from time import time, sleep, strptime
import random
import csv
from datetime import datetime
import json

# Set up Chrome WebDriver
service = Service(ChromeDriverManager().install())
field_size_limit = csv.field_size_limit(10**9)

In [14]:
# These methods are for the selenium webdriver operations

# Gets options for the selenium webdriver, change the proxy to yours or don't use proxy (set set_proxy=False in the scraping code tab)
def get_options(headless=True, set_proxy=True):
    """Build the option objects needed to start the Chrome webdriver.

    Args:
        headless: when True, Chrome is started with --headless and
            --disable-gpu.
        set_proxy: when True, the proxy is passed to Chrome through the
            --proxy-server argument.

    Returns:
        A ``(chrome_options, seleniumwire_options)`` tuple.
    """
    import os  # local import so the cell does not depend on the import cell

    host = "brd.superproxy.io"
    # Credentials can be supplied through the environment.
    # TODO(security): credentials committed to a notebook should be rotated;
    # once they are, drop these literal fallbacks.
    username = os.environ.get('BRIGHTDATA_USERNAME', 'brd-customer-hl_80709a30-zone-datacenter_proxy_vlad')
    password = os.environ.get('BRIGHTDATA_PASSWORD', 'c423aw7kxrym')
    port = 22225
    proxy = f"{username}:{password}@{host}:{port}"

    # NOTE(review): the proxy is put into the selenium-wire options even when
    # set_proxy is False - confirm whether set_proxy=False is meant to disable
    # the proxy completely.
    seleniumwire_options = {'proxy': {'http': f'http://{proxy}', 'https': f'https://{proxy}'}}

    chrome_options = webdriver.ChromeOptions()
    if set_proxy:
        chrome_options.add_argument(f'--proxy-server={proxy}')
    chrome_options.add_argument('--lang=en-US')
    # value 2 for the images content setting - presumably blocks image loading; verify
    chrome_prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", chrome_prefs)

    if headless:
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')

    return chrome_options, seleniumwire_options


# This restarts the selenium webdriver (Chrome), or starts it if it wasn't started before
def _quit_quietly(driver):
    """Close ``driver`` if there is one, ignoring errors from a dead browser."""
    if driver:
        try:
            driver.quit()
        except Exception:
            pass


def restart_webdriver(driver, url, headless=True, set_proxy=True, max_attempts=5, print_errors=True):
    """Close ``driver`` (if any), start a new Chrome webdriver and load ``url``.

    Up to ``max_attempts + 1`` attempts are made. Browsers that were started
    but failed to load the page are closed, except the most recent one.

    Args:
        driver: the webdriver to replace, or None when none was started yet.
        url: the page to load in the new browser.
        headless, set_proxy: passed to get_options().
        max_attempts: number of retries after the first attempt.
        print_errors: print a line for every failed attempt.

    Returns:
        The new webdriver with ``url`` loaded. When every attempt failed, the
        last browser that could at least be started is returned (its page load
        failed), or None when no browser could be started at all.
    """
    _quit_quietly(driver)

    started_but_unloaded = None
    for attempt in range(max_attempts + 1):
        candidate = None
        try:
            chrome_options, seleniumwire_options = get_options(headless=headless, set_proxy=set_proxy)
            candidate = webdriver.Chrome(service=service, seleniumwire_options=seleniumwire_options, options=chrome_options)
            candidate.maximize_window()
            # load the page that was asked for (not whatever a global variable holds)
            candidate.get(url)
        except Exception:
            if print_errors: print(f"Failed to connect, attempt: {attempt + 1}")
            if candidate is not None:
                # keep only the newest started browser, close the previous one
                _quit_quietly(started_but_unloaded)
                started_but_unloaded = candidate
            continue
        _quit_quietly(started_but_unloaded)
        return candidate

    return started_but_unloaded


# Tries to load the page, retrying if got the "join" page or some page error 
# (if got the "join" page - restarts the driver, if page error - refreshes 10 times by default), tries 10 times by default
def load_till_success(driver, url, refreshes=10, max_attempts=10, print_errors=True, headless=True, set_proxy=True):
    """Wait until the current page is neither the "join" page nor an error page.

    The page counts as loaded when neither an element with class
    'authwall-join-form__title' nor one with class 'neterror' is found.

    Args:
        driver: the webdriver whose current page is checked.
        url: the page loaded whenever the webdriver has to be restarted.
        refreshes: maximum number of refreshes per attempt while the error
            page is shown; on the last one the webdriver is restarted.
        max_attempts: the outer loop runs at most max_attempts + 1 times.
        print_errors: print a line for every unsuccessful load.
        headless, set_proxy: passed to restart_webdriver().

    Returns:
        A ``(driver, load_successful)`` tuple; the driver may be a different
        object than the one passed in, because restarts replace it.
    """
    load_successful = False
    attempts = 0
    while not load_successful and attempts <= max_attempts:
        sleep(1)
        try:
            # finding the join form title means the "join" page was served
            join_element = driver.find_element(By.CLASS_NAME, 'authwall-join-form__title')
            if print_errors: print("Load unsuccessful (join page), restarting...")
            driver = restart_webdriver(driver, url, headless=headless, set_proxy=set_proxy)
            load_successful = False
        except exceptions.NoSuchElementException:
            # not the join page: now check for the error page
            for i in range(refreshes):
                try:
                    error_element = driver.find_element(By.CLASS_NAME, 'neterror')
                    if print_errors: print("Load unsuccessful (page error), refreshing...")
                    driver.refresh()
                    sleep(1)
                    load_successful = False
                    if i == refreshes - 1:
                        if print_errors: print("Load unsuccessful (refreshes exeeded), restarting...")
                        driver = restart_webdriver(driver, url, headless=headless, set_proxy=set_proxy)
                        load_successful = False
                except exceptions.NoSuchElementException:
                    # no error element either: the page is considered loaded
                    load_successful = True
                    break
                except Exception:
                    # any other failure: restart and keep going through the refresh loop
                    driver = restart_webdriver(driver, url, headless=headless, set_proxy=set_proxy)
                    load_successful = False
        except Exception:
                # any other failure while checking for the join page: restart the webdriver
                driver = restart_webdriver(driver, url, headless=headless, set_proxy=set_proxy)
                load_successful = False
            
        attempts += 1
        
    if not load_successful:
        if print_errors: print("Reached maximum attempts when trying to load the page")
        
    return driver, load_successful


# Gets text of element that contains some text, after parsing with beautiful soup
def get_element_text(soup, type_str, class_str, decode=False):
    """Return the text of the first ``type_str`` element with class ``class_str``.

    Args:
        soup: the parsed page to search in.
        type_str: tag name of the element.
        class_str: value passed as ``class_`` to ``soup.find``.
        decode: when True return the element's inner markup
            (``decode_contents()``), otherwise its stripped text.

    Returns:
        The text, or None when the element is missing or reading it failed.
    """
    try:
        element = soup.find(type_str, class_=class_str)
        if element is None:
            return None
        if decode:
            return element.decode_contents()
        return element.get_text(strip=True)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit still stop a long scraping run.
        return None


# In addition of scraping the job pages links from the list of jobs, there is additional interesting data we wanted to obtain: benefits and datetime
# This formats the benefits data of all jobs from the company's jobs list
def format_benefits(jobs_links):
    """Normalize the "benefits" text of every job link in place.

    When the text contains one of the known benefit labels, it is replaced by
    the first such label (in the order listed below); otherwise it is left
    as it is. Empty or missing benefits are skipped.
    """
    known_labels = ('Actively Hiring', 'Be an early applicant', 'Medical insurance', 'Paid paternity leave')
    for entry in jobs_links:
        raw_text = entry["benefits"]
        if not raw_text:
            continue
        matched = next((label for label in known_labels if label in raw_text), None)
        if matched is not None:
            entry["benefits"] = matched
                     

# This returns jobs links which datetime (aka post data) starts from data_limit (by default - 1800-01-01, aka no limit)
def choose_fresh_links(jobs_links, date_limit="1800-01-01"):
    """Keep only the job links posted on or after ``date_limit``.

    The list is filtered IN PLACE, so callers that ignore the return value
    still see the filtered list; it is also returned for convenience.

    Args:
        jobs_links: list of dicts with a 'datetime' key in YYYY-MM-DD format;
            a missing/empty value is treated as "1800-01-01".
        date_limit: earliest accepted post date, YYYY-MM-DD. The default
            ("1800-01-01") keeps everything.

    Returns:
        The same ``jobs_links`` list object, after filtering.

    Raises:
        ValueError: when a date does not match the YYYY-MM-DD format.
    """
    limit = strptime(date_limit, "%Y-%m-%d")
    jobs_links[:] = [
        link for link in jobs_links
        if strptime(link['datetime'] or "1800-01-01", "%Y-%m-%d") >= limit
    ]
    return jobs_links

In [25]:
# These methods are for asynchronous jobs scraping using MultiSessionRetriever from Part 1, after the links were obtained from the company's jobs list

# This scrapes the job data after loading the job page
async def get_job_data(content):
    """Extract the job data from the HTML of a downloaded job page.

    Returns:
        A dict with the keys "title", "description", "location", "post_time"
        and "criterias" (a dict of criteria subheader -> text), or 0 (meaning
        error) when the "join" page was served or no description was found.

    NOTE(review): this is a coroutine function although it awaits nothing, so
    calling it yields a coroutine object; append_job_data awaits it. Because
    that object is not an int, MultiSessionRetriever's retry-on-int-result
    check never sees the 0 returned here.
    """
    soup = BeautifulSoup(content, 'html.parser')
    join_btn = soup.find('a', class_="join-form__form-body-submit-button")
    if join_btn:
        return 0

    job_dict = {}
    job_dict["title"] = get_element_text(soup, 'h1',
             'top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title')
    job_dict["description"] = get_element_text(soup, 'div',
             'show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden', decode=True)
    if not job_dict["description"]:
        return 0

    job_dict["location"] = get_element_text(soup, 'span', 'topcard__flavor topcard__flavor--bullet')
    job_dict["post_time"] = get_element_text(soup, 'span', 'posted-time-ago__text topcard__flavor--metadata')

    job_dict["criterias"] = {}
    criteria_list = soup.find('ul', class_='description__job-criteria-list')
    if criteria_list:
        for item in criteria_list.find_all('li', class_='description__job-criteria-item'):
            subheader = item.find('h3', class_='description__job-criteria-subheader')
            criteria = item.find('span', class_='description__job-criteria-text--criteria')
            # skip an incomplete item instead of failing the whole job on it
            if subheader is None or criteria is None:
                continue
            job_dict["criterias"][subheader.text.strip()] = criteria.text.strip()

    return job_dict


# Appends scraped job data to a data_containter - storage of all jobs data of a single company
async def append_job_data(url, data, data_container, lock):
    """Store one scraped job (or its failure) in ``data_container``.

    Args:
        url: the job page URL.
        data: the retrieval result - an int error code, a job dict, or an
            awaitable that resolves to either (a coroutine function used as
            the retrieve function produces the latter).
        data_container: dict with the keys "jobs" (list), "failed" (dict),
            "jobs_links" (list of dicts with "url", "datetime", "benefits")
            and "current_idx".
        lock: asyncio lock guarding ``data_container``.

    An int result is recorded in ``data_container["failed"]`` as
    ``(code, current_idx)``; a job dict gets "url", "datetime" and "benefits"
    added and is appended to ``data_container["jobs"]``.
    """
    import inspect  # local import so the cell does not depend on the import cell

    # Resolve the result BEFORE checking for an error code: the error code may
    # only become visible after awaiting.
    if inspect.isawaitable(data):
        data = await data

    async with lock:
        if type(data) == type(0):
            data_container["failed"][url] = data, data_container["current_idx"]
            return

        link = next((d for d in data_container["jobs_links"] if d.get("url") == url), None)
        data["url"] = url
        # a URL without a matching link entry simply has no metadata
        data["datetime"] = link["datetime"] if link is not None else None
        data["benefits"] = link["benefits"] if link is not None else None
        data_container["jobs"].append(data)
    

# The main jobs data scraping function - scrapes all jobs of a single company, needs the MultiSessionRetriever from part I
async def get_jobs(comp_url, jobs_links, retriever, data_container):
    """Scrape all jobs of one company and append them to scraped_jobs.csv.

    Args:
        comp_url: the company URL, written as the first CSV column.
        jobs_links: list of dicts with a "url" key, one per job page.
        retriever: the MultiSessionRetriever used to download the job pages.
        data_container: dict the scraped jobs are collected in; its "jobs"
            list is written as the second CSV column.

    Writing is retried until it succeeds; after every 6 consecutive failures a
    message is printed and the coroutine pauses for 10 seconds.
    """
    req_timeout = 30
    n_parallel_exit_nodes = 50

    urls = [link["url"] for link in jobs_links]
    await retriever.retrieve(urls, req_timeout, n_parallel_exit_nodes, append_job_data, get_job_data, data_container)

    retries = 0
    while True:
        retries += 1
        try:
            with open('scraped_jobs.csv', 'a', newline='', encoding='utf-8') as csv_file:
                csv.writer(csv_file).writerow([comp_url, data_container["jobs"]])
            break
        except Exception:
            if retries > 5:
                print("Something went wrong when tried to write to scraped_jobs.csv")
                # asyncio.sleep instead of time.sleep: do not block the event loop
                await asyncio.sleep(10)
                retries = 0

## Loading jobs links

In [10]:
# Here we get the companies links that have jobs (the list obtained in the Part 1), or the list of failed-to-scrap companies, to retry scraping

# with open('tech_comps_with_jobs.csv', newline='') as f:
# the company URL is the first column of every row
with open('failed_to_scrap_1.csv', newline='') as f:
    reader = csv.reader(f)
    comp_urls = [row[0] for row in reader]
comp_urls[:5]

['https://www.linkedin.com/company/pctronics-it-solutions',
 'https://www.linkedin.com/company/american-data',
 'https://www.linkedin.com/company/prosource_2',
 'https://www.linkedin.com/company/equitus',
 'https://www.linkedin.com/company/softinnovas']

In [15]:
# last scrapping attempt - for companies in the "failed to scrap" file, ommiting the companies with no jobs (checked twice already)
# DONT RUN IT TOGETHER WITH THE ABOVE CELL
# row layout: <company url>, <fail reason>; companies without a jobs button are left out
with open('failed_to_scrap_2.csv', newline='') as f:
    reader = csv.reader(f)
    comp_urls = [row[0] for row in reader if row[1] != "No jobs button"]
comp_urls

['https://www.linkedin.com/company/radus-software-llc',
 'https://www.linkedin.com/company/bishop-peak-technology',
 'https://www.linkedin.com/company/cars-com',
 'https://www.linkedin.com/company/bluejeans-by-verizon',
 'https://www.linkedin.com/company/teleteachers',
 'https://www.linkedin.com/company/asoft-consulting',
 'https://www.linkedin.com/company/verizon-media',
 'https://www.linkedin.com/company/openlogix-corporation']

In [23]:
# additional companies with jobs (found it when needed to check for some errors)
# DONT RUN IT TOGETHER WITH ONE OF THE 2 ABOVE CELLS
with open('tech_comps_with_jobs_0.csv', newline='') as f:
    reader = csv.reader(f)
    comp_urls_0 = [url[0] for url in reader]


with open('tech_comps_with_jobs.csv', newline='') as f:
    reader = csv.reader(f)
    comp_urls_1 = [url[0] for url in reader]

# keep the companies that are not in the "_0" list; a set makes every
# membership test constant-time instead of a scan of the whole list
already_listed = set(comp_urls_0)
comp_urls = [url for url in comp_urls_1 if url not in already_listed]
print(len(comp_urls))
comp_urls[:5]

91


['https://www.linkedin.com/company/bunnystudio',
 'https://www.linkedin.com/company/asd-global',
 'https://www.linkedin.com/company/adaptive-telehealth',
 'https://www.linkedin.com/company/americloud-telecom-solutions',
 'https://www.linkedin.com/company/atekit']

## Scraping the jobs

In [24]:
# The main jobs scraping cell. Here we iterate over all company links in comp_urls and obtain the jobs list of each company (using selenium), 
# and then scrape all the jobs using MultiSessionRetriever from Part 1, but with different data scraping functions. After scraping each company, 
# we append the data to "scraped_jobs.csv". This file is in the format: <company's url>, <list of jobs data (list of dictionaries, written as its Python string representation)>

total_start = time()
max_attempts = 5     # max retries for each company, after catching some critical exception
start_from = 0       # company number to start from (in case of a previous interruption)
print_errors = True
headless=True        # set to False if you want to see the webdriver browser while scraping (True runs it in the background)
set_proxy = False    # set to True if you want to use the proxy defined in the get_options() method

# variables for additional info
failed = 0
fail_reason = ""

total_scrapped = 0
total_failed = 0

# variables for MultiSessionRetriever scraping (for async scraping of the jobs from the company's jobs list)
switch_ip_every_n_req = 20
loop = asyncio.get_event_loop()
lock = asyncio.Lock()
# NOTE(review): the proxy credentials are hard-coded here - consider loading them from the environment
retriever = MultiSessionRetriever('brd-customer-hl_80709a30-zone-datacenter_proxy_vlad', 'c423aw7kxrym', switch_ip_every_n_req, lock)

# The main loop - For each company: scrap the jobs links list, then asyncronously scrap the jobs data from each link, finally store in the scraped_jobs.scv
for i, comp_url in enumerate(comp_urls[start_from:]):
    print(f"\nCompany {i + start_from + 1}: {comp_url}")
    fail_reason = "Too many attempts"
    attempts = 0
    refreshes = 0
    start = time()
    jobs_links = []
    ok = False
    comp_name = None
    driver = None
    while attempts <= max_attempts:
        # starting the webdriver
        try:
            driver = restart_webdriver(driver, comp_url, headless=headless, set_proxy=set_proxy)
            break
        except Exception as e:
            print(e)
            attempts += 1
            
    # scrap the jobs data from current company
    while not ok and attempts <= max_attempts:
        # check for the "join" popup window, close if exists
        try:
            wait = WebDriverWait(driver, 2)
            close_popup = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'modal__dismiss')))
            close_popup.click()
        except:
            pass

        # get the company name from the company page, (only once in the loop)
        if not comp_name:
            try:
                wait = WebDriverWait(driver, 5)
                comp_name = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'top-card-layout__title'))).text
                print(comp_name)
            except Exception as e:
                try:
                    driver = restart_webdriver(driver, comp_url, headless=headless, set_proxy=set_proxy)
                except:
                    attempts += 1
                    continue
                if print_errors: print(f"No company name, attempt: {attempts + 1}")
                attempts += 1
                continue
        
        # find the "See jobs" button, if not exists - store exit the loop and store the company url in the failed_to_scrap.csv, for future retries        
        try:
            wait = WebDriverWait(driver, 3)
            see_jobs_link = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[contains(@class, "top-card-layout__cta") and contains(text(), "See jobs")]')))
            see_jobs_link.click()
        except Exception as e:
            try:
                driver = restart_webdriver(driver, comp_url, headless=headless, set_proxy=set_proxy)
            except:
                attempts += 1
                continue
            fail_reason = "No jobs button"
            break
        
        # load the jobs list page (retries till succeed or got to max_attempts)
        driver, success = load_till_success(driver, comp_url, print_errors=print_errors, headless=headless)
        if not success:
            attempts += 1
            continue

        # check if there is a jobs list in the jobs page (sometimes not loaded or got some error), if no jobs list, exit the loop and store the company url in failed_to_scrap.csv
        try:
            WebDriverWait(driver, 2).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'h1.core-section-container__main-title.main-title')))
            fail_reason = "No jobs in list"
            break
        except:
            pass

        # get the jobs cards list, try till succeed or till max load_attempts reached
        content_loaded = False
        load_attempts = 0
        while not content_loaded and load_attempts < 1:  
            try:
                wait = WebDriverWait(driver, 2)
                jobs_container = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'jobs-search__results-list')))
                elements = driver.find_elements('class name', 'base-card__full-link')
                content_loaded = True
            except Exception as e:
                driver.refresh()
                load_attempts += 1
                continue

        # if still no jobs list, restart the web driver and retry
        if not content_loaded:
            try:
                driver = restart_webdriver(driver, comp_url, headless=headless, set_proxy=set_proxy)
            except:
                attempts += 1
                continue
            if print_errors: print(f"Failed to load jobs, attempt: {attempts + 1}")
            attempts += 1
            continue

        # scroll down the list (if needed) till all the jobs loaded (this works now, after empiric studying of how to do it)
        wait = WebDriverWait(driver, 0.25)
        scroll_start = time()
        while time() - scroll_start < 120:    # 2 minutes timeout for scrolling down to load additional jobs
            try:
                seen_all = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'see-more-jobs__viewed-all')))
                break
            except:
                pass

            try:
                seen_all = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'infinite-scroller__show-more-button')))
                see_more_btn = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'infinite-scroller__show-more-button')))
                see_more_btn.click()
            except:
                pass
                
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(1)
            driver.execute_script("window.scrollTo(0, 0);")
            
        # get the jobs links, benefits and post date and append to the company's jobs links list
        elements = driver.find_elements(By.XPATH, '//div[contains(@class, "base-card") and contains(@class, "job-search-card")]')
        try:
            for element in elements:
                soup = BeautifulSoup(element.get_attribute('innerHTML'), 'html.parser')
                link = soup.find('a', class_='base-card__full-link').get('href')
                benefits = soup.find('span', class_='job-posting-benefits__text')
                benefits = benefits.text.strip() if benefits else None
                datetime = soup.find('time', class_='job-search-card__listdate')
                datetime = datetime['datetime'] if datetime else None
                job_comp_name = soup.find('a', class_='hidden-nested-link')
                if job_comp_name and comp_name in job_comp_name.text: 
                    jobs_links.append({"url": link, "benefits": benefits, "datetime": datetime})
                    
        except Exception as e:
            print("Failed to obtain job links")
            continue
        print(f"Jobs links obtained: {len(jobs_links)}")
        ok = True

    # at this stage all the company's jobs links obtained, or got some error, even after max attempts
    driver.quit()
    
    if not ok:
        print(f"Could not obtain jobs from: {comp_url}, reason: {fail_reason}, elapsed: {round(time() - start, 1)} sec.")
        failed += 1
        try:
            with open('failed_to_scrap.csv', 'a', newline='', encoding='utf-8') as csv_file:  
                writer = csv.writer(csv_file)
                writer.writerow([comp_url, fail_reason]) 
        except:
            pass
    else:
        # store the jobs links in the file (this is not necessary now, because as we found lately, the links are relevant only for a short time, but we left it for backup or something)
        try:
            with open('comps_jobs_links.csv', 'a', newline='', encoding='utf-8') as csv_file:  
                writer = csv.writer(csv_file)
                writer.writerow([comp_url, json.dumps(jobs_links)])
        except:
            # if failed to store, it's not critical
            pass

        # here we asybcronously scrap the jobs data using the obtained links
        # clean and format the links and meta data
        jobs_links = json.loads(json.dumps(jobs_links))
        format_benefits(jobs_links)
        choose_fresh_links(jobs_links)  # add here date_limit= parameter if you want only the jobs that posted after the date_limit, in format YYYY-MM-DD
        
        # scrap and store to the data_container
        data_container = {"current_idx": i, "jobs": [], "failed": {}, "jobs_links": jobs_links}
        await loop.create_task(get_jobs(comp_url, jobs_links, retriever, data_container))

        # print the output info
        total_scrapped += len(data_container["jobs"])
        total_failed += len(data_container["failed"])
        print(f'Finished scraping: {comp_url}, {len(data_container["jobs"])} out of {len(jobs_links)}, failed: {len(data_container["failed"])}, elapsed: {round(time() - start, 1)} sec.')
        
print(f"\nFinished scraping all. Total scraped: {total_scrapped}, total failed: {total_failed}, time elapsed: {round(time() - total_start, 1)} sec.")


Company 1: https://www.linkedin.com/company/bunnystudio
Bunny Studio
Jobs links obtained: 1
Finished scraping: https://www.linkedin.com/company/bunnystudio, 1 out of 1, failed: 0, elapsed: 17.0 sec.

Company 2: https://www.linkedin.com/company/asd-global
ASD Global
Jobs links obtained: 1
Finished scraping: https://www.linkedin.com/company/asd-global, 1 out of 1, failed: 0, elapsed: 19.4 sec.

Company 3: https://www.linkedin.com/company/adaptive-telehealth
Adaptive Telehealth
Jobs links obtained: 2
Finished scraping: https://www.linkedin.com/company/adaptive-telehealth, 2 out of 2, failed: 0, elapsed: 21.6 sec.

Company 4: https://www.linkedin.com/company/americloud-telecom-solutions
AmeriCloud Telecom Solutions
Could not obtain jobs from: https://www.linkedin.com/company/americloud-telecom-solutions, reason: No jobs button, elapsed: 29.2 sec.

Company 5: https://www.linkedin.com/company/atekit
AtekIT
Jobs links obtained: 1
Finished scraping: https://www.linkedin.com/company/atekit, 1 

# Checking the scraped data

In [5]:
import csv

# Raise the csv module's per-field size cap to 10**9 characters.
# csv.field_size_limit(new_limit) installs the new cap and returns the
# PREVIOUS one, so `field_size_limit` holds the old limit, not 10**9.
# NOTE(review): presumably needed because each row of scraped_jobs.csv keeps
# a whole company's job list in a single field -- confirm against the writer.
field_size_limit = csv.field_size_limit(1_000_000_000)

In [1]:
# Report how many jobs were scraped and for how many companies.
# Each physical line of the file is treated as one company, and every
# occurrence of the substring "datetime" on a line is counted as one job.
# NOTE(review): this is a text heuristic, not a CSV parse -- it assumes one
# line per company and exactly one "datetime" per job; confirm with the writer.
with open("scraped_jobs.csv", encoding="utf-8") as file:
    jobs_counts = list(map(lambda text: text.count("datetime"), file))

print(f"Total jobs scraped: {sum(jobs_counts)}, of {len(jobs_counts)} companies.")

Total jobs scraped: 17707, of 1078 companies.


In [48]:
# Load the scraped data into jobs_dict: first CSV column -> list built from
# evaluating the second column (a Python-literal-looking string).
with open('scraped_jobs.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    # NOTE(review): eval() executes whatever expression the file contains.
    # Only run this on a file you produced yourself; if every row is made of
    # plain literals, ast.literal_eval would be the safe replacement.
    jobs_dict = dict((comp[0], list(eval(comp[1]))) for comp in reader)

In [82]:
description = "{'Job Title and Summary': 'Field Technician: Constructing and building out a network in residential apartment complexes and office buildings in NYC, Northern NJ & Stamford, CT.', 'Company Overview': 'Honest Networks is a fast-growing, venture-backed internet provider focused on providing gigabit internet service at affordable, transparent prices, with exceptional customer service.', 'Job Responsibilities': ['Construct and build out network in residential apartment complexes and office buildings', 'Run fiber and ethernet through building risers', 'Mount and install networking equipment in MDF/IDF closets', 'Set up and align radios on rooftops'], 'Qualifications and Skills': ['1-3 year experience with fiber optic, ethernet cable or electrician/conduit work', 'Ability to run fiber and cat5e/6 cabling', 'Ability to install, terminate, test and troubleshoot network cabling', 'Ability to power and test networking switches and other networking hardware', 'Comfortable using mass transit during the day', 'Comfortable working in high-rises and on rooftops', 'Ability to read, understand and follow work orders & change orders', 'Capable of lifting up to 50 pounds', 'Comfortable working in confined spaces'], 'Required Education': 'Not specified', 'Benefits and Perks': ['Attractive salary ($26-35/hr)', 'Performance bonus potential', 'Stock options', 'Paid-time off (PTO)', 'Medical and dental benefits', 'Metro card reimbursement'], 'Application Process': 'Not specified', 'Contact Information': 'Not specified'}"

In [83]:
import ast

# Turn the dict-literal string into a real dict. ast.literal_eval accepts only
# Python literals (str, list, dict, numbers, ...), so unlike eval() it cannot
# execute code embedded in the text.
description = ast.literal_eval(description)

In [84]:
description

{'Job Title and Summary': 'Field Technician: Constructing and building out a network in residential apartment complexes and office buildings in NYC, Northern NJ & Stamford, CT.',
 'Company Overview': 'Honest Networks is a fast-growing, venture-backed internet provider focused on providing gigabit internet service at affordable, transparent prices, with exceptional customer service.',
 'Job Responsibilities': ['Construct and build out network in residential apartment complexes and office buildings',
  'Run fiber and ethernet through building risers',
  'Mount and install networking equipment in MDF/IDF closets',
  'Set up and align radios on rooftops'],
 'Qualifications and Skills': ['1-3 year experience with fiber optic, ethernet cable or electrician/conduit work',
  'Ability to run fiber and cat5e/6 cabling',
  'Ability to install, terminate, test and troubleshoot network cabling',
  'Ability to power and test networking switches and other networking hardware',
  'Comfortable using ma

In [63]:
# Preview one entry of jobs_dict: print its key (the company URL), render the
# first job's description as Markdown, then dump every field of that job.
index = 144
url, jobs = list(jobs_dict.items())[index]  # the (key, value) pair at position `index`
print(url)
for job in jobs[:1]:  # at most the first job; nothing is shown if the list is empty
    display(Markdown(job['description']))
    for key, value in job.items():
        print(f"\n{key}: {value}")

https://www.linkedin.com/company/waynsys-inc



<p><strong>Job Description </strong></p><ul><li>Facilitate the implementation and support of SAP Logistics, SAP WM</li><li>Map client business requirements, processes, and objectives; develops necessary product modifications to satisfy clients' needs.</li><li>Design, customize, configure, and test the SAP Logistics, WM module.</li><li>Identify gaps, issues, and work around solutions.</li><li>Provide ad-hoc training and user support as required.</li><li>Proactively identify and propose business processes and system enhancement.</li><li>Perform detailed analysis of complex business process requirements and provide appropriate system solutions; identify, interpret, validate, and document customer requirements.</li><li>Facilitate workshops to collect business requirements.</li><li>Handle changes or emergency transport as needed for high priority issues</li></ul><p><br/></p>



title: SAP Logistics Consultant

description: 
<p><strong>Job Description </strong></p><ul><li>Facilitate the implementation and support of SAP Logistics, SAP WM</li><li>Map client business requirements, processes, and objectives; develops necessary product modifications to satisfy clients' needs.</li><li>Design, customize, configure, and test the SAP Logistics, WM module.</li><li>Identify gaps, issues, and work around solutions.</li><li>Provide ad-hoc training and user support as required.</li><li>Proactively identify and propose business processes and system enhancement.</li><li>Perform detailed analysis of complex business process requirements and provide appropriate system solutions; identify, interpret, validate, and document customer requirements.</li><li>Facilitate workshops to collect business requirements.</li><li>Handle changes or emergency transport as needed for high priority issues</li></ul><p><br/></p>


location: Atlanta, GA

post_time: None

criterias: {'Seniority leve

## Description example 1:

C3EL has a great opportunity for a Secret-cleared Network Operations Center [NOC] Technician to provide network services in support of the Ballistic Missile Defense System (BMDS) by monitoring &amp; troubleshooting the long-haul connections that make up the Ground-based Midcourse Defense Communications Network (GCN).<br/><br/>The supported network is essential to the integrated, layered, ballistic defense system (BMDS) used to defend the United States, it's deployed forces, allies, and friends against all ranges of enemy ballistic missiles in all phases of flight.<br/><br/>These connections, using technologies such as TDM &amp; DWDM, can include transport devices such as MSPP, Promina, ODXC, Ciena CoreStream and 6500, ADTRAN, and SafeNet encryptors.<br/><br/>Providing Services over these long-haul connections requires the monitoring and troubleshooting of network devices, such as Juniper Routers. Network monitoring includes the use of programs such as Cisco Prime Optical, Cisco Transport Controller, IBM Netcool, and Sycamore SilvX.<br/><br/><strong>This position involves shift work, which includes nights &amp; weekends.<br/><br/></strong><strong>Duties and Responsibilities may include, but not be limited to:<br/><br/></strong><ul><li>Fielding phone calls from users opening tickets as required</li><li>Responding to queued Remedy tickets</li><li>Interfacing with other DISA technology NOCs</li><li>Employing methodical telecommunications circuit/trunk troubleshooting techniques</li><li>Reaching out to customers for coordination and resolution confirmation<br/><br/></li></ul><strong>Minimum Qualifications:<br/><br/></strong><ul><li>Secret (or higher) clearance</li><li>2+ years working knowledge of Remedy Global Ticket Management System</li><li>Relevant experience and expertise with at least one transport technology, i.e. 
OTN, MSPP, or ATM</li><li>Basic knowledge of IP circuits &amp; routers</li><li>IAT II Certification (Security+ CE, CCNA-Security, etc)<br/><br/></li></ul><strong>Desired Qualifications:<br/><br/></strong><ul><li>The candidate must be able to multi-task and manage/prioritize workload to include monitoring multiple Network Monitoring Systems, fielding customer calls via unclassified and classified phone systems, maintain situational awareness via classified chat rooms, and utilizing Global TMS to document all steps taken to identify and resolve service affecting incidents.</li><li>This role is expected to perform advanced troubleshooting and network analysis to determine points of impact for outages and maintenance actions not coordinated with DISA, in addition to steps necessary to mitigate service impacting outages, degradations, or demand maintenance actions/control seizures. </li><li>Execute alt-route actions on high priority circuits with very short turnaround time to avoid mission failures due to lost link conditions on flight control circuits, loss of PED quality video feeds to PED sites, and loss of critical support services such as secure communications, radar data, and weather data. </li><li>The successful candidate must be customer service oriented and be able to communicate effectively with our mission and government partners to ensure proper assessment of operational/mission impacts and prompt response to priority circuits<br/><br/></li></ul><strong>Education:<br/><br/></strong><ul><li> A minimum of a High School diploma or equivalent is required</li></ul> 

## Description example 2:

<p>Our client has an immediate need for a contract to hire <strong><em>Scheduling Specialist</em></strong> in Dallas, TX. </p><p><br/></p><p><strong>Purpose of Position</strong></p><p>The Scheduling Specialist’s primary role is to provide complete scheduling oversight for all</p><p>internal/external customers with the best overall customer service experience. The</p><p>Scheduling Specialist facilitates all coordination of regulatory and operator training requirements</p><p>between the company and its customers and clients.</p><p><br/></p><p><strong>Tasks and Responsibilities</strong></p><p>• Manages tasks such as scheduling customers/clients, sending confirmations, creating new</p><p>customer/client accounts, updating client information.</p><p>• Manage all aspects of client training schedules and coordinate with customers on specific</p><p>training needs and requirements that may impact schedule modifications.</p><p>• Understands regulatory requirements and verifies training objectives.</p><p>• Acts as back-up focal for TSA tracking for incoming clients.</p><p>• Perform Export Compliance checks as needed.</p><p>• Verify/collect pre-training documents.</p><p>• Call customer/clients to confirm attendance and training objectives.</p><p>• Manage client retention program.</p><p>• Act as primary point of contact for assigned program(s) by Interacting and communicating</p><p>with internal and external customers as well as regulatory agencies.</p><p>• Familiarity with product and services, when possible, provide customers with</p><p>additional training available to enhance the overall experience and ultimately generate more</p><p>sales and revenue.</p><p>• Review and understand country specific guidance and/or documentations and provide the</p><p>most up to date information to ensure clients/customers and instructors are adhering to all</p><p>regulatory requirements.</p><p>• Responsible to assess, organize, plan, and assign resources to customer training 
events,</p><p>instructor training and qualifications.</p><p><br/></p><p><strong>Minimum Experience</strong></p><p>Bachelor’s degree in Business or Aviation Management preferred or three (3) years’ related</p><p>experience and/or training; or equivalent combination of education and experience; equivalency</p><p>years’ experience substitution must be in related field.</p><p>• Achieved a master level of all responsibilities of Scheduling Specialist, Associate.</p><p>• One (1) to two (2) years of aviation experience preferred.</p><p>• Requires knowledge of aviation industry terminology, FARs, and prerequisites for</p><p>International courses, as specified by FAA/NAA regulations.</p><p><br/></p><p><strong>Knowledge, Skills, Abilities</strong></p><p>• Excellent customer service skills.</p><p>• Knowledge of aviation terminology as specified by FAA/NAA.</p><p>• Knowledge of basic scheduling concepts and/or experience with scheduling software.</p><p>• Detail oriented with excellent organization and time management skills.</p><p>• Excellent verbal and written communication skills.</p><p>• Ability to interact with various levels of management in a professional manner.</p><p>• Ability to adapt to changes rapidly and perform in a fast-paced work environment.</p><p>• Results-oriented with high drive to achieve objectives and standards with little supervision or</p><p>guidance.</p><p>• Customer/client oriented and ability to adapt/respond to different types of personalities.</p><p>• Fluency in English, through both verbal and written communications; able to speak,</p><p>understand, read, and write.</p><p>• General knowledge of the following software: MS Office Suite, TMS Systems, CRM.</p><p></p>

<p>Criteria drives talent success for over 4,500 organizations around the world. Through innovative assessments, video interviewing, and talent management tools, we help companies build more engaged workforces, improve retention, generate more revenue, and increase productivity. </p><p>  </p><p>We've been featured on the Inc. 5000’s list of fastest-growing private companies in the U.S. for the last seven years and have been recognized as a Best Place to Work by Inc. and Built in LA. Most importantly, people are at the heart of everything we do. Our mission is to help companies and job candidates connect to do fulfilling, meaningful work together. </p><p><br/></p><p>PLEASE use this link to apply: https://criteriacorp.bamboohr.com/careers/66?source=aWQ9Mw%3D%3D</p><p><span> </span></p><p><strong>POSITION SUMMARY</strong>  </p><p><span>As an Enterprise Sales Executive at Criteria, you’ll drive growth by winning new customers, helping them embrace a better, more equitable way of identifying, interviewing and developing their talent. You’ll unlock new value throughout our customers’ business by leveraging your consultative sales experience and securing complex deals. In partnership with others in the revenue, product and marketing teams, you’ll help refine our value proposition, product lineup and approach to growing with existing customers.</span></p><p><span> </span></p><p><span>The ideal candidate possesses a deep empathy for customer problems and a passion for engaging across roles and organizations of prospective customers. 
You love the journey of discovering how we might help them succeed at recruiting and developing a diverse set of employees.</span></p><p><span> </span></p><p><strong>REQUIRED KNOWLEDGE/SKILLS/ABILITIES</strong><span> </span></p><p>To be successful in this role the incumbent will demonstrate the following: </p><ul><li>Utilize insight and consultative selling techniques to understand the challenges faced by enterprise level accounts with 1,000+ employees, proposing innovative solutions leveraging Criteria Corp's platform.</li><li>Engage with industry influencers and buyers to stay informed about market trends, pressures, and challenges, adapting sales strategies accordingly.</li><li>Coach customer stakeholders and build consensus for Criteria Corp's solutions within organizations.</li><li>Employ value and consultative selling to identify customer needs, develop value-added propositions, and generate complex proposal and pricing structures.</li><li>Collaboratively and independently develop strategies to overcome deal-level challenges.</li><li>Map out large accounts, establishing relationships with decision-makers at multiple levels.</li><li>Implement a sales strategy for the territory to meet revenue and profit objectives through new business development.</li><li>Leverage internal teams such as product management, marketing, and engineering to address prospect needs collaboratively</li><li>Navigate through layers and barriers to reach multiple decision-makers and departments within an account</li><li>Demonstrate full sales cycle skills, from research and discovery to software demonstrations, negotiation, and pricin</li><li>Regularly update the CRM system with the latest customer information and use customer intelligence for account planning purposes.</li></ul><p><span> </span></p><p><strong>RESPONSIBILITIES</strong><span> </span></p><p>The primary responsibilities of this role include:  </p><ul><li>8+ years sales experience, preferably delivering B2B SaaS solutions to 
complex business problems in large enterprises, including selling through the C-suite at Fortune 500 companies.</li><li>Track record of managing complex sales cycles and securing strategic deals by understanding customer problems and crafting tailored solutions, leading to annual success in exceeding quota targets of $1M+.</li><li>Proven success of effectively generating 50%+ of a sales pipeline via outbound strategic and defined sales methodologies and processes.</li><li>Demonstrated success in identifying, growing and nurturing relationships at multiple levels within a company, and building consensus amongst disparate stakeholders.</li><li>Analytical mindset with the ability to interpret data, identify trends and make data-driven decisions to optimize sales performance.</li><li>Proficiency in CRM software (e.g., Salesforce, Churn Zero, Gong) and other sales tools for pipeline management, reporting and forecasting.</li><li>Strong organizational skills and the ability to manage multiple priorities in a dynamic, high-growth company environment.</li><li>Team player who enjoys helping others hone their craft by openly sharing their own successes and failures.</li><li>Willingness to travel 10-20% of the time as needed to meet with customers and attend industry events.</li></ul><p><br/></p>