Allow exact matches and create greater abstraction of the base scraper class. #114
```diff
@@ -3,14 +3,15 @@
 """
 import random
 from abc import ABC, abstractmethod
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from multiprocessing import Lock, Manager
 from time import sleep
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 from bs4 import BeautifulSoup
-from requests import Session
+from requests import Session, Response
 from requests.adapters import HTTPAdapter
+from urllib.parse import urlencode
 from tqdm import tqdm
 from urllib3.util import Retry
```
```diff
@@ -52,6 +53,10 @@ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
         self.job_filter = job_filter
         self.session = session
         self.config = config
+        self.query = ' '.join(config.search_config.keywords)
+        # If we want an exact match, wrap the query in quotes.
+        if self.config.search_config.exact_result:
+            self.query = f'"{self.query}"'
         if self.headers:
             self.session.headers.update(self.headers)
```
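For context, the quoting above changes how the query is URI-encoded downstream when the search URL is built. A minimal sketch of the effect, with invented keywords and an assumed `q` parameter name:

```python
from urllib.parse import urlencode

# Hypothetical keywords, standing in for config.search_config.keywords
keywords = ['python', 'developer']
query = ' '.join(keywords)      # -> python developer
exact_result = True             # i.e. config.search_config.exact_result

if exact_result:
    query = f'"{query}"'        # -> "python developer"

# The 'q' parameter name is an assumption; providers differ.
print(urlencode({'q': query}))  # -> q=%22python+developer%22
```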
```diff
@@ -193,7 +198,7 @@ def scrape(self) -> Dict[str, Job]:
         # Get a list of job soups from the initial search results page
         # These won't contain enough information to do more than initialize Job
         try:
-            job_soups = self.get_job_soups_from_search_result_listings()
+            job_soups = self.get_job_soups()
         except Exception as err:
             raise ValueError(
                 "Unable to extract jobs from initial search result page:\n\t"
```
```diff
@@ -351,18 +356,6 @@ def scrape_job(self, job_soup: BeautifulSoup, delay: float,
         return job
     # pylint: enable=no-member

-    @abstractmethod
-    def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
-        """Scrapes a job provider's response to a search query where we are
-        shown many job listings at once.
-
-        NOTE: the soups list returned by this method should contain enough
-        information to set your self.min_required_job_fields with get()
-
-        Returns:
-            List[BeautifulSoup]: list of jobs soups we can use to make a Job
-        """
-
     @abstractmethod
     def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
         """Get a single job attribute from a soup object by JobField
```
```diff
@@ -423,6 +416,132 @@ def _validate_get_set(self) -> None:
                 [field.name for field in excluded_fields]
             )

+    def get_job_soups(self) -> List[BeautifulSoup]:
+        """Scrapes raw data from a job source into a list of job-soups
+
+        Returns:
+            List[BeautifulSoup]: list of job soups we can use to make Job init
+        """
+        n_pages = self._get_n_pages()
+
+        # Init dict of job soups, keyed by job id
+        job_soup_dict = {}  # type: Dict[str, BeautifulSoup]
+
+        # Init threads & futures list FIXME: we should probably delay here too
+        threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)
+        try:
+            # Scrape soups for all the result pages containing many job listings
+            futures = []
+            for page in range(1, n_pages + 1):
+                futures.append(
+                    threads.submit(
+                        self._get_job_soups_page, page, job_soup_dict
+                    )
+                )
+
+            # Wait for all scrape jobs to finish
+            wait(futures)
+        finally:
+            threads.shutdown()
+
+        return list(job_soup_dict.values())
```

> **Review comment:** Naming here is also a bit confusing, perhaps we can call it …
>
> **Reply:** Agree with the naming. It's a bit confusing as of now. Will think about more consistent and clearer naming.
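A self-contained sketch of the fan-out pattern used above: one task per page, all writing into a shared dict keyed by job id, then `wait()` blocks until every page is done. The function name, page count, and job counts below are invented:

```python
from concurrent.futures import ThreadPoolExecutor, wait

def fetch_page(page: int, results: dict) -> None:
    """Toy stand-in for _get_job_soups_page: write fake 'soups' by job id."""
    for i in range(3):  # pretend each page lists 3 jobs
        job_id = f"job-{page}-{i}"
        if job_id not in results:  # skip ids already seen on another page
            results[job_id] = f"<soup for {job_id}>"

results = {}
executor = ThreadPoolExecutor(max_workers=4)
try:
    futures = [executor.submit(fetch_page, page, results)
               for page in range(1, 6)]  # pages 1..5
    wait(futures)  # block until every page task has finished
finally:
    executor.shutdown()

print(len(results))  # -> 15 unique entries
```

In CPython, a single dict key assignment is protected by the GIL, which is why the shared dict gets by without an explicit lock; the check-then-set is not atomic, but the worst case is writing an equivalent soup twice.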
```diff
+    def _get_n_pages(self, max_pages: Optional[int] = None) -> int:
+        """Calculates the number of pages of job listings to be scraped.
+
+        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs
+
+        Args:
+            max_pages: the maximum number of pages to be scraped.
+        Returns:
+            The number of pages to be scraped.
+        """
+        # Get the html data, initialize bs4 with lxml
+        first_soup = self._get_search_page_soup()
+        num_res, n_pages = self._extract_pages_and_total_listings(first_soup)
+
+        self.logger.debug(
+            f"Found {num_res} job postings resulting in {n_pages} pages"
+        )
+
+        if not max_pages:
+            return n_pages
+        elif n_pages < max_pages:
+            return n_pages
+        else:
+            return max_pages
```

> **Review comment:** I would prefer …
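Worked numbers for the docstring's example: the `if/elif/else` above reduces to a guarded `min`.

```python
import math

num_res = 230             # listings reported by the provider
results_per_page = 50
n_pages = math.ceil(num_res / results_per_page)  # ceil(4.6) -> 5

max_pages = 3             # optional cap; None means "no cap"
print(min(n_pages, max_pages) if max_pages else n_pages)  # -> 3
```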
```diff
+    @abstractmethod
+    def _extract_pages_and_total_listings(self, soup: BeautifulSoup) -> Tuple[int, int]:
+        """Method to extract the total number of listings and pages."""
```

> **Review comment:** All abstract methods should have a name without the …. Additionally, all the stubs should have a detailed docstring explaining the expected implementation.
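By way of illustration, here is a hypothetical override for a provider that reports its hit count in a counter element; the selector, the text format, and the page size are all invented, not any real provider's markup:

```python
import math
import re
from typing import Tuple
from bs4 import BeautifulSoup

RESULTS_PER_PAGE = 50  # invented page size for this example provider

def _extract_pages_and_total_listings(self, soup: BeautifulSoup) -> Tuple[int, int]:
    """Parse a hypothetical 'Showing 1 to 50 of 230 jobs' counter."""
    count_tag = soup.find('div', id='searchCount')  # made-up selector
    num_res = int(re.sub(r'[^0-9]', '', count_tag.text.split('of')[-1]))
    n_pages = math.ceil(num_res / RESULTS_PER_PAGE)
    return num_res, n_pages
```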
```diff
+    def _get_job_soups_page(self, page: int,
+                            job_soup_dict: Dict[str, BeautifulSoup]) -> None:
+        """Scrapes the indeed page for a list of job soups
+        NOTE: modifies the job_soup_dict in-place
+        NOTE: Indeed's remoteness filter sucks, and we will always see a mix.
+        ... need to add some kind of filtering for this!
+        """
+        # Get the soup for the page
+        r_soup = self._get_search_page_soup(page=page)
+
+        # Add new job listings, skipping any job id we have already seen
+        for job_soup in self._parse_job_listings_to_bs4(r_soup):
+            job_id = self.get(JobField.KEY_ID, job_soup)
+            if job_id not in job_soup_dict.keys():
+                job_soup_dict[job_id] = job_soup
```

> **Review comment:** I'm confused about the difference between this and the above.
>
> **Review comment:** Also, the docstring here refers to Indeed, but this is the base scraper.
```diff
+    def _get_search_page(self, method='get', page: int = 1) -> Response:
+        """Return the response of the initial search request
+
+        Args:
+            method: either 'get' or 'post'
+            page: which page to select (if possible)
+        Returns:
+            a valid Response.
+        """
+        search_stem_url = self._get_search_stem_url()
+        search_args = self._get_search_args()
+
+        # Append the page query (provider-dependent)
+        if page > 1:
+            pg_name, pg_val = self._get_page_query(page)
+            search_args[pg_name] = pg_val
+
+        if method == 'get':
+            # URI-encode the query arguments into the url
+            url = f"{search_stem_url}?{urlencode(search_args)}"
+            return self.session.get(url)
+        elif method == 'post':
+            return self.session.post(search_stem_url, data=search_args)
+        else:
+            raise ValueError(f"Method should be either 'post' or 'get', not {method}")
```
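To make the GET branch concrete, here is what the URL assembly produces for some invented hook return values; the stem url, argument names, and the 50-per-page offset are all assumptions:

```python
from urllib.parse import urlencode

search_stem_url = 'https://jobs.example.com/search'        # _get_search_stem_url()
search_args = {'q': '"python developer"', 'l': 'Toronto'}  # _get_search_args()

page = 3
if page > 1:
    pg_name, pg_val = 'start', str((page - 1) * 50)        # _get_page_query(3)
    search_args[pg_name] = pg_val

url = f"{search_stem_url}?{urlencode(search_args)}"
print(url)
# -> https://jobs.example.com/search?q=%22python+developer%22&l=Toronto&start=100
```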
```diff
+    def _get_search_page_soup(self, method='get', page: int = 1) -> BeautifulSoup:
+        """Wrapper around _get_search_page to obtain the response as a soup."""
+        # Get the response html
+        response = self._get_search_page(method, page)
+
+        # Log the url we scraped
+        self.logger.info(f"Scraped from url: {response.url}")
+
+        return BeautifulSoup(response.text, self.config.bs4_parser)
```
```diff
+    @abstractmethod
+    def _parse_job_listings_to_bs4(self, page_soup: BeautifulSoup
+                                   ) -> List[BeautifulSoup]:
+        """Parse a page of job listings HTML text into job soups."""
+
+    @abstractmethod
+    def _get_search_stem_url(self) -> str:
+        """Get the search stem url for the initial search."""
+
+    @abstractmethod
+    def _get_search_args(self) -> Dict[str, str]:
+        """Get all arguments used for the search query."""
+
+    @abstractmethod
+    def _get_page_query(self, page: int) -> Tuple[str, str]:
+        """Return the page query parameter and value for a specific provider."""
```
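To show how the four hooks fit together, a sketch of a subclass for an invented provider, omitting the remaining abstract members for brevity; every url, selector, attribute name, and parameter below is illustrative, not a real provider's API:

```python
from typing import Dict, List, Tuple
from bs4 import BeautifulSoup

class ExampleProviderScraper(BaseScraper):
    """Hypothetical provider wired into the new abstract hooks."""

    def _parse_job_listings_to_bs4(self,
                                   page_soup: BeautifulSoup) -> List[BeautifulSoup]:
        # Made-up listing-card selector for this example provider
        return page_soup.find_all('div', attrs={'class': 'job-card'})

    def _get_search_stem_url(self) -> str:
        return 'https://jobs.example.com/search'

    def _get_search_args(self) -> Dict[str, str]:
        # self.query already carries the quotes when exact_result is set
        return {'q': self.query, 'l': self.config.search_config.city}

    def _get_page_query(self, page: int) -> Tuple[str, str]:
        # 0-indexed result offset, assuming 50 listings per page
        return 'start', str((page - 1) * 50)
```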
```diff
+
+
 # Just some basic localized scrapers, you can inherit these to set the locale.
 class BaseUSAEngScraper(BaseScraper):
```
> **Review comment:** While I am in favour of implementing abstract classes to get job pages -> listings -> soups, I think we should put this workflow into its own class, such as `BaseMultiPageScraper`, so that we can write scrapers for static web pages that only have a single page (sort of like Monster). I think this way we can handle the single-page, scroll-type job sites with a `BaseSinglePageScraper`.
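For what the reviewer describes, a rough sketch of how the hierarchy could split; only the two class names come from the comment, everything else is guesswork:

```python
from abc import abstractmethod
from typing import List
from bs4 import BeautifulSoup

class BaseMultiPageScraper(BaseScraper):
    """Providers that paginate search results (the workflow added in this PR)."""

    def get_job_soups(self) -> List[BeautifulSoup]:
        # Inherit the page-count + thread fan-out implementation from this PR
        return super().get_job_soups()

class BaseSinglePageScraper(BaseScraper):
    """Providers that serve all listings on one (possibly infinite-scroll) page."""

    @abstractmethod
    def get_job_soups(self) -> List[BeautifulSoup]:
        """Scrape the single results page into job soups."""
```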