
Allow exact matches and create greater abstraction of the base scraper class. #114

Closed
wants to merge 16 commits
151 changes: 135 additions & 16 deletions jobfunnel/backend/scrapers/base.py
@@ -3,14 +3,15 @@
"""
import random
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from multiprocessing import Lock, Manager
from time import sleep
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
from requests import Session
from requests import Session, Response
from requests.adapters import HTTPAdapter
from urllib.parse import urlencode
from tqdm import tqdm
from urllib3.util import Retry

@@ -52,6 +53,10 @@ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
self.job_filter = job_filter
self.session = session
self.config = config
self.query = ' '.join(config.search_config.keywords)
# if exact_result is requested, wrap the query in quotes to force an exact match
if self.config.search_config.exact_result:
self.query = f'"{self.query}"'
if self.headers:
self.session.headers.update(self.headers)

Expand Down Expand Up @@ -193,7 +198,7 @@ def scrape(self) -> Dict[str, Job]:
# Get a list of job soups from the initial search results page
# These won't contain enough information to do more than initialize Job
try:
job_soups = self.get_job_soups_from_search_result_listings()
job_soups = self.get_job_soups()
except Exception as err:
raise ValueError(
"Unable to extract jobs from initial search result page:\n\t"
@@ -351,18 +356,6 @@ def scrape_job(self, job_soup: BeautifulSoup, delay: float,
return job
# pylint: enable=no-member

@abstractmethod
def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
"""Scrapes a job provider's response to a search query where we are
shown many job listings at once.

NOTE: the soups list returned by this method should contain enough
information to set your self.min_required_job_fields with get()

Returns:
List[BeautifulSoup]: list of jobs soups we can use to make a Job
"""

Owner:
While I am in favour of implementing abstract classes to get job pages -> listings -> soups, I think we should put this workflow into its own class such as BaseMultiPageScraper, so that we can write scrapers for static web pages that only have a single page (sort of like Monster).

I think this way we can do the single-page scroll type of job sites as a BaseSinglePageScraper.
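
A rough sketch of how that split might look, reusing helpers introduced further down in this diff; the class bodies below are hypothetical, only the two class names come from the comment above:

class BaseMultiPageScraper(BaseScraper):
    """Scraper for providers that paginate their search results."""

    def get_job_soups(self) -> List[BeautifulSoup]:
        # walk every result page and collect the per-listing soups
        soups = []  # type: List[BeautifulSoup]
        for page in range(1, self._get_n_pages() + 1):
            page_soup = self._get_search_page_soup(page=page)
            soups.extend(self._parse_job_listings_to_bs4(page_soup))
        return soups


class BaseSinglePageScraper(BaseScraper):
    """Scraper for providers that show every listing on one page."""

    def get_job_soups(self) -> List[BeautifulSoup]:
        # a single request is enough; there is no pagination to walk
        return self._parse_job_listings_to_bs4(self._get_search_page_soup())

The per-page hooks (_get_search_page_soup, _parse_job_listings_to_bs4) would then live on whichever base class actually needs them.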

@abstractmethod
def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
"""Get a single job attribute from a soup object by JobField
@@ -423,6 +416,132 @@ def _validate_get_set(self) -> None:
[field.name for field in excluded_fields]
)

def get_job_soups(self) -> List[BeautifulSoup]:
Owner:
Naming here is also a bit confusing; perhaps we can call it get_job_listings_as_soup? The naming somewhat conflicts with _get_job_soups_page below.

Collaborator (Author):
Agreed, the naming is a bit confusing as of now. Will think about more consistent and clearer naming.

"""Scrapes raw data from a job source into a list of job-soups

Returns:
List[BeautifulSoup]: list of jobs soups we can use to make Job init
"""
n_pages = self._get_n_pages()

# Init dict of job soups, keyed by job id
job_soup_dict = {} # type: Dict[str, BeautifulSoup]

# Init threads & futures list FIXME: we should probably delay here too
threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)
try:
# Scrape soups for all the result pages containing many job listings
futures = []
for page in range(1, n_pages+1):
futures.append(
threads.submit(
self._get_job_soups_page, page, job_soup_dict
)
)

# Wait for all scrape jobs to finish
wait(futures)
finally:
threads.shutdown()

return list(job_soup_dict.values())
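
As a side note, the wait() plus shutdown() pair above behaves the same as using the executor as a context manager, since Executor.__exit__ calls shutdown(wait=True); a small sketch of that alternative, not part of this PR:

with ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS) as threads:
    # submit one scrape task per result page; the block exits only once
    # every submitted future has completed
    for page in range(1, n_pages + 1):
        threads.submit(self._get_job_soups_page, page, job_soup_dict)
return list(job_soup_dict.values())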

def _get_n_pages(self, max_pages: Optional[int] = None) -> int:
Owner:
I would prefer get_num_result_pages

"""Calculates the number of pages of job listings to be scraped.

i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs

Args:
max_pages: the maximum number of pages to be scraped.
Returns:
The number of pages to be scraped.
"""
# Get the html data and initialize bs4 with the configured parser
first_soup = self._get_search_page_soup()
num_res, n_pages = self._extract_pages_and_total_listings(first_soup)

self.logger.debug(f"Found {num_res} job postings resulting in {n_pages} pages")

if not max_pages:
return n_pages
elif n_pages < max_pages:
return n_pages
else:
return max_pages

@abstractmethod
def _extract_pages_and_total_listings(self, soup: BeautifulSoup) -> Tuple[int, int]:
Owner:
All abstract methods should have a name without the leading underscore; I had added that to indicate that those methods were private to the specific scraper class.

Additionally, all the stubs should have a detailed docstring explaining the expected implementation.

"""Method to extract the total number of listings and pages."""

def _get_job_soups_page(self, page: int,
Owner:
I'm confused about the difference between this and get_job_soups above.

Owner:
Also, the docstring here refers to Indeed, but this is the base scraper.

job_soup_dict: Dict[str, BeautifulSoup]) -> None:
"""Scrapes the indeed page for a list of job soups
NOTE: modifies the job_soup_dict in-place
NOTE: Indeed's remoteness filter sucks, and we will always see a mix.
... need to add some kind of filtering for this!
"""
# get the soup for the page
r_soup = self._get_search_page_soup(page=page)

# add (or overwrite) new job listings
for job_soup in self._parse_job_listings_to_bs4(r_soup):
job_id = self.get(JobField.KEY_ID, job_soup)
if job_id not in job_soup_dict:
job_soup_dict[job_id] = job_soup

def _get_search_page(self, method='get', page: int = 1) -> Response:
"""Return the session of the initial search

Args:
method: either GET or POST
page: which page to select (if possible)
Returns:
a valid session.
"""
search_stem_url = self._get_search_stem_url()
search_args = self._get_search_args()

# append the page query (provider dependent)
if page > 1:
pg_name, pg_val = self._get_page_query(page)
search_args[pg_name] = pg_val

if method == 'get':
# URL-encode the query arguments
url = f"{search_stem_url}?{urlencode(search_args)}"
return self.session.get(url)
elif method == 'post':
return self.session.post(search_stem_url, data=search_args)
else:
raise ValueError(f"Method should be either 'post' or 'get', not {method}")
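
A worked example of the 'get' branch; the stem URL and parameter names are hypothetical:

from urllib.parse import urlencode

search_stem_url = "https://www.example-jobs.com/search"   # hypothetical provider URL
search_args = {'q': '"python developer"', 'l': 'Toronto'}  # exact-match query stays quoted
url = f"{search_stem_url}?{urlencode(search_args)}"
# -> https://www.example-jobs.com/search?q=%22python+developer%22&l=Toronto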

def _get_search_page_soup(self, method='get', page: int = 1) -> BeautifulSoup:
"""Wrapper around get_search_page to obtain response in soup form."""
# get response html
response_html = self._get_search_page(method, page)

# log url
self.logger.info(f"Scraped from url: {response_html.url}")

return BeautifulSoup(response_html.text, self.config.bs4_parser)

@abstractmethod
def _parse_job_listings_to_bs4(self, page_soup: BeautifulSoup
) -> List[BeautifulSoup]:
"""Parse a page of job listings HTML text into job soups."""

@abstractmethod
def _get_search_stem_url(self) -> str:
"""Get the search stem url for initial search."""

@abstractmethod
def _get_search_args(self) -> Dict[str, str]:
"""Get all arguments used for the search query."""

@abstractmethod
def _get_page_query(self, page: int) -> Tuple[str, str]:
"""Return query parameter and value for specific provider."""


# Just some basic localized scrapers; you can inherit these to set the locale.
class BaseUSAEngScraper(BaseScraper):