Allow exact matches and create greater abstraction of the base scraper class. #114
```diff
@@ -3,14 +3,15 @@
 """
 import random
 from abc import ABC, abstractmethod
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from multiprocessing import Lock, Manager
 from time import sleep
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 from bs4 import BeautifulSoup
-from requests import Session
+from requests import Session, Response
 from requests.adapters import HTTPAdapter
+from urllib.parse import urlencode
 from tqdm import tqdm
 from urllib3.util import Retry
```
```diff
@@ -52,6 +53,10 @@ def __init__(self, session: Session, config: 'JobFunnelConfigManager',
         self.job_filter = job_filter
         self.session = session
         self.config = config
+        self.query = ' '.join(config.search_config.keywords)
+        # If we want an exact match, wrap the query in quotes.
+        if self.config.search_config.exact_result:
+            self.query = f'"{self.query}"'
         if self.headers:
             self.session.headers.update(self.headers)
```
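For context, the quoting above changes how the query is URI-encoded downstream when the search URL is built. A minimal sketch of the effect, with invented keywords and an assumed `q` parameter name:

```python
from urllib.parse import urlencode

# Hypothetical keywords, standing in for config.search_config.keywords
keywords = ['python', 'developer']
query = ' '.join(keywords)      # -> python developer
exact_result = True             # i.e. config.search_config.exact_result

if exact_result:
    query = f'"{query}"'        # -> "python developer"

# The 'q' parameter name is an assumption; providers differ.
print(urlencode({'q': query}))  # -> q=%22python+developer%22
```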
```diff
@@ -193,7 +198,7 @@ def scrape(self) -> Dict[str, Job]:
         # Get a list of job soups from the initial search results page
         # These won't contain enough information to do more than initialize Job
         try:
-            job_soups = self.get_job_soups_from_search_result_listings()
+            job_soups = self.get_job_soups()
         except Exception as err:
             raise ValueError(
                 "Unable to extract jobs from initial search result page:\n\t"
```
```diff
@@ -351,18 +356,6 @@ def scrape_job(self, job_soup: BeautifulSoup, delay: float,
         return job
     # pylint: enable=no-member

-    @abstractmethod
-    def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
-        """Scrapes a job provider's response to a search query where we are
-        shown many job listings at once.
-
-        NOTE: the soups list returned by this method should contain enough
-        information to set your self.min_required_job_fields with get()
-
-        Returns:
-            List[BeautifulSoup]: list of jobs soups we can use to make a Job
-        """
-
     @abstractmethod
     def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
         """Get a single job attribute from a soup object by JobField
```
```diff
@@ -423,6 +416,132 @@ def _validate_get_set(self) -> None:
                 [field.name for field in excluded_fields]
             )

+    def get_job_soups(self) -> List[BeautifulSoup]:
+        """Scrapes raw data from a job source into a list of job-soups
+
+        Returns:
+            List[BeautifulSoup]: list of job soups we can use to make Job init
+        """
+        n_pages = self._get_n_pages()
+
+        # Init dict of job soups, keyed by job id
+        job_soup_dict = {}  # type: Dict[str, BeautifulSoup]
+
+        # Init threads & futures list FIXME: we should probably delay here too
+        threads = ThreadPoolExecutor(max_workers=MAX_CPU_WORKERS)
+        try:
+            # Scrape soups for all the result pages containing many job listings
+            futures = []
+            for page in range(1, n_pages + 1):
+                futures.append(
+                    threads.submit(
+                        self._get_job_soups_page, page, job_soup_dict
+                    )
+                )
+
+            # Wait for all scrape jobs to finish
+            wait(futures)
+        finally:
+            threads.shutdown()
+
+        return list(job_soup_dict.values())
```

> **Review comment:** Naming here is also a bit confusing, perhaps we can call it …
>
> **Reply:** Agree with the naming. It's a bit confusing as of now. Will think about more consistent and clearer naming.
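A self-contained sketch of the fan-out pattern used above: one task per page, all writing into a shared dict keyed by job id, then `wait()` blocks until every page is done. The function name, page count, and job counts below are invented:

```python
from concurrent.futures import ThreadPoolExecutor, wait

def fetch_page(page: int, results: dict) -> None:
    """Toy stand-in for _get_job_soups_page: write fake 'soups' by job id."""
    for i in range(3):  # pretend each page lists 3 jobs
        job_id = f"job-{page}-{i}"
        if job_id not in results:  # skip ids already seen on another page
            results[job_id] = f"<soup for {job_id}>"

results = {}
executor = ThreadPoolExecutor(max_workers=4)
try:
    futures = [executor.submit(fetch_page, page, results)
               for page in range(1, 6)]  # pages 1..5
    wait(futures)  # block until every page task has finished
finally:
    executor.shutdown()

print(len(results))  # -> 15 unique entries
```

In CPython, a single dict key assignment is protected by the GIL, which is why the shared dict gets by without an explicit lock; the check-then-set is not atomic, but the worst case is writing an equivalent soup twice.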
```diff
+    def _get_n_pages(self, max_pages: Optional[int] = None) -> int:
+        """Calculates the number of pages of job listings to be scraped.
+
+        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs
+
+        Args:
+            max_pages: the maximum number of pages to be scraped.
+        Returns:
+            The number of pages to be scraped.
+        """
+        # Get the html data, initialize bs4 with lxml
+        first_soup = self._get_search_page_soup()
+        num_res, n_pages = self._extract_pages_and_total_listings(first_soup)
+
+        self.logger.debug(
+            f"Found {num_res} job postings resulting in {n_pages} pages"
+        )
+
+        if not max_pages:
+            return n_pages
+        elif n_pages < max_pages:
+            return n_pages
+        else:
+            return max_pages
```

> **Review comment:** I would prefer …
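Worked numbers for the docstring's example: the `if/elif/else` above reduces to a guarded `min`.

```python
import math

num_res = 230             # listings reported by the provider
results_per_page = 50
n_pages = math.ceil(num_res / results_per_page)  # ceil(4.6) -> 5

max_pages = 3             # optional cap; None means "no cap"
print(min(n_pages, max_pages) if max_pages else n_pages)  # -> 3
```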
```diff
+    @abstractmethod
+    def _extract_pages_and_total_listings(self, soup: BeautifulSoup) -> Tuple[int, int]:
+        """Method to extract the total number of listings and pages."""
```

> **Review comment:** All abstract methods should have a name without the …. Additionally, all the stubs should have a detailed docstring explaining the expected implementation.
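By way of illustration, here is a hypothetical override for a provider that reports its hit count in a counter element; the selector, the text format, and the page size are all invented, not any real provider's markup:

```python
import math
import re
from typing import Tuple
from bs4 import BeautifulSoup

RESULTS_PER_PAGE = 50  # invented page size for this example provider

def _extract_pages_and_total_listings(self, soup: BeautifulSoup) -> Tuple[int, int]:
    """Parse a hypothetical 'Showing 1 to 50 of 230 jobs' counter."""
    count_tag = soup.find('div', id='searchCount')  # made-up selector
    num_res = int(re.sub(r'[^0-9]', '', count_tag.text.split('of')[-1]))
    n_pages = math.ceil(num_res / RESULTS_PER_PAGE)
    return num_res, n_pages
```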
```diff
+    def _get_job_soups_page(self, page: int,
+                            job_soup_dict: Dict[str, BeautifulSoup]) -> None:
+        """Scrapes the indeed page for a list of job soups
+        NOTE: modifies the job_soup_dict in-place
+        NOTE: Indeed's remoteness filter sucks, and we will always see a mix.
+        ... need to add some kind of filtering for this!
+        """
+        # Get the soup for the page
+        r_soup = self._get_search_page_soup(page=page)
+
+        # Add new job listings, skipping any job id we have already seen
+        for job_soup in self._parse_job_listings_to_bs4(r_soup):
+            job_id = self.get(JobField.KEY_ID, job_soup)
+            if job_id not in job_soup_dict.keys():
+                job_soup_dict[job_id] = job_soup
```

> **Review comment:** I'm confused about the difference between this and the above.
>
> **Review comment:** Also, the docstring here refers to Indeed, but this is the base scraper.
```diff
+    def _get_search_page(self, method='get', page: int = 1) -> Response:
+        """Return the response of the initial search request
+
+        Args:
+            method: either 'get' or 'post'
+            page: which page to select (if possible)
+        Returns:
+            a valid Response.
+        """
+        search_stem_url = self._get_search_stem_url()
+        search_args = self._get_search_args()
+
+        # Append the page query (provider-dependent)
+        if page > 1:
+            pg_name, pg_val = self._get_page_query(page)
+            search_args[pg_name] = pg_val
+
+        if method == 'get':
+            # URI-encode the query arguments into the url
+            url = f"{search_stem_url}?{urlencode(search_args)}"
+            return self.session.get(url)
+        elif method == 'post':
+            return self.session.post(search_stem_url, data=search_args)
+        else:
+            raise ValueError(f"Method should be either 'post' or 'get', not {method}")
```
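To make the GET branch concrete, here is what the URL assembly produces for some invented hook return values; the stem url, argument names, and the 50-per-page offset are all assumptions:

```python
from urllib.parse import urlencode

search_stem_url = 'https://jobs.example.com/search'        # _get_search_stem_url()
search_args = {'q': '"python developer"', 'l': 'Toronto'}  # _get_search_args()

page = 3
if page > 1:
    pg_name, pg_val = 'start', str((page - 1) * 50)        # _get_page_query(3)
    search_args[pg_name] = pg_val

url = f"{search_stem_url}?{urlencode(search_args)}"
print(url)
# -> https://jobs.example.com/search?q=%22python+developer%22&l=Toronto&start=100
```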
```diff
+    def _get_search_page_soup(self, method='get', page: int = 1) -> BeautifulSoup:
+        """Wrapper around _get_search_page to obtain the response as a soup."""
+        # Get the response html
+        response = self._get_search_page(method, page)
+
+        # Log the url we scraped
+        self.logger.info(f"Scraped from url: {response.url}")
+
+        return BeautifulSoup(response.text, self.config.bs4_parser)
```
```diff
+    @abstractmethod
+    def _parse_job_listings_to_bs4(self, page_soup: BeautifulSoup
+                                   ) -> List[BeautifulSoup]:
+        """Parse a page of job listings HTML text into job soups."""
+
+    @abstractmethod
+    def _get_search_stem_url(self) -> str:
+        """Get the search stem url for the initial search."""
+
+    @abstractmethod
+    def _get_search_args(self) -> Dict[str, str]:
+        """Get all arguments used for the search query."""
+
+    @abstractmethod
+    def _get_page_query(self, page: int) -> Tuple[str, str]:
+        """Return the page query parameter and value for a specific provider."""
```
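To show how the four hooks fit together, a sketch of a subclass for an invented provider, omitting the remaining abstract members for brevity; every url, selector, attribute name, and parameter below is illustrative, not a real provider's API:

```python
from typing import Dict, List, Tuple
from bs4 import BeautifulSoup

class ExampleProviderScraper(BaseScraper):
    """Hypothetical provider wired into the new abstract hooks."""

    def _parse_job_listings_to_bs4(self,
                                   page_soup: BeautifulSoup) -> List[BeautifulSoup]:
        # Made-up listing-card selector for this example provider
        return page_soup.find_all('div', attrs={'class': 'job-card'})

    def _get_search_stem_url(self) -> str:
        return 'https://jobs.example.com/search'

    def _get_search_args(self) -> Dict[str, str]:
        # self.query already carries the quotes when exact_result is set
        return {'q': self.query, 'l': self.config.search_config.city}

    def _get_page_query(self, page: int) -> Tuple[str, str]:
        # 0-indexed result offset, assuming 50 listings per page
        return 'start', str((page - 1) * 50)
```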
```diff
+
+
 # Just some basic localized scrapers, you can inherit these to set the locale.
 class BaseUSAEngScraper(BaseScraper):
```
> **Review comment:** While I am in favour of implementing abstract classes to get job pages -> listings -> soups, I think we should put this workflow into its own class, such as `BaseMultiPageScraper`, so that we can write scrapers for static web pages that only have a single page (sort of like Monster). I think this way we can handle the single-page, scroll-type job sites with a `BaseSinglePageScraper`.
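For what the reviewer describes, a rough sketch of how the hierarchy could split; only the two class names come from the comment, everything else is guesswork:

```python
from abc import abstractmethod
from typing import List
from bs4 import BeautifulSoup

class BaseMultiPageScraper(BaseScraper):
    """Providers that paginate search results (the workflow added in this PR)."""

    def get_job_soups(self) -> List[BeautifulSoup]:
        # Inherit the page-count + thread fan-out implementation from this PR
        return super().get_job_soups()

class BaseSinglePageScraper(BaseScraper):
    """Providers that serve all listings on one (possibly infinite-scroll) page."""

    @abstractmethod
    def get_job_soups(self) -> List[BeautifulSoup]:
        """Scrape the single results page into job soups."""
```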