In [4]:
import requests

In [63]:
!pip install webdriver-manager

Collecting webdriver-manager
  Obtaining dependency information for webdriver-manager from https://files.pythonhosted.org/packages/b1/51/b5c11cf739ac4eecde611794a0ec9df420d0239d51e73bc19eb44f02b48b/webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.0 webdriver-manager-4.0.1


In [33]:
from abc import ABC, abstractmethod

from lxml import html, etree
import logging
from fake_headers import Headers

In [8]:
logging.basicConfig(level=logging.DEBUG)

In [48]:
headers = Headers(browser="chrome", os="mac", headers=True).generate()

In [49]:
# headers.update({"Content-Type": "text/html"})

In [50]:
headers

{'Accept': '*/*',
 'Connection': 'keep-alive',
 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
 'DNT': '1',
 'Referer': 'https://google.com'}

In [51]:
class BaseParser:
    _parser = etree.HTMLParser()

    def parse_html(self, html: str):
        try:
            return etree.HTML(html, parser=self._parser)
        except Exception as e:
            logging.exception("Error parsing html", exc_info=e)

    def check_whatsapp(self, html: str) -> bool:
        return html.lower().find("whatsapp") != -1


class AbsParser(ABC, BaseParser):
    BASE_URL = "https://www.example.com"

    @abstractmethod
    def parse(self, url: str):
        raise NotImplementedError

    @abstractmethod
    async def get_html(self, url: str) -> str:
        raise NotImplementedError

    @abstractmethod
    def get_data(self) -> dict[str, str]:
        raise NotImplementedError


In [52]:
class IndeedPageParser(AbsParser):
    BASE_URL = "https://br.indeed.com/"
    _html_dom = None

    def _build_url(self, path: str) -> str:
        logging.debug(f"{self.BASE_URL}{path}")
        return f"{self.BASE_URL}{path}"

    def get_html(self, url: str) -> str:
        url = self._build_url(path=url)
        try:
            resp = requests.post(url, headers=headers)
            logging.debug(resp)
            if resp.status_code != 200:
                logging.error("Bad status code: %s", resp.status_code)
                return resp.text
        except Exception as e:
            logging.exception("Error getting html", exc_info=e)

    def parse(self, url: str) -> dict[str, str]:
        html = self.get_html(url=url)
        if not html:
            logging.error("No html")
            return {}
        if not self.check_whatsapp(html=html):
            logging.error("No whatsapp")
            return {}
        self._html_dom = self.parse_html(html=html)
        return self.get_data()

    def get_data(self) -> dict[str, str]:
        return {
            "vacancy": self.get_vacancy(),
            "company": self.get_company(),
            # "description": self.get_description(),
        }

    def get_vacancy(self) -> str:
        xpath = "//h1/span/text()"
        data = self._html_dom.xpath(xpath)
        # TODO(vadim): raise error if data is empty
        return data[0] if data else ""

    def get_company(self) -> str:
        xpath = "//div[contains(@data-testid, 'inlineHeader-companyName')]/span/a"
        data = self._html_dom.xpath(xpath)
        return data[0] if data else ""


In [53]:
parser = IndeedPageParser()

In [54]:
parser.parse("viewjob?jk=481aca96904bbe62&tk=1hcfksg1ek2ns801&from=serp&vjs=3")

DEBUG:root:https://br.indeed.com/viewjob?jk=481aca96904bbe62&tk=1hcfksg1ek2ns801&from=serp&vjs=3
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): br.indeed.com:443
DEBUG:urllib3.connectionpool:https://br.indeed.com:443 "POST /viewjob?jk=481aca96904bbe62&tk=1hcfksg1ek2ns801&from=serp&vjs=3 HTTP/1.1" 403 None
DEBUG:root:<Response [403]>
ERROR:root:Bad status code: 403
ERROR:root:No whatsapp


{}

In [86]:
from selenium import webdriver

In [87]:
options = webdriver.ChromeOptions()
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
options.add_argument('--headless')

In [88]:
driver = webdriver.Remote(
    command_executor="http://127.0.0.1:4444/wd/hub",
    options=webdriver.ChromeOptions()
)

DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:4444/wd/hub/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "pageLoadStrategy": "normal", "goog:chromeOptions": {"extensions": [], "args": []}}}}
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 127.0.0.1:4444
DEBUG:urllib3.connectionpool:http://127.0.0.1:4444 "POST /wd/hub/session HTTP/1.1" 200 1080
DEBUG:selenium.webdriver.remote.remote_connection:Remote response: status=200 | data={
  "value": {
    "sessionId": "796f97a30d60a5c1b7d631dd550a688b",
    "capabilities": {
      "acceptInsecureCerts": false,
      "browserName": "chrome",
      "browserVersion": "87.0.4280.141",
      "chrome": {
        "chromedriverVersion": "87.0.4280.141 (9f05d1d9ee7483a73e9fe91ddcb8274ebcec9d7f-refs\u002fbranch-heads\u002f4280@{#2007})",
        "userDataDir": "\u002ftmp\u002f.org.chromium.Chromium.KpHAJO"
      },
      "goog:chromeOptions": {
        "debuggerAddress": "loc

In [89]:
driver.get("https://br.indeed.com/viewjob?jk=481aca96904bbe62&tk=1hcfksg1ek2ns801&from=serp&vjs=3")

DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:4444/wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/url {"url": "https://br.indeed.com/viewjob?jk=481aca96904bbe62&tk=1hcfksg1ek2ns801&from=serp&vjs=3"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:4444 "POST /wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/url HTTP/1.1" 200 14
DEBUG:selenium.webdriver.remote.remote_connection:Remote response: status=200 | data={"value":null} | headers=HTTPHeaderDict({'content-length': '14', 'cache-control': 'no-cache', 'Content-Type': 'application/json; charset=utf-8'})
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request


In [90]:
from selenium.webdriver.common.by import By

In [96]:
# result = driver.find_element(By.XPATH, "//h1/span/text()")


In [97]:
html = driver.page_source

DEBUG:selenium.webdriver.remote.remote_connection:GET http://127.0.0.1:4444/wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/source {}
DEBUG:urllib3.connectionpool:http://127.0.0.1:4444 "GET /wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/source HTTP/1.1" 200 221654
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request


In [98]:
html



In [None]:
IndeedPageParser().get_data

In [101]:
result = driver.find_element_by_xpath("//h1/span/text()")

AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath'

In [102]:
result = driver.find_element(By.XPATH, "//h1/span/text()")

DEBUG:selenium.webdriver.remote.remote_connection:POST http://127.0.0.1:4444/wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/element {"using": "xpath", "value": "//h1/span/text()"}
DEBUG:urllib3.connectionpool:http://127.0.0.1:4444 "POST /wd/hub/session/796f97a30d60a5c1b7d631dd550a688b/element HTTP/1.1" 400 912
DEBUG:selenium.webdriver.remote.remote_connection:Remote response: status=400 | data={"value":{"error":"invalid selector","message":"invalid selector: The result of the xpath expression \"//h1/span/text()\" is: [object Text]. It should be an element.\n  (Session info: chrome=87.0.4280.141)","stacktrace":"#0 0xaaaac021b178 \u003Cunknown>\n#1 0xaaaac01c3c1c \u003Cunknown>\n#2 0xaaaac002a02c \u003Cunknown>\n#3 0xaaaac002ce1c \u003Cunknown>\n#4 0xaaaac002cc78 \u003Cunknown>\n#5 0xaaaac002cedc \u003Cunknown>\n#6 0xaaaabff96d1c \u003Cunknown>\n#7 0xaaaabffc4bb0 \u003Cunknown>\n#8 0xaaaabff8dd0c \u003Cunknown>\n#9 0xaaaabff8f190 \u003Cunknown>\n#10 0xaaaac01eb7d8 \u003Cunknown>\n#11 0x

InvalidSelectorException: Message: invalid selector: The result of the xpath expression "//h1/span/text()" is: [object Text]. It should be an element.
  (Session info: chrome=87.0.4280.141); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalid-selector-exception
Stacktrace:
#0 0xaaaac021b178 <unknown>
#1 0xaaaac01c3c1c <unknown>
#2 0xaaaac002a02c <unknown>
#3 0xaaaac002ce1c <unknown>
#4 0xaaaac002cc78 <unknown>
#5 0xaaaac002cedc <unknown>
#6 0xaaaabff96d1c <unknown>
#7 0xaaaabffc4bb0 <unknown>
#8 0xaaaabff8dd0c <unknown>
#9 0xaaaabff8f190 <unknown>
#10 0xaaaac01eb7d8 <unknown>
#11 0xaaaac01f84b8 <unknown>
#12 0xaaaac01f8224 <unknown>
#13 0xaaaac01fc500 <unknown>
#14 0xaaaac01f8ac8 <unknown>
#15 0xaaaac01e1474 <unknown>
#16 0xaaaac020f128 <unknown>
#17 0xaaaac0225754 <unknown>
#18 0xffffbaaec7e4 start_thread
#19 0xffffba043adc <unknown>
