In [1]:
from celery import Celery
from celery.schedules import crontab
from structlog import get_logger

from dreamjob.backend.db.utils.collect_data import get_vacancies
from dreamjob.backend.db.utils.preprocess_data import preprocess_data
from dreamjob.backend.db.utils.data_manipulation import insert, select
from dreamjob.backend.db.utils.engine import DBConfig
from dreamjob.backend.commons.exceptions import DataCollectionFailed

logger = get_logger()


def add_new_vacancies(area: int = 2,
                      period: int = 1,
                      per_page: int = 100) -> None:
    """
    Fetch fresh vacancies and insert the ones missing from the database.

    Intended to run periodically (each day in the morning).
    TODO: Add celery task

    Rows of the fetched data that have no ``id`` are dropped before
    preprocessing, so they can never pass the "not in database" filter
    and be inserted as if they were new vacancies.

    :param area: forwarded unchanged to ``get_vacancies`` as ``area``
    :param period: forwarded unchanged to ``get_vacancies`` as ``period``
    :param per_page: forwarded unchanged to ``get_vacancies`` as ``per_page``
    :return: None; the outcome is only reported through the logger

    ``DataCollectionFailed`` is caught and logged here, it is not re-raised.
    """
    try:
        logger.info("Add new vacancies",
                    area=area, period=period, per_page=per_page)

        db_engine = DBConfig.get().db_engine

        # parse vacancies
        vacancies_raw = get_vacancies(area=area, period=period, per_page=per_page)

        # Without an "id" column there is nothing that could be matched
        # against the database, so stop before preprocessing.
        if "id" not in vacancies_raw.columns:
            logger.warning("No vacancies with id were collected")
            return

        # Drop id-less rows first: they would otherwise satisfy
        # "id not in @ids_in_db" below and end up in the insert.
        vacancies = preprocess_data(vacancies_raw.dropna(subset=["id"]))

        # check if vacancy is in database already
        # (ids from the database are cast to str before the comparison)
        ids_in_db = select(columns="id", engine=db_engine)["id"].astype("str")
        vacancies_to_insert = vacancies.query("id not in @ids_in_db")
        # NOTE: already-known vacancies are only counted for the log below,
        # no update is performed in this function.
        vacancies_to_update = vacancies.query("id in @ids_in_db")

        # insert new vacancies
        number_of_new_vacs = insert(vacancies_to_insert, db_engine)

        logger.info("New vacancies were added to table",
                    number_of_new_vacs=number_of_new_vacs,
                    number_of_vacs_to_update=vacancies_to_update.shape[0])

    except DataCollectionFailed:
        # A failed run is not routine information: raise the level and keep
        # the exception details so the cause can be found in the logs.
        logger.warning("Data collection failed", exc_info=True)


In [2]:
# Take the DB engine from the project config and fetch raw vacancies with the
# same values add_new_vacancies uses as defaults (area=2, period=1, per_page=100).
db_engine = DBConfig.get().db_engine
vacancies_raw = get_vacancies(area=2, period=1, per_page=100)

2022-06-21 23:06.53 [info     ] Pages are loaded               vacancies_allowed=2000 vacancies_found=10094
2022-06-21 23:06.53 [info     ] Loading vacancy #0
2022-06-21 23:07.06 [info     ] Vacancies are loaded           vacancies_loaded=100


In [22]:
# Discard rows that have no id, then hand the remainder to the project preprocessing.
vacancies = vacancies_raw.dropna(subset=["id"]).pipe(preprocess_data)

In [23]:
# Ids returned by select() for the "id" column, cast to str
# (presumably to match the dtype of the preprocessed vacancy ids — TODO confirm).
ids_in_db = select(columns="id", engine=db_engine)["id"].astype("str")

In [24]:
# Keep only vacancies whose id is not among ids_in_db;
# the "@" prefix makes query() read the Python variable of that name.
vacancies_to_insert = vacancies.query("id not in @ids_in_db")

In [25]:
# Write the filtered vacancies through the project insert() helper; the cell output
# is its return value (add_new_vacancies logs that value as number_of_new_vacs).
insert(vacancies_to_insert, db_engine)

1480

In [19]:
# Look at the entry keyed 0 in the "errors" column of the raw data;
# the recorded output for this cell is a captcha_required error.
vacancies_raw["errors"][0]

[{'value': 'captcha_required',
  'captcha_url': 'https://hh.ru/account/captcha?state=pxvcxBozfu7ry7R4QCetFpeicJ2Ml%2Bur%2F701s7jX2mZxCzHziryEFFjVgnI%2BvznkyfuahMvjmAOQ8DPG8bnUGWrhRhFUOcv5QA6z0ppC9iyPP%2F6VbmCmTYn%2F9uSfLeTy',
  'type': 'captcha_required'}]