In [24]:
import pandas as pd
import requests
import json

from typing import Tuple
from pandas import DataFrame

from dreamjob.config import settings
from structlog import get_logger

logger = get_logger()
DISPLAY_COLS = settings.COLS


def get_page(params: dict, timeout: float = 30.0) -> Tuple[DataFrame, int, int]:
    """Get vacancy urls from a single page

    Args:
        params: parameters for hh.ru data request (sent as the query string)
        timeout: seconds to wait for the server before giving up

    Returns:
        Dataframe built from the "items" field of the response
        Value of the "pages" field (max number of pages allowed, <=20)
        Value of the "found" field (number of found vacancies)

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
        requests.Timeout: if the server does not answer within ``timeout``

    """
    # Without a timeout ``requests`` may block indefinitely on a stalled
    # connection; the query is passed by keyword to make its role explicit.
    response = requests.get('https://api.hh.ru/vacancies', params=params, timeout=timeout)

    # Fail here, with the real HTTP status, rather than later with an
    # unrelated KeyError/JSON error when the body is not a vacancy listing.
    response.raise_for_status()
    json_file = response.json()

    return pd.DataFrame(json_file["items"]), json_file["pages"], json_file["found"]


def get_pages(params: dict) -> DataFrame:
    """Get vacancy urls from several pages

    Requests pages 0, 1, ... and stops after the last page reported by the
    API or after 20 pages, whichever comes first.

    Args:
        params: parameters for hh.ru data request; the dict is left unmodified

    Returns:
        Dataframe with the "id" and "url" columns of every collected vacancy
        (an empty frame with those two columns when no vacancy was returned)

    """
    max_page_requests = 20  # upper bound on the number of pages requested
    max_vacancies = 2000    # cap reported as "vacancies_allowed" in the log

    # Work on a copy so the caller's dict does not silently gain a "page" key.
    request_params = dict(params)
    page_frames = []
    max_pages = vacancies_found = 0

    for page in range(max_page_requests):
        request_params["page"] = page

        page_info, max_pages, vacancies_found = get_page(request_params)
        page_frames.append(page_info)

        # Page numbers are 0-based: stop once the last available page is read.
        if max_pages - page <= 1:
            break

    logger.info("Pages are loaded",
                max_pages=max_pages,
                vacancies_found=vacancies_found,
                vacancies_allowed=min(max_vacancies, vacancies_found))

    # Concatenate once instead of re-copying a growing frame on every page.
    vacancy_urls = pd.concat(page_frames)

    if vacancy_urls.empty:
        # A result without items has no columns at all, so selecting
        # ["id", "url"] would raise KeyError; return the expected shape instead.
        return pd.DataFrame(columns=["id", "url"])

    return vacancy_urls[["id", "url"]]


def get_vacancies(area: int = 2,
                  period: int = 1,
                  per_page: int = 100
                  ) -> DataFrame:
    """Get full vacancy descriptions

    Collects vacancy urls with ``get_pages`` and then downloads every vacancy
    individually (one HTTP request per vacancy). Vacancies whose request
    returns an error status are logged and skipped.

    Args:
        area: id of area (city, country, etc)
        period: number of days to include when requesting data
        per_page: number of vacancies per page (<=100)

    Returns:
        Dataframe with full vacancy descriptions, restricted to DISPLAY_COLS
        (an empty frame with those columns when nothing could be loaded)

    Raises:
        requests.RequestException: on connection problems or timeouts
        KeyError: if a column from DISPLAY_COLS is absent from the loaded data

    """
    params = {
        "area": area,  # Saint-Petersburg id: 2
        "period": period,
        "per_page": per_page,
    }

    vacancy_urls = get_pages(params)
    records = []
    skipped = 0

    # A single session reuses the connection instead of opening a new one for
    # every vacancy request.
    with requests.Session() as session:
        for ind, url in enumerate(vacancy_urls["url"]):
            if ind % 50 == 0:
                logger.info(f"Loading vacancy #{ind}")

            # The timeout keeps one stalled request from blocking the whole run.
            response = session.get(url, timeout=30)

            if not response.ok:
                # An error payload is not a vacancy; normalising it would add
                # a meaningless row, so record the failure and move on.
                skipped += 1
                logger.warning("Vacancy is skipped",
                               url=url,
                               status_code=response.status_code)
                continue

            records.append(response.json())

    logger.info("Vacancies are loaded",
                vacancies_loaded=len(records),
                vacancies_skipped=skipped)

    if not records:
        # Nothing to normalise: selecting DISPLAY_COLS from a column-less frame
        # would raise KeyError, so return an empty frame of the expected shape.
        return pd.DataFrame(columns=list(DISPLAY_COLS))

    # Flatten all records in one pass instead of concatenating one-row frames
    # inside the loop; nested fields become dot-separated column names.
    vacancy_df = pd.json_normalize(records)

    return vacancy_df[DISPLAY_COLS]


In [25]:
# Download vacancies using the defaults of get_vacancies (area=2, period=1, per_page=100).
# NOTE: one HTTP request is made per vacancy, so this cell is slow; the recorded
# run below logged roughly 8 seconds per 50 vacancies.
df = get_vacancies()

2022-05-29 18:34.04 [info     ] Pages are loaded               max_pages=20 vacancies_allowed=2000 vacancies_found=4452
2022-05-29 18:34.04 [info     ] Loading vacancy #0
2022-05-29 18:34.12 [info     ] Loading vacancy #50
2022-05-29 18:34.20 [info     ] Loading vacancy #100
2022-05-29 18:34.28 [info     ] Loading vacancy #150
2022-05-29 18:34.36 [info     ] Loading vacancy #200
2022-05-29 18:34.44 [info     ] Loading vacancy #250
2022-05-29 18:34.52 [info     ] Loading vacancy #300
2022-05-29 18:35.01 [info     ] Loading vacancy #350
2022-05-29 18:35.09 [info     ] Loading vacancy #400
2022-05-29 18:35.20 [info     ] Loading vacancy #450
2022-05-29 18:35.28 [info     ] Loading vacancy #500
2022-05-29 18:35.36 [info     ] Loading vacancy #550
2022-05-29 18:35.44 [info     ] Loading vacancy #600
2022-05-29 18:35.53 [info     ] Loading vacancy #650
2022-05-29 18:36.01 [info     ] Loading vacancy #700
2022-05-29 18:36.09 [info     ] Loading vacancy #750
2022-05-29 18:36.17 [info     ] Loa

In [26]:
# Columns to show when previewing the loaded vacancies.
# Dotted names (e.g. "employer.name") are nested fields flattened by
# pd.json_normalize inside get_vacancies.
cols = [
    "id", 
    "premium",
    "name",
    "description",
    "key_skills",
    "accept_handicapped",
    "accept_kids",
    "archived",
    "specializations",
    "professional_roles",
    "published_at",
    "created_at",
    "alternate_url",
    "billing_type.id",
    "billing_type.name",
    "experience.id",
    "experience.name",
    "schedule.id",
    "schedule.name",
    "employment.id",
    "employment.name",
    "employer.id",
    "employer.name",
    "employer.url",
    "employer.alternate_url",
    "employer.logo_urls.original",
    "employer.logo_urls.240",
    "employer.logo_urls.90",
    "employer.vacancies_url",
    "employer.trusted",
    "address.city",
    "address.street",
    "address.building",
    "address.description",
    "address.lat",
    "address.lng",
    "address.raw",
    "salary",
    "salary.currency",
    "salary.from",
    "salary.to", 
    "salary.gross",   
]

In [27]:
# Preview the first rows of the loaded data, restricted to the columns in `cols`.
df[cols].head()

Unnamed: 0,id,premium,name,description,key_skills,accept_handicapped,accept_kids,archived,specializations,professional_roles,...,address.building,address.description,address.lat,address.lng,address.raw,salary,salary.currency,salary.from,salary.to,salary.gross
0,55458115,False,Продавец-консультант по работе с VIP-клиентами...,<p>Компания &quot;Aurum Queen&quot; занимается...,"[{'name': 'Активные продажи'}, {'name': 'Грамо...",False,False,False,"[{'id': '17.256', 'name': 'Розничная торговля'...","[{'id': '97', 'name': 'Продавец-консультант, п...",...,метро Гостиный двор,,59.934069,30.332765,"Санкт-Петербург, 3 линия, метро Гостиный двор",,RUR,60000.0,100000.0,False
1,54467120,False,Менеджер отдела продаж,<p><strong>Поучаствуйте в конкурсном отборе на...,[{'name': 'Консультирование клиентов по телефо...,True,False,False,"[{'id': '17.242', 'name': 'Прямые продажи', 'p...","[{'id': '70', 'name': 'Менеджер по продажам, м...",...,10,,59.939964,30.433907,"Санкт-Петербург, проспект Энергетиков, 10",,RUR,90000.0,,False
2,48000033,False,Агент с обучением,"<p><em>Привет, наш будущий коллега</em>❗❗<br /...","[{'name': 'Работа в команде'}, {'name': 'Обуче...",False,False,False,"[{'id': '17.256', 'name': 'Розничная торговля'...","[{'id': '40', 'name': 'Другое'}]",...,26,,59.946752,30.354235,"Санкт-Петербург, улица Чайковского, 26",,RUR,120000.0,550000.0,False
3,66172161,False,Оператор call-центра,<strong>Что надо будет делать</strong> <ul> <l...,"[{'name': 'Поиск и привлечение клиентов'}, {'n...",False,False,False,"[{'id': '4.47', 'name': 'Ввод данных', 'profar...","[{'id': '83', 'name': 'Оператор call-центра, с...",...,,,,,,,RUR,30000.0,50000.0,False
4,54691263,False,Прораб (дорожные работы и благоустройство),"<p><strong>Привет, наш будущий Прораб.</strong...","[{'name': 'Управление персоналом'}, {'name': '...",False,False,False,"[{'id': '20.287', 'name': 'Строительство', 'pr...","[{'id': '100', 'name': 'Прораб, мастер СМР'}]",...,4,,59.992852,30.327501,"Санкт-Петербург, Сердобольская улица, 4",,RUR,120000.0,,False


In [28]:
# Save the loaded vacancies as a pickle file; the relative path resolves
# against the current working directory.
df.to_pickle("vacancies_29-05-2022.p")

In [15]:
# TODO: when making a recommendation, check whether the vacancy has been archived