In [1]:
import requests
from bs4 import BeautifulSoup as BS
from pprint import pprint
from pandas import json_normalize
import numpy as np
import pandas as pd

from pymongo import MongoClient
from pymongo.errors import *
import hashlib
import time

In [2]:
class HH_scraper():
    
    def __init__(self, suffix):
        
        self.url = 'https://hh.ru/search/vacancy?area=1&text='
        self.suffix = suffix.replace(' ', '+')
        self.page = self.url + self.suffix
        self.headers = {'User-Agent': 'Mozilla / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit / 537.36 (KHTML, like Gecko) Chrome / 81.0.4044.92 Safari / 537.36'}
        self.dom = None
        self.client = MongoClient('localhost', 27017)
        self.db_hh = self.client['db_hh']
        self.vacancies = self.db_hh.vacancies
        self.dup_key_error_cnt = 0
        
    
    def run_scrapping(self):
        
        while(True):
            if self.page:
                self.get_dom_and_next_page()
                self.set_data_into_db()
            else:
                self.set_data_into_db()
                print(f'Уже существующие вакансии не заносятся в базу. Таких вакансий: {self.dup_key_error_cnt}')
                self.dup_key_error_cnt = 0
                break
                
        time.sleep(1)
    
    
    def get_dom_and_next_page(self):
  
        response = requests.get(self.page, headers=self.headers)
        self.dom = BS(response.text, 'html.parser')

        try:
            self.page = self.url + self.dom.find('a', {'class': 'bloko-button', 'data-qa': 'pager-next'}).get('href')
        except:
            self.page = None
            
        
    def get_address(self, link):
        
        try:
            link_response = requests.get(link, headers=self.headers)
            link_dom = BS(link_response.text, 'html.parser')
            address = link_dom.find('span', {'data-qa': 'vacancy-view-raw-address'}).text
        except:
            address = None
        
        return address
                
                
    def set_data_into_db(self):
        
        jobs = self.dom.find_all('div', {'class':'vacancy-serp-item vacancy-serp-item_redesigned'})
        
        for job in jobs:
            
            vacancy = job.find('a').text
            link = job.find('a').get('href')
            employer = job.find('a', {'class': 'bloko-link bloko-link_kind-tertiary'}).text

            try:
                salary_split = job.find('span', {'class': 'bloko-header-section-3', 'data-qa': 'vacancy-serp__vacancy-compensation'}).text.split()
            except:
                salary_split = None

            if salary_split:
                currency = salary_split[-1]

                len_salary_split = len(salary_split)

                if len_salary_split == 6: # 10 000 - 20 000 руб. 
                    salary_min = int(f'{salary_split[0]}{salary_split[1]}')
                    salary_max = int(f'{salary_split[3]}{salary_split[4]}')
                if len_salary_split == 3: # от/до 800 руб.
                    if salary_split[0] == 'от':
                        salary_min = int(salary_split[1])
                        salary_max = None
                    else:
                        salary_max = int(salary_split[1])
                        salary_min = None
                if len_salary_split == 4: # (1. от/до 10 000 руб.) и (2. 800 - 900 руб.)
                    if salary_split[0] == 'от':
                        salary_min = int(f'{salary_split[1]}{salary_split[2]}')
                        salary_max = None
                    elif salary_split[0] == 'до':
                        salary_max = int(f'{salary_split[1]}{salary_split[2]}')
                        salary_min = None
                    else:
                        salary_min = int(salary_split[0])
                        salary_max = int(salary_split[2])
            else:
                salary_min = None
                salary_max = None
                currency = None
            
            address = self.get_address(link)
            for_hash = f'{salary_min}{salary_max}{currency}{vacancy}{link}{employer}{address}'
            
            doc = {'_id': hashlib.sha1(for_hash.encode('utf-8')).hexdigest(),
                  'salary_min': salary_min,
                  'salary_max': salary_max,
                  'currency': currency,
                  'vacancy': vacancy,
                  'link': link,
                  'employer': employer,
                  'source': self.url.split('/')[2],
                  'address': address}
            
            try:
                self.vacancies.insert_one(doc)
            except DuplicateKeyError as e:
                self.dup_key_error_cnt += 1

In [3]:
scrapper = HH_scraper('кассир продавец')
scrapper.run_scrapping()

Уже существующие вакансии не заносятся в базу. Таких вакансий: 73


In [4]:
client = MongoClient('localhost', 27017)
db_hh = client['db_hh']
vacancies = db_hh.vacancies

In [5]:
def salary_is_more(num):
    
    for doc in vacancies.find({'$or': [{'salary_min': {'$gt': num}}, {'salary_max': {'$gt': num}}]}):
        pprint(doc)
        
salary_is_more(95000)

{'_id': 'd7b17a81bd6415204f7348e85a72e121320caea5',
 'address': 'Молодежная, Москва, Сколковское шоссе (Москва), 31',
 'currency': 'руб.',
 'employer': 'ООО\xa0ProБагаж',
 'link': 'https://hh.ru/vacancy/53020980?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=%D0%BA%D0%B0%D1%81%D1%81%D0%B8%D1%80%20%D0%BF%D1%80%D0%BE%D0%B4%D0%B0%D0%B2%D0%B5%D1%86',
 'salary_max': 100000,
 'salary_min': 40000,
 'source': 'hh.ru',
 'vacancy': 'Продавец-кассир'}
{'_id': '13f40a21652c1cc5c94044028614a604259e3f44',
 'address': 'Мякинино, Спартак, Строгино, Москва, Арбатско-Покровская линия, '
            'метро Строгино',
 'currency': 'руб.',
 'employer': 'Компания «СПОРТМАСТЕР», Розничные продажи',
 'link': 'https://hh.ru/vacancy/51132295?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=%D0%BA%D0%B0%D1%81%D1%81%D0%B8%D1%80%20%D0%BF%D1%80%D0%BE%D0%B4%D0%B0%D0%B2%D0%B5%D1%86',
 'salary_max': 100000,
 'salary_min': 65000,
 'source': 'hh.ru',
 'vacancy': 'Продавец-кассир (м. Строгино)'}
{