In [1]:
import csv
import os
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup


def get_page(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address to fetch; surrounding whitespace is stripped.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded response body (``Response.text``).
    """
    # Adjacent string literals are joined at compile time. The previous
    # backslash continuation *inside* the literal kept the next line's
    # indentation as part of the header value.
    headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/39.0.2171.95 Safari/537.36')}
    r = requests.get(url.strip(), headers=headers, timeout=timeout)
    return r.text


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the tag is None."""
    return tag.get_text() if tag is not None else ''


def get_vacancy_data(url):
    """Download one vacancy page and extract its fields.

    Args:
        url: Address of the vacancy page.

    Returns:
        A dict with the keys ``vacancy``, ``employer``, ``salary`` (a dict
        with ``currency``, ``min_value`` and ``max_value``), ``description``,
        ``skills`` (list of strings) and ``link`` (the ``url`` argument as
        given). Fields whose markup is not found keep their defaults:
        empty string, empty list, or NaN for the salary bounds.
    """
    soup = BeautifulSoup(get_page(url.strip()), 'lxml')

    # A missing employer or title element leaves the '' default instead of
    # raising AttributeError, which would abort the whole Pool.map run.
    employer_link = soup.find('a', class_='vacancy-company-name')
    employer_span = employer_link.find('span') if employer_link is not None else None
    empl_name = _tag_text(employer_span)
    vacancy_name = _tag_text(soup.find('h1', class_='header'))

    vacancy_salary = {'currency': '', 'min_value': float('NaN'), 'max_value': float('NaN')}
    salary_tag = soup.find('p', class_='vacancy-salary')
    if salary_tag is not None:
        # The <meta> tags are read positionally as currency, min, max.
        # NOTE(review): when a vacancy publishes only one bound, the single
        # value lands in min_value -- confirm against the page markup.
        salary_keys = ('currency', 'min_value', 'max_value')
        for key, meta in zip(salary_keys, salary_tag.find_all('meta')):
            try:
                vacancy_salary[key] = meta['content']
            except KeyError:
                # Stop at the first tag without a content attribute; values
                # already stored are kept and the rest keep their defaults.
                break

    vacancy_skills = [item.get_text()
                      for item in soup.find_all('span', class_='Bloko-TagList-Text')]

    vacancy_desc = ''
    desc_block = soup.find('div', class_='g-user-content')
    if desc_block is not None:
        for child in desc_block.contents:
            try:
                vacancy_desc += child.get_text().strip() + ' \n '
            except AttributeError:
                # Child nodes that do not provide get_text() are skipped.
                continue

    return {'vacancy': vacancy_name, 'employer': empl_name, 'salary': vacancy_salary,
            'description': vacancy_desc, 'skills': vacancy_skills, 'link': url}


def get_pages_links(url):
    """Build the list of result-page URLs for a search query.

    The first result page is downloaded and the text of the last pager
    button is read as the total number of pages.

    Args:
        url: Search URL without a ``page`` parameter; it must already
            contain a query string because ``&page=N`` is appended to it.

    Returns:
        A list of URLs, one per result page, numbered from 0.
    """
    base_url = url.strip()
    soup = BeautifulSoup(get_page(base_url + '&page=0'), 'lxml')
    pager_buttons = soup.find_all('a', class_='bloko-button HH-Pager-Control')
    try:
        pages_count = int(pager_buttons[-1].get_text())
    except (IndexError, ValueError):
        # No pager buttons (or a non-numeric last button): treat the search
        # as a single page so that page 0 is still scraped.
        pages_count = 1
    # '&page=' -- the '=' was missing before, so the generated links did not
    # match the '&page=0' form used for the probe request above.
    return [base_url + '&page=' + str(i) for i in range(pages_count)]


def get_vacancies_links(url):
    """Return the vacancy links found on one search-result page.

    Args:
        url: Address of the result page; surrounding whitespace is stripped.

    Returns:
        A list with the stripped ``href`` of every ``<a>`` element whose
        class is ``bloko-link HH-LinkModifier``, in document order.
    """
    page_html = get_page(url.strip())
    anchors = BeautifulSoup(page_html, 'lxml').find_all('a', class_='bloko-link HH-LinkModifier')
    return [anchor['href'].strip() for anchor in anchors]


def get_all_links(url):
    """Collect the vacancy links from every result page of a search.

    Args:
        url: Search URL passed on to ``get_pages_links``.

    Returns:
        A flat list of vacancy links, page by page, in the order the pages
        and links were returned.
    """
    return [vac_link
            for page_link in get_pages_links(url.strip())
            for vac_link in get_vacancies_links(page_link)]


def write_csv(data):
    """Append one vacancy record to ``hh.csv`` as a tab-separated row.

    Args:
        data: Dict with the keys ``vacancy``, ``employer``, ``salary`` (a
            dict with ``min_value``, ``max_value`` and ``currency``),
            ``description``, ``skills`` and ``link``.

    The column order is: vacancy, employer, salary min, salary max,
    currency, description, skills, link.
    """
    salary = data['salary']
    row = (data['vacancy'],
           data['employer'],
           salary['min_value'],
           salary['max_value'],
           salary['currency'],
           data['description'],
           data['skills'],
           data['link'])
    # newline='' lets the csv module control line endings, as its
    # documentation requires; otherwise rows get an extra '\r' on Windows.
    # UTF-8 is explicit so non-ASCII text does not depend on the platform's
    # locale encoding.
    with open('hh.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='\t', quotechar='|').writerow(row)

        
def make_parse(url):
    """Scrape the vacancy at ``url`` and pass the record to ``write_csv``."""
    write_csv(get_vacancy_data(url))
    

def main():
    """Scrape all vacancies of the hard-coded search into ``hh.csv``.

    Collects the vacancy links of the search, writes the CSV header if the
    file does not have one yet, then scrapes the vacancies in a pool of 10
    worker processes, each of which appends its row to the file.
    """
    # Adjacent string literals are joined at compile time. The previous
    # backslash continuation *inside* the literal put the next line's
    # indentation (spaces) into the middle of the query string.
    url = ('https://spb.hh.ru/search/vacancy?order_by=publication_time&specialization=15.93'
           '&area=2&only_with_salary=true&search_period=7')
    all_links = get_all_links(url)

    # Rows are appended, so the header is written only when the file is new
    # or empty; otherwise every re-run would add a header line in the middle
    # of the data.
    needs_header = not os.path.exists('hh.csv') or os.path.getsize('hh.csv') == 0
    if needs_header:
        with open('hh.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter='\t', quotechar='|')
            writer.writerow(['vacancy', 'employer', 'salary min', 'salary max',
                             'currency', 'description', 'skills', 'link'])

    with Pool(10) as p:
        p.map(make_parse, all_links)
    print("done")
    
        
# Start the scraper only when this code runs as the main module (which is
# the case for a notebook cell); importing it elsewhere does not trigger it.
if __name__ == '__main__':
    main()
    

done


In [2]:
import pandas as pd
# Load the scraped vacancies (tab-separated, '|' as the quote character)
# and preview rows 10 through 20 by position.
HH = pd.read_csv('hh.csv', sep='\t', quotechar='|')
HH.iloc[10:21]

Unnamed: 0,vacancy,employer,salary min,salary max,currency,description,skills,link
10,English translator/rewriter,Livingston Research Group,800,1000.0,USD,Livingston Research delivers help and support ...,[],https://career.ru/vacancy/27746880
11,Project Manager в направление внедрение crm-си...,Devellab,35000,70000.0,RUR,Хотите стать востребованным профессионалом в i...,"['Agile Project Management', 'Scrum', 'перегов...",https://career.ru/vacancy/27756516
12,Специалист контакт-центра (входящие звонки),SkyNet,20000,40000.0,RUR,,"['Грамотная речь', 'Пользователь ПК', 'Грамотн...",https://career.ru/vacancy/25385522
13,Начинающий специалист,SkyNet,30000,,RUR,,"['Прямые продажи', 'Активные продажи', 'Провед...",https://career.ru/vacancy/26236354
14,Оператор сканирования,Корпорация ЭЛАР,25000,35000.0,RUR,"В корпорации ЭЛАР (крупная ИТ-компания, лидиру...","['MS Outlook', 'Пользователь ПК', 'MS Excel', ...",https://career.ru/vacancy/27347425
15,Продавец-консультант,Билайн: Офисы продаж,44000,,RUR,,[],https://career.ru/vacancy/22995857
16,Менеджер по продажам,Saleset,45000,85000.0,RUR,Saleset помогает бизнесу привлекать заявки с о...,"['Холодные продажи', 'Ведение переговоров', 'А...",https://career.ru/vacancy/27340844
17,Специалист прямых продаж,ПАО МТС,25000,90000.0,RUR,,[],https://career.ru/vacancy/23528966
18,Продавец-консультант (Крестовский остров),Билайн: Офисы продаж,44000,,RUR,,[],https://career.ru/vacancy/26687575
19,Продавец-консультант (м. Улица Дыбенко),Билайн: Офисы продаж,44000,,RUR,,[],https://career.ru/vacancy/27527913
