In [None]:
import requests
import time
import os


class HHDownloader:
    """
    Download HTML search-result pages which contain links to resumes.

    Pages are switched automatically by substituting the page number into
    the URL template.
    """
    def __init__(self, start_url_template: str, data_path: str, timeout=10):
        """
        :param start_url_template: URL with a ``str.format`` placeholder
            (e.g. ``{0}``) that receives the page number
        :param data_path: directory where the downloaded pages are stored
        :param timeout: pause in seconds made after each downloaded page
        """
        self.start_url_template = start_url_template
        self.headers = {'User-Agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.timeout = timeout
        self.data_path = data_path

    def _page_file_name(self, page_num: int) -> str:
        """Return the path of the file that stores page ``page_num``."""
        return os.path.join(self.data_path, 'resume_page_{0}.html'.format(page_num))

    def check_if_exists(self, page_num: int):
        """Check if the page was already downloaded"""
        return os.path.exists(self._page_file_name(page_num))

    def download_pages(self, start_page: int, count: int):
        """
        Download ``count`` consecutive pages starting with ``start_page``.

        Pages whose file already exists in ``data_path`` are skipped.
        """
        # Iterate over the real page numbers so that the existence check,
        # the requested URL, the saved file and the log messages all refer
        # to the same page.
        for page_num in range(start_page, start_page + count):
            if self.check_if_exists(page_num):
                print(f"File {'resume_page_{0}.html'.format(page_num)} already exists. Skip")
                continue
            print(f"Start downloading {page_num} page")
            page = self.download_page(self.get_page_url(page_num))
            print(f"Downloaded {page_num} page")
            self.save_page(page, page_num)
            print(f"Saved {page_num} page")
            print("*" * 20)
            # Pause between requests.
            time.sleep(self.timeout)

    def get_page_url(self, page_num: int):
        """Changes the page number in the link"""
        return self.start_url_template.format(page_num)

    def download_page(self, url: str):
        """Request ``url`` with the configured headers and return the response."""
        page = requests.get(url, headers=self.headers)
        return page

    def save_page(self, page, page_num: int):
        """Save the HTML page with page number in title to chosen directory"""
        with open(self._page_file_name(page_num), 'w') as resume_request:
            resume_request.write(page.text)


if __name__ == "__main__":
    # Search URL templates used for earlier runs are kept below for reference;
    # only the last, uncommented one is active.
    #start_url = "https://hh.ru/search/resume?text=аналитик+данных&exp_period=all_time&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&page={0}"
    #start_url = "https://hh.ru/search/resume?text=Python+разработчик&from=suggest_post&pos=full_text&logic=normal&exp_period=all_time&ored_clusters=true&order_by=relevance&search_period=0&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=Java+разработчик&from=suggest_post&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=Data+science&from=suggest_post&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=Android+разработчик&from=suggest_post&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=devops+инженер&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=Ml+инженер&from=suggest_post&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=ios-разработчик&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    #start_url = "https://hh.ru/search/resume?text=1-c+программист&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    start_url = "https://hh.ru/search/resume?text=Net+разработчик&from=suggest_post&ored_clusters=true&order_by=relevance&search_period=0&logic=normal&pos=full_text&exp_period=all_time&hhtmFrom=resume_search_result&hhtmFromLabel=resume_search_line&page={0}"
    hh_downloader = HHDownloader(
        start_url_template=start_url,
        data_path="/content/drive/MyDrive/saved_html",
        timeout=20,
    )
    # Request 12 pages, starting with page number 108.
    hh_downloader.download_pages(start_page=108, count=12)

Start downloading 0 page
Downloaded 0 page
Saved 0 page
********************
Start downloading 1 page
Downloaded 1 page
Saved 1 page
********************
Start downloading 2 page
Downloaded 2 page
Saved 2 page
********************
Start downloading 3 page
Downloaded 3 page
Saved 3 page
********************
Start downloading 4 page
Downloaded 4 page
Saved 4 page
********************
Start downloading 5 page
Downloaded 5 page
Saved 5 page
********************
Start downloading 6 page
Downloaded 6 page
Saved 6 page
********************
Start downloading 7 page
Downloaded 7 page
Saved 7 page
********************
Start downloading 8 page
Downloaded 8 page
Saved 8 page
********************
Start downloading 9 page
Downloaded 9 page
Saved 9 page
********************
Start downloading 10 page
Downloaded 10 page
Saved 10 page
********************
Start downloading 11 page
Downloaded 11 page
Saved 11 page
********************


In [None]:
import re
from bs4 import BeautifulSoup


def write_links(page: str, pattern: str, start: int, end: int):
    """Collect resume ids from the downloaded search-result pages.

    For every page number from ``start`` to ``end`` (both inclusive) the file
    ``page.format(number)`` is parsed, and the first group of ``pattern``
    matched against the ``href`` of each resume title link is appended to
    ``id_list.txt``, one id per line.

    :param page: file path template with a placeholder for the page number
    :param pattern: regular expression whose group 1 captures the resume id
    :param start: first page number to process
    :param end: last page number to process (inclusive)
    """
    id_pattern = re.compile(pattern)
    title_attrs = {"class": "bloko-link", "data-qa": "serp-item__title"}
    # Open the output once instead of re-opening it for every page.
    with open('id_list.txt', 'a') as id_list:
        for i in range(start, end + 1):
            with open(page.format(i), 'r') as resume_request:
                soup = BeautifulSoup(resume_request, features="lxml")
            for link in soup.find_all("a", attrs=title_attrs):
                # Skip links without href or with an href that does not
                # contain an id, instead of aborting the whole run.
                match = id_pattern.search(link.get('href', ''))
                if match is None:
                    continue
                id_list.write(match.group(1) + '\n')

# Extract resume ids from the saved search pages 108..119 into id_list.txt.
write_links(
    page='/content/drive/MyDrive/saved_html/resume_page_{0}.html',
    pattern=r'resume\/(.+)\?',
    start=108,
    end=119,
)

In [None]:
import requests
import time
import os


class HHResumeDownloader:
    """Download resume pages using a list of unique resume ids."""
    def __init__(self, page_url: str, data_path: str, number: int, timeout=10):
        """
        :param page_url: resume URL template with a ``str.format`` placeholder
            (e.g. ``{0}``) that receives the resume id
        :param data_path: directory where the downloaded pages are stored
        :param number: how many pages to download during one run
        :param timeout: pause in seconds between two downloads
        """
        self.page_url = page_url
        self.headers = {'User-Agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.timeout = timeout
        self.data_path = data_path
        self.number = number
        # NOTE(review): only the 'http' scheme is mapped here, so requests
        # presumably does not route https:// URLs through this proxy - confirm.
        self.proxy = {'http': 'http://161.202.226.195:8123'}

    def check_if_exists(self, resume_id: str):
        """Check whether the page for this resume was already downloaded."""
        return os.path.exists(os.path.join(self.data_path, f'resume_page_{resume_id}.html'))

    def get_page_url(self, resume_id: str):
        """Insert the unique resume id into the resume URL template."""
        return self.page_url.format(resume_id)

    def download_pages(self, id_list_path: str = 'id_list.txt'):
        """
        Download the resumes listed in ``id_list_path`` (one id per line).

        Already saved resumes and blank lines are skipped. The run stops once
        ``self.number`` pages have been downloaded.

        :param id_list_path: text file with resume ids, defaults to
            ``id_list.txt`` in the current working directory
        """
        downloaded = 0
        with open(id_list_path, 'r') as id_list:
            for line in id_list:
                resume_id = line.strip()
                if not resume_id:
                    # A blank line would otherwise request the bare template
                    # URL and be saved as 'resume_page_.html'.
                    continue
                if self.check_if_exists(resume_id):
                    print(f"File {'resume_page_{0}.html'.format(resume_id)} already exists. Skip")
                    continue
                print(f"Start downloading {resume_id} page")
                page = self.download_page(self.get_page_url(resume_id))
                print(f"Downloaded {resume_id} page")
                self.save_page(page, resume_id)
                print(f"Saved {resume_id} page")
                print("*" * 20)
                downloaded += 1
                if self.number == downloaded:
                    break
                # Pause only when another request may follow.
                time.sleep(self.timeout)

    def download_page(self, url: str):
        """Request ``url`` with the configured headers and proxies."""
        page = requests.get(url, headers=self.headers, proxies=self.proxy)
        return page

    def save_page(self, page, resume_id: str):
        """Save the page text into the data directory."""
        page_file_name = os.path.join(self.data_path, 'resume_page_{0}.html'.format(resume_id))
        with open(page_file_name, 'w') as resume_request:
            resume_request.write(page.text)


if __name__ == "__main__":
    # Download up to 2500 resumes from the id list, pausing 5 seconds between requests.
    hh_resume_downloader = HHResumeDownloader(
        page_url="https://hh.ru/resume/{0}",
        data_path="/content/drive/MyDrive/saved_resumes/",
        number=2500,
        timeout=5,
    )
    hh_resume_downloader.download_pages()

File resume_page_2324b9be0004f55e4c0039ed1f61387565664e.html already exists. Skip
File resume_page_c776e49000090025c90039ed1f4c4374593469.html already exists. Skip
File resume_page_2040886000082acbf30039ed1f696d51455a56.html already exists. Skip
File resume_page_f99bfe92000930d3390039ed1f3435716e6c39.html already exists. Skip
File resume_page_1cf286260008799ed00039ed1f4e734a553858.html already exists. Skip
File resume_page_4d00443b00059949e30039ed1f61304f51786b.html already exists. Skip
File resume_page_16c6569500087c34930039ed1f4d616b303762.html already exists. Skip
File resume_page_59a2d49b0002ac9a650039ed1f657771484465.html already exists. Skip
File resume_page_3f9ababf0007f73bb00039ed1f6d416544696b.html already exists. Skip
File resume_page_71a239680007ebba400039ed1f4d42374b3673.html already exists. Skip
File resume_page_b5e794860008d2a6740039ed1f45596c514970.html already exists. Skip
File resume_page_75b36de800055a378c0039ed1f324e49597564.html already exists. Skip
File resume_page

In [None]:
from bs4 import BeautifulSoup
import os
import re
#from googletrans import Translator
import json
import pandas as pd


class Resume:
    """Extract resume information from a parsed resume HTML page.

    ``soup`` is expected to offer the ``find`` / ``find_all`` interface used
    below (the callers in this file pass a BeautifulSoup object). All fields
    are extracted once, in ``__init__``.
    """

    # Placeholder stored for every field that is absent from the page.
    _NO_INFO = "No information"

    # NOTE: optional translation of Russian text through googletrans was
    # already disabled in this class and is not implemented here.

    def __init__(self, soup, resume_id):
        self.soup = soup
        self.resume_id = resume_id
        self.resume_title = self.extract_title()
        self.city = self.extract_information('span', {'data-qa': "resume-personal-address"})
        self.age = self.extract_age()
        self.gender = self.extract_gender
        self.area = self.extract_area()
        self.desired_wage = self.extract_wage()
        self.work_exp = self.extract_work_experience()
        self.education = self.extract_nested_information('div', {'data-qa': "resume-block-education"})
        self.language_prof = self.extract_languages()
        self.skills = self.extract_information('span', {'class': "bloko-tag__section bloko-tag__section_text"})
        self.university = self.extract_university('a', {'class': "bloko-link bloko-link_kind-tertiary"})
        self.faculty = self.extract_information('div', {'data-qa': "resume-block-education-organization"})
        self.work_exp_descr = self.extract_work_exp_descr()

    def __repr__(self):
        # Build the dictionary on demand: no attribute caches it.
        return f'{self.resume_dict_maker()}'

    def _first_text(self, tag: str, attributes: dict):
        """Return the text of the first matching element or the placeholder."""
        info = self.extract_information(tag, attributes)
        if isinstance(info, list):
            return info[0]
        return info

    def extract_title(self):
        """Return the part of the position title that precedes the first comma."""
        title = self._first_text('span', {'data-qa': "resume-block-title-position"})
        return title.split(',')[0]

    def extract_age(self):
        """Return the first number of the age text (as a string of digits), or 0."""
        raw_age = self._first_text('span', {'data-qa': "resume-personal-age"})
        if raw_age == self._NO_INFO:
            return 0
        numbers = re.findall(r'\d+', raw_age)
        # The digits stay a string, exactly as they were returned before.
        return numbers[0] if numbers else 0

    def extract_work_experience(self):
        """Return the total work experience in years, rounded to 2 decimals.

        Two numbers in the experience title are read as years and months.
        Returns 0 when the block or the numbers are missing.
        """
        work_exp = self.extract_nested_information('div', {'data-qa': "resume-block-experience"})
        if work_exp == self._NO_INFO:
            return 0
        numbers = [int(number) for number in re.findall(r'\d+', work_exp)]
        if not numbers:
            return 0
        if len(numbers) == 1:
            # NOTE(review): a single number is treated as months, as before;
            # confirm that a "years only" title cannot occur.
            return round(numbers[0] / 12, 2)
        years, months = numbers[:2]
        return round(years + months / 12, 2)

    def extract_area(self):
        """Return the specialization(s) stated in the resume."""
        area = self.extract_information('li', {'class': "resume-block__specialization", 'data-qa': 'resume-block-position-specialization'})
        return area

    def extract_wage(self):
        """Return the desired salary as an int built from all digits of the text, or 0."""
        wage = self._first_text('span', {'data-qa': "resume-block-salary"})
        if wage == self._NO_INFO:
            return 0
        digits = ''.join(re.findall(r'\d', wage))
        return int(digits) if digits else 0

    def extract_languages(self):
        """Return a mapping from language to proficiency level."""
        languages = self.extract_information('p', {'data-qa': "resume-block-language-item"}, to_list=False)
        if isinstance(languages, str):  # If only one or no language stated in resume
            languages = {"Russian": "Native"}
        return languages

    @property
    def extract_gender(self):
        """Return the gender text; English variants are mapped to Russian."""
        gender = self.extract_information('span', {'data-qa': "resume-personal-gender"})
        if gender == 'The man' or gender == 'Male':  # If translation inaccurate
            gender = 'Мужчина'
        elif gender == 'Woman' or gender == 'Female':
            gender = 'Женщина'
        return gender

    def extract_university(self, tag: str, attributes: dict):
        """Return the distinct texts of links whose href contains a ``university=`` value."""
        names = []
        for link in self.soup.find_all(tag, attributes):
            # Links without href are ignored instead of raising KeyError.
            if re.search(r'university=(.+)', link.get('href', '')):
                name = link.get_text()
                if name not in names:
                    names.append(name)
        return names if names else self._NO_INFO

    def extract_work_exp_descr(self):
        """Return ``{employer: [position, description]}`` for the work history.

        Employers, positions and descriptions are paired by their order on
        the page; pairing stops at the shortest of the three lists.
        """
        employers = [
            link.get_text()
            for link in self.soup.find_all('a', {'class': "bloko-link bloko-link_kind-tertiary"})
            if re.search(r'employer', link.get('href', ''))
        ]
        positions = [
            node.get_text()
            for node in self.soup.find_all('div', {'data-qa': "resume-block-experience-position", 'class': "bloko-text bloko-text_strong"})
        ]
        descriptions = [
            node.get_text()
            for node in self.soup.find_all('div', {'data-qa': "resume-block-experience-description"})
        ]
        res = {
            employer: [position, description]
            for employer, position, description in zip(employers, positions, descriptions)
        }
        return res if res else self._NO_INFO

    def extract_information(self, tag: str, attributes: dict, to_list=True):
        """Return the text of all elements matching ``tag`` and ``attributes``.

        :return: the placeholder string when nothing matches; a plain string
            for exactly one match; otherwise a list of texts, or, when
            ``to_list`` is false, a dict that maps the first word of every
            text to its third word
        """
        texts = [node.get_text() for node in self.soup.find_all(tag, attributes)]
        if not texts:
            return self._NO_INFO
        if len(texts) == 1:
            return texts[0]
        if to_list:
            return texts
        dict_info = {}
        for text in texts:
            words = text.split()
            # Texts with fewer than three words cannot be mapped; skip them.
            if len(words) >= 3:
                dict_info[words[0]] = words[2]
        return dict_info

    def extract_nested_information(self, tag: str, attributes: dict, to_list=True):
        """Return the sub-title text of the first block matching ``tag`` and ``attributes``.

        ``to_list`` is accepted for signature compatibility and is not used.
        """
        block = self.soup.find(tag, attributes)
        if block is None:
            return self._NO_INFO
        title = block.find('span', {'class': "resume-block__title-text resume-block__title-text_sub"})
        if title is None:
            return self._NO_INFO
        return title.get_text()

    def resume_dict_maker(self):
        """Return all extracted fields as a dictionary."""
        resume_dict = {'id': self.resume_id,
                       'title': self.resume_title,
                       'city': self.city,
                       'age': self.age,
                       'gender': self.gender,
                       'area': self.area,
                       'desired_wage': self.desired_wage,
                       'work_experience': self.work_exp,
                       'work_exp_descr': self.work_exp_descr,
                       'education_level': self.education,
                       'university': self.university,
                       'faculty': self.faculty,
                       'languages': self.language_prof,
                       'skills': self.skills
                       }
        return resume_dict

In [None]:
class ResumeGetter:
    """Parse the downloaded resume pages and save each one as a JSON file."""
    def __init__(self,
                 dir_path: str = "/content/drive/MyDrive/saved_resumes/",
                 json_dir: str = '/content/drive/MyDrive/new_json_resumes/'):
        """
        :param dir_path: directory with the downloaded ``resume_page_<id>.html`` files
        :param json_dir: directory where the ``<id>.json`` files are written
        """
        self.dir_path = dir_path
        self.json_dir = json_dir
        # The directory listing is taken once, at construction time.
        self.resume_storage = os.listdir(path=self.dir_path)
        self.resume_dict_storage = []

    def get_resume(self):
        """Parse every page that has no JSON file yet.

        :return: ``self.resume_dict_storage``, extended with the dictionaries
            parsed during this call; resumes skipped because their JSON file
            already exists are not added
        """
        for resume in self.resume_storage:
            # Take the id from the file name; entries that are not resume
            # pages are ignored.
            id_match = re.search(r"page_(.+)\.html", resume)
            if id_match is None:
                continue
            resume_id = id_match.group(1)
            # Check before opening, so that skipped pages are not parsed.
            if self.check_if_exists(resume_id):
                print(f"File {f'{resume_id}.json'} already exists. Skip")
                continue
            print(f'Start parce {resume_id}')
            with open(os.path.join(self.dir_path, resume), "r") as resume_page:
                resume_text = BeautifulSoup(resume_page, features="lxml")
            dict_format = Resume(resume_text, resume_id).resume_dict_maker()
            self.resume_dict_storage.append(dict_format)
            self.get_json_resume(dict_format, resume_id)
            print(f'Resume {resume_id} saved in json file')
            print('*' * 20)
        return self.resume_dict_storage

    def check_if_exists(self, res_id: str):
        """Check whether the JSON file for this resume already exists."""
        return os.path.exists(os.path.join(self.json_dir, f'{res_id}.json'))

    def get_json_resume(self, resume_dict: dict, r_id: str):
        """Save the resume into a separate JSON file."""
        with open(os.path.join(self.json_dir, f'{r_id}.json'), 'w+') as json_file:
            json.dump(resume_dict, json_file, indent=4)


In [None]:
if __name__ == '__main__':
    # Parse every downloaded resume page that has no JSON file yet.
    data = ResumeGetter()
    # Kept as a module-level name: the cells below build a DataFrame from it.
    all_resumes = data.get_resume()

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Resume 49465e380001f869560039ed1f7a6b34563730 saved in json file
********************
Start parce 764142e90004f005fb0039ed1f5a51727a3043
Resume 764142e90004f005fb0039ed1f5a51727a3043 saved in json file
********************
Start parce 395ea35b0004f1f85a0039ed1f36524a354941
Resume 395ea35b0004f1f85a0039ed1f36524a354941 saved in json file
********************
Start parce 90b24032000316f07a0039ed1f635044625367
Resume 90b24032000316f07a0039ed1f635044625367 saved in json file
********************
Start parce 99750348000264b3990039ed1f7a34736d4e36
Resume 99750348000264b3990039ed1f7a34736d4e36 saved in json file
********************
Start parce 385ec66100052df8470039ed1f48634b56716c
Resume 385ec66100052df8470039ed1f48634b56716c saved in json file
********************
Start parce ab58a700000278a2fa0039ed1f465071706178
Resume ab58a700000278a2fa0039ed1f465071706178 saved in json file
********************
Start parc

In [None]:
# Collect the parsed resumes into a table and export it as a UTF-8 CSV file.
df = pd.DataFrame(all_resumes)
df.to_csv(
    '/content/drive/MyDrive/Дипломная работа/Данные из резюме.csv',
    sep=',',
    encoding='utf-8',
)

In [None]:
# Rebuild the table from the parsed resumes and store it as a pickle file.
df = pd.DataFrame(all_resumes)
df.to_pickle(
    "/content/drive/MyDrive/Дипломная работа/Данные резюме.pkl"
)