In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd

def get_source_html(url, output_file='output_reviews_all_.csv'):
    """Scrape every vacancy linked from the listing at *url* into a CSV file.

    The listing pages are opened first to collect the vacancy links, then each
    vacancy page is visited and its data is appended to *output_file*.

    Args:
        url: Listing URL. It must already contain a query string, because the
            page number is appended to it as ``&p=<n>``.
        output_file: Path of the CSV file the vacancies are appended to.
    """
    service = ChromeService('/Users/pavelesipenok/Downloads/chromedriver-mac-arm64_3/chromedriver')
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # A set, so a vacancy linked more than once is processed only once.
        all_vacancy_links = set()

        # First pass: collect vacancy links from every listing page.
        for page_number in range(1, 2):
            # Build the page URL from the argument (not from a module-level
            # global) and keep `url` itself untouched between iterations.
            page_url = f'{url}&p={page_number}'
            driver.get(page_url)
            time.sleep(1)

            vacancy_links = driver.find_elements(By.CLASS_NAME, 'iva-item-sliderLink-uLz1v')
            hrefs = (link.get_attribute('href') for link in vacancy_links)
            # get_attribute() returns None when the attribute is absent;
            # such entries cannot be opened later, so drop them here.
            all_vacancy_links.update(href for href in hrefs if href)

        # Second pass: open each vacancy and persist whatever was extracted.
        for vacancy_url in all_vacancy_links:
            print(f"Processing vacancy: {vacancy_url}")

            vacancy_data = get_vacancy_data(driver, vacancy_url)

            # Skip vacancies for which nothing was extracted.
            if vacancy_data:
                save_vacancy_data(vacancy_data, output_file)

    finally:
        # Always shut the browser down, even if scraping failed midway.
        driver.quit()

def get_vacancy_data(driver, vacancy_url):
    """Open a vacancy page and collect its fields into a dictionary.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        vacancy_url: URL of the vacancy page to open.

    Returns:
        A dict with the keys ``Title``, ``Description``, ``Vacancy_URL``,
        ``Ad_Number``, ``Address``, ``Ad_Date`` and ``Salary``, plus one key
        per parameter parsed from the page's parameter list. Fields whose
        element is missing from the page are ``None``.
    """
    driver.get(vacancy_url)
    time.sleep(1)

    # Vacancy title.
    title = extract_text(driver, 'styles-module-root-TWVKW')

    # "Name: value" parameters of the vacancy.
    params_list = driver.find_elements(By.CLASS_NAME, 'params-paramsList__item-appQw')
    params_dict = extract_params(params_list)

    # Vacancy description.
    description = extract_description(driver)

    # Address.
    address = extract_address(driver)

    # Ad number and publication date from the page footer.
    # find_elements() returns an empty list instead of raising when nothing
    # matches, so a missing footer yields None values like the other fields.
    # NOTE(review): extract_ad_info is not defined in this part of the file;
    # it is expected to return an (ad_number, ad_date) pair.
    footer_elements = driver.find_elements(By.CLASS_NAME, 'style-item-footer-text-LEjEe')
    if footer_elements:
        ad_number, ad_date = extract_ad_info(footer_elements[0].text)
    else:
        ad_number, ad_date = None, None

    # Salary; a vacancy without a salary element is stored with None.
    salary_elements = driver.find_elements(By.CLASS_NAME, 'styles-module-size_xxxl-A2qfi')
    salary = extract_salary(salary_elements[0]) if salary_elements else None

    # Assemble the record for this vacancy.
    return {
        'Title': title,
        **params_dict,
        'Description': description,
        'Vacancy_URL': vacancy_url,
        'Ad_Number': ad_number,
        'Address': address,
        'Ad_Date': ad_date,
        'Salary': salary
    }

def extract_salary(salary_element):
    """Return the integer salary stored in the element's ``content`` attribute.

    Returns ``None`` when the attribute is missing or empty, or when reading
    or converting it fails (the error is printed).
    """
    try:
        raw_content = salary_element.get_attribute('content')
        if not raw_content:
            return None
        return int(raw_content)
    except Exception as e:
        print(f"Error extracting salary: {e}")
        return None


def extract_text(driver, class_name):
    """Return the text of the first element with the given CSS class.

    Any failure during the lookup is printed and reported as ``None``.
    """
    try:
        return driver.find_element(By.CLASS_NAME, class_name).text
    except Exception as e:
        print(f"Error extracting text from element with class '{class_name}': {e}")
        return None

def extract_params(params_list):
    """Turn elements whose text is ``"name: value"`` into a ``{name: value}`` dict.

    The text is split on the first colon only and both parts are stripped.
    Entries without a colon are reported via ``print`` and skipped.
    """
    parsed = {}
    for item in params_list:
        param_text = item.text
        name, colon, value = param_text.partition(':')
        if not colon:
            # No separator at all: nothing to store for this entry.
            print(f"Error parsing parameter: {param_text}")
            continue
        parsed[name.strip()] = value.strip()
    return parsed

def extract_description(driver):
    """Return the vacancy description as plain text.

    The description element's inner HTML is parsed and flattened to text with
    newline separators. Any failure is printed and reported as ``None``.
    """
    try:
        container = driver.find_element(By.CLASS_NAME, 'style-item-description-html-qCwUL')
        markup = container.get_attribute('innerHTML')

        # Let BeautifulSoup strip the tags, keeping a line break between nodes.
        plain_text = BeautifulSoup(markup, 'html.parser').get_text(separator='\n')

        return plain_text.strip()
    except Exception as e:
        print(f"Error extracting description: {e}")
        return None

def extract_address(driver):
    """Return the stripped text of the address element.

    Any failure during the lookup is printed and reported as ``None``.
    """
    try:
        address_xpath = "//div[@itemprop='address']//span[@class='style-item-address__string-wt61A']"
        return driver.find_element(By.XPATH, address_xpath).text.strip()
    except Exception as e:
        print(f"Error extracting address: {e}")
        return None

def save_vacancy_data(vacancy_data, output_file):
    """Append one vacancy record to a CSV file with a fixed set of columns.

    Args:
        vacancy_data: Dict of scraped fields. Keys that are not among the
            predefined columns are dropped; missing columns are left empty.
        output_file: Path of the CSV file. It is created if it does not exist.
    """
    import os  # local import: the file's import block does not provide `os`

    # Fixed column layout, so that every appended row lines up with the header.
    columns = ['Title', 'Сфера деятельности', 'График работы', 'Смены',
               'Частота выплат', 'Опыт работы',
               'В том числе для кандидатов', 'Что получают работники',
               'Description', 'Address',
               'Vacancy_URL', 'Ad_Number', 'Ad_Date', 'Salary']

    df = pd.DataFrame([vacancy_data], columns=columns)

    # Write the header only when the file is new or still empty; otherwise
    # the rows would either lack a header or get one repeated in the middle.
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    # Append mode also creates the file when it does not exist yet.
    df.to_csv(output_file, mode='a', index=False, header=needs_header)

    print(f"Data saved to {output_file}")

if __name__ == '__main__':
    # Listing of one Avito seller's vacancies; get_source_html appends the
    # page number to this URL as an extra `&p=<n>` query parameter.
    base_url = 'https://www.avito.ru/brands/i38660146/all/vakansii?sellerId=5eadab55f289ba2bf6228f886022243b'
    get_source_html(base_url)


Processing vacancy: https://www.avito.ru/moskva/vakansii/prodavets-konsultant_tts_avenyu_3343943786
Data saved to output_reviews_all_.csv
Processing vacancy: https://www.avito.ru/vyazniki/vakansii/voditel_pogruzchika_3727950936
Data saved to output_reviews_all_.csv
Processing vacancy: https://www.avito.ru/kovrov/vakansii/slesar-remontnik_3728402966
Data saved to output_reviews_all_.csv
Processing vacancy: https://www.avito.ru/vladimir/vakansii/voditel_pogruzchika_s_obucheniem_3696492740
Data saved to output_reviews_all_.csv
Processing vacancy: https://www.avito.ru/moskva/vakansii/prodavets-konsultant_tts_rumer_3343868240
Error extracting text from element with class 'styles-module-root-TWVKW': Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=120.0.6099.216)
Stacktrace:
0   chromedriver                        0x0000000100b044cc chromedriver + 4162764
1   chromedriver                        0x0000000100afc654 chromedrive

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=120.0.6099.216)
Stacktrace:
0   chromedriver                        0x0000000100b044cc chromedriver + 4162764
1   chromedriver                        0x0000000100afc654 chromedriver + 4130388
2   chromedriver                        0x0000000100753bc0 chromedriver + 293824
3   chromedriver                        0x000000010072c6c0 chromedriver + 132800
4   chromedriver                        0x00000001007c525c chromedriver + 758364
5   chromedriver                        0x00000001007d9294 chromedriver + 840340
6   chromedriver                        0x000000010078d6bc chromedriver + 530108
7   chromedriver                        0x000000010078e930 chromedriver + 534832
8   chromedriver                        0x0000000100ac9df8 chromedriver + 3923448
9   chromedriver                        0x0000000100ace3cc chromedriver + 3941324
10  chromedriver                        0x0000000100ab2028 chromedriver + 3825704
11  chromedriver                        0x0000000100acef2c chromedriver + 3944236
12  chromedriver                        0x0000000100aa46e4 chromedriver + 3770084
13  chromedriver                        0x0000000100aeb970 chromedriver + 4061552
14  chromedriver                        0x0000000100aebae8 chromedriver + 4061928
15  chromedriver                        0x0000000100afc2d4 chromedriver + 4129492
16  libsystem_pthread.dylib             0x00000001ac05bfa8 _pthread_start + 148
17  libsystem_pthread.dylib             0x00000001ac056da0 thread_start + 8
