In [2]:
import time
import csv
import re
import random
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException



In [3]:
# Notebook-wide logging: INFO and above, each record prefixed with a timestamp.
_log_settings = {
    'level': logging.INFO,
    'format': '[%(asctime)s] %(levelname)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_log_settings)

In [4]:
def setup_driver(headless=True):
    """Build a Chrome WebDriver whose chromedriver binary is provided by webdriver_manager.

    Args:
        headless: When true, the ``--headless`` flag is added before the
            sandbox/shared-memory flags.

    Returns:
        The ``webdriver.Chrome`` instance created with those options.
    """
    chrome_flags = ['--no-sandbox', '--disable-dev-shm-usage']
    if headless:
        # Keep the flag order of the original: headless first.
        chrome_flags.insert(0, '--headless')

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=options)

In [5]:
def save_to_csv(filename, data, headers):
    """Write ``headers`` followed by every row of ``data`` to a UTF-8 CSV file.

    The file is opened in write mode, so existing content is replaced.
    The number of rows written is logged at INFO level.

    Args:
        filename: Path of the CSV file to create.
        data: Sized collection of rows (each row an iterable of cell values).
        headers: Iterable of column titles written as the first line.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(headers)
        for record in data:
            sheet.writerow(record)
    logging.info(f'Сохранено {len(data)} записей в "{filename}"')

In [6]:
def safe_text(parent, selector):
    """Return the stripped text of the first element under ``parent`` matching ``selector``.

    Args:
        parent: Object exposing ``find_element`` (a driver or an element).
        selector: CSS selector to look up.

    Returns:
        The whitespace-stripped text, or ``None`` when the lookup raises
        ``NoSuchElementException``.
    """
    text = None
    try:
        text = parent.find_element(By.CSS_SELECTOR, selector).text.strip()
    except NoSuchElementException:
        pass
    return text

In [7]:
def get_element_text(elements, index):
    """Return the stripped ``.text`` of ``elements[index]``, or ``''`` on ``IndexError``.

    Args:
        elements: Indexable collection of objects that have a ``text`` attribute.
        index: Position to read.
    """
    try:
        element = elements[index]
        raw = element.text
        return raw.strip()
    except IndexError:
        return ''

In [8]:
def is_captcha_page(driver):
    """Tell whether the current page source contains the phrase "Вы робот?".

    Args:
        driver: Object exposing ``page_source``.

    Returns:
        ``True`` when the phrase is present; ``False`` otherwise, including
        when reading ``page_source`` raises any ``Exception``.
    """
    try:
        html = driver.page_source
        detected = "Вы робот?" in html
    except Exception:
        detected = False
    return detected

In [9]:
def parse_novostroy_pik(driver):
    """Scrape PIK reviews from novostroy-m.ru and save them to ``novostroy_pik.csv``.

    Reads the review list page by page (the loop allows at most four pages)
    and stores one row per review: author name, date text, review text and
    the number of filled (yellow) stars.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroy (ПИК)')
    start = time.time()
    url = 'https://www.novostroy-m.ru/baza/mkr_bobrovo/otzyvy'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []

    for page in range(1, 5):
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lh_24.fs_16')
        names = driver.find_elements(By.CSS_SELECTOR, 'div.fw_m.fs_16.lh_24.mb_4.stroke_crop span')
        dates = driver.find_elements(By.CSS_SELECTOR, 'div.fs_14.lh_20.mute_clr')
        rating_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.pt_4.pb_4.f_s_0')

        # NOTE(review): the four lists are paired purely by position and zip()
        # stops at the shortest one, so an extra or missing match in any
        # selector would shift or drop rows — verify against the page markup.
        for review, name, date, rating_block in zip(reviews, names, dates, rating_blocks):
            stars = rating_block.find_elements(By.CSS_SELECTOR, 'svg[fill="#ffcd00"]')
            results.append([
                name.text.strip(),
                date.text.strip(),
                review.text.strip(),
                len(stars)
            ])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'li a[data-page="{page + 1}"]')
            # Centre the link in the viewport before clicking so other page
            # elements are less likely to sit on top of it.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(2)
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except ElementClickInterceptedException as e:
            # Stop paginating but still reach save_to_csv below, so the rows
            # collected so far are not lost.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break

    save_to_csv('novostroy_pik.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Novostroy (ПИК) (время: {time.time() - start:.2f} сек)')

In [10]:
def parse_forum_zhk_pik(driver):
    """Scrape PIK reviews from forum-zhk.com and save them to ``forum-zhk_pik.csv``.

    Follows the numbered pagination links until no link to the next page
    exists. Each row holds author name, date, review text and the ``value``
    of the review's ``score`` input; fields that are missing are stored as
    ``None``.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Forum-ZHK (ПИК)')
    start = time.time()
    url = 'https://forum-zhk.com/zastroischiki-msk/pik-2'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []
    page = 1

    while True:
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.reviews-item')

        for review in reviews:
            name = safe_text(review, 'span.reviews-item-name')
            date = safe_text(review, 'span.reviews-item-date')
            review_text = safe_text(review, 'div.reviews-item-text')
            try:
                rating = review.find_element(By.CSS_SELECTOR, 'input[name="score"]').get_attribute('value')
            except NoSuchElementException:
                rating = None

            results.append([name, date, review_text, rating])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page="{page + 1}"]')
            # Same scroll-then-click sequence as the MR Group / Donstroy
            # parsers for this site, so the link is in view before the click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Any other failure also ends the crawl, but it is reported so a
            # truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1
        time.sleep(2)

    save_to_csv('forum-zhk_pik.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Forum-ZHK (ПИК) (время: {time.time() - start:.2f} сек)')

In [11]:
def parse_otzovik_pik(driver):
    """Scrape PIK reviews from otzovik.com and save them to ``otzovik_pik.csv``.

    A separate Chrome session without the headless flag is started for this
    site, and the function pauses on ``input()`` when a captcha page is
    detected so it can be solved by hand. Listing pages 1-14 are visited;
    a page whose reviews do not appear within 15 seconds is skipped.

    Args:
        driver: Not used by this parser; accepted so that all parsers share
            the same call signature. It is neither navigated nor closed.
    """
    logging.info('Начало парсинга: Otzovik (ПИК)')
    start = time.time()

    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36")
    # Use a distinct name so the caller's `driver` is not shadowed.
    browser = webdriver.Chrome(options=options)

    base_url = 'https://otzovik.com/reviews/gruppa_kompaniy_pik_russia_moskovskaya_oblast/'
    teaser_locator = (By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]')
    results = []

    try:
        for page in range(1, 15):
            url = f"{base_url}{page}/"
            browser.get(url)

            if is_captcha_page(browser):
                logging.warning(f'Обнаружена капча на странице {url}')
                input("Решите капчу вручную в браузере и нажмите Enter для продолжения...")

            try:
                WebDriverWait(browser, 15).until(EC.presence_of_element_located(teaser_locator))
            except TimeoutException:
                # Only the wait timing out means "nothing loaded"; Ctrl-C and
                # other errors are no longer swallowed here.
                logging.warning(f"Отзывы не загрузились на странице {url}")
                continue

            reviews = browser.find_elements(*teaser_locator)
            names = browser.find_elements(By.CSS_SELECTOR, 'div.user-info span[itemprop="name"]')
            dates = browser.find_elements(By.CSS_SELECTOR, 'div.review-postdate[itemprop="datePublished"]')
            ratings = browser.find_elements(By.CSS_SELECTOR, 'span[itemprop="reviewRating"] meta[itemprop="ratingValue"]')
            advantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-plus')
            disadvantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-minus')

            # NOTE(review): fields are matched to reviews by list position only;
            # a review lacking one of them would shift the later rows — verify.
            for i, review in enumerate(reviews):
                full_review = f"{review.text.strip()}\nДостоинства: {get_element_text(advantages, i)}\nНедостатки: {get_element_text(disadvantages, i)}"
                # get_attribute returns None when the attribute is absent.
                date = (dates[i].get_attribute('content') or '').strip() if i < len(dates) else ''
                rating = (ratings[i].get_attribute('content') or '').strip() if i < len(ratings) else ''
                results.append([
                    get_element_text(names, i),
                    date,
                    full_review.strip(),
                    rating
                ])

            # Randomised pause between page loads.
            time.sleep(random.uniform(5, 8))

    finally:
        browser.quit()

    save_to_csv('otzovik_pik.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Otzovik (ПИК) (время: {time.time() - start:.2f} сек)')

In [12]:
def parse_novostroev_pik(driver):
    """Scrape PIK reviews from novostroev.ru and save them to ``novostroev_pik.csv``.

    Follows the pagination links until no link to the next page exists.
    Each row holds author name, meta/date text, review text and a star
    count derived from the ``width: N%`` style of the ``.stars__full``
    element (``N // 20``), or ``None`` when that cannot be determined.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroev (ПИК)')
    start = time.time()
    url = 'https://novostroev.ru/zastroyshchiki/gruppa-kompaniy-pik/otzyvy/'
    driver.get(url)
    results = []
    page = 1

    while True:
        # Fixed pause so the current page can finish rendering.
        time.sleep(3)
        reviews = driver.find_elements(By.CSS_SELECTOR, '.reviews-list__item')

        for review in reviews:
            name = safe_text(review, '.review-n-2__name')
            date = safe_text(review, '.review-n-2__meta')
            review_text = safe_text(review, '.review-n-2__text')
            stars = None

            try:
                # get_attribute returns None when there is no style attribute.
                style = review.find_element(By.CSS_SELECTOR, '.stars__full').get_attribute("style") or ''
                match = re.search(r'width:\s*(\d+)%', style)
                if match:
                    stars = int(match.group(1)) // 20
            except NoSuchElementException:
                # The review has no star element; keep the rating empty.
                pass
            except Exception as e:
                logging.warning(f'Не удалось определить оценку отзыва: {e}')

            results.append([name, date, review_text, stars])

        try:
            # NOTE(review): `href*=` is a substring match, so "page=2" also
            # matches e.g. "page=20"; this relies on the wanted link coming
            # first in the document — confirm against the page markup.
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[href*="page={page + 1}"]')
            # Same scroll-then-click sequence as the MR Group parser for this site.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Report the failure so a truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1

    save_to_csv('novostroev_pik.csv', results, ['Name', 'Date', 'Review', 'Stars'])
    logging.info(f'Завершено: Novostroev (ПИК) (время: {time.time() - start:.2f} сек)')

In [13]:
def parse_novostroy_samolet(driver):
    """Scrape Samolet reviews from novostroy-m.ru and save them to ``novostroy_samolet.csv``.

    Visits up to 43 pages. A page on which any of the four element lists is
    empty ends the crawl, as does a "next" arrow that cannot be found or
    clicked. Each row holds author name, the ``content`` of the date meta
    tag, review text and the number of filled stars.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info("Начало парсинга: Novostroy (Самолет)")
    start = time.time()

    driver.get("https://www.novostroy-m.ru/kompanii/samolet_development/otzyvy")
    rows = []

    loader_locator = (By.CSS_SELECTOR, 'div.js-comments.ajax-loader')
    arrow_locator = (By.CSS_SELECTOR, 'li.next .paginator_arrow_box')

    for page in range(1, 44):
        driver.implicitly_wait(10)

        bodies = driver.find_elements(By.CSS_SELECTOR, 'div.lh_24.fs_16[itemprop="reviewBody"]')
        authors = driver.find_elements(By.CSS_SELECTOR, 'span[itemprop="name"]')
        stamps = driver.find_elements(By.CSS_SELECTOR, 'meta[itemprop="datePublished"]')
        star_boxes = driver.find_elements(By.CSS_SELECTOR, 'div.pt_4.pb_4.f_s_0')

        # Every list must be non-empty for the page to count as loaded.
        if not all((bodies, authors, stamps, star_boxes)):
            logging.warning(f"На странице {page} не найдены все элементы.")
            break

        for body, author, stamp, star_box in zip(bodies, authors, stamps, star_boxes):
            try:
                lit = star_box.find_elements(By.CSS_SELECTOR, 'svg.w_16.fl_l.d_b[fill="#ffcd00"]')
                row = [
                    author.text.strip(),
                    stamp.get_attribute("content").strip(),
                    body.text.strip(),
                    len(lit),
                ]
            except Exception as e:
                # A single unreadable review is reported and skipped.
                logging.warning(f"Ошибка при обработке одного из отзывов: {e}")
                continue
            rows.append(row)

        try:
            # Wait for the AJAX loader to disappear, then for the arrow to be clickable.
            WebDriverWait(driver, 10).until(EC.invisibility_of_element_located(loader_locator))
            arrow = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(arrow_locator))

            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", arrow)
            time.sleep(1)
            # The click is issued through JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", arrow)
            time.sleep(2)

        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException) as e:
            logging.warning(f'Кнопка "вперёд" не найдена или не кликабельна на странице {page}: {e}')
            break

    save_to_csv("novostroy_samolet.csv", rows, ["Name", "Date", "Review", "Rating"])
    logging.info(f"Завершено: Novostroy (Самолет) — {len(rows)} отзывов за {time.time() - start:.2f} сек.")

In [14]:
def parse_forum_zhk_samolet(driver):
    """Scrape Samolet reviews from forum-zhk.com and save them to ``forum-zhk_samolet.csv``.

    Follows the numbered pagination links until no link to the next page
    exists. Each row holds author name, date, review text and the ``value``
    of the review's ``score`` input; fields that are missing are stored as
    ``None``.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Forum-ZHK (Самолет)')
    start = time.time()
    url = 'https://forum-zhk.com/zastroischiki-msk/samolet-development'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []
    page = 1

    while True:
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.reviews-item')

        for review in reviews:
            name = safe_text(review, 'span.reviews-item-name')
            date = safe_text(review, 'span.reviews-item-date')
            review_text = safe_text(review, 'div.reviews-item-text')
            try:
                rating = review.find_element(By.CSS_SELECTOR, 'input[name="score"]').get_attribute('value')
            except NoSuchElementException:
                rating = None

            results.append([name, date, review_text, rating])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page="{page + 1}"]')
            # Same scroll-then-click sequence as the MR Group / Donstroy
            # parsers for this site, so the link is in view before the click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Any other failure also ends the crawl, but it is reported so a
            # truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1
        time.sleep(2)

    save_to_csv('forum-zhk_samolet.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Forum-ZHK (Самолет) (время: {time.time() - start:.2f} сек)')

In [15]:
def parse_otzovik_samolet(driver):
    """Scrape Samolet reviews from otzovik.com and save them to ``otzovik_samolet.csv``.

    A separate Chrome session without the headless flag is started for this
    site, and the function pauses on ``input()`` when a captcha page is
    detected so it can be solved by hand. Listing pages 1-8 are requested
    through the ``page`` query parameter; a page whose reviews do not appear
    within 15 seconds is skipped.

    Args:
        driver: Not used by this parser; accepted so that all parsers share
            the same call signature. It is neither navigated nor closed.
    """
    logging.info('Начало парсинга: Otzovik (Самолет)')
    start = time.time()

    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36")
    # Use a distinct name so the caller's `driver` is not shadowed.
    browser = webdriver.Chrome(options=options)

    # NOTE(review): the hard-coded `capt4a` value looks like a token copied
    # from one browsing session — confirm it is still required and valid.
    base_url = 'https://otzovik.com/reviews/kompaniya_samolet_development_russia_moscow/?&capt4a=5731746187449168'
    teaser_locator = (By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]')
    results = []

    try:
        for page in range(1, 9):
            url = f"{base_url}&page={page}"
            browser.get(url)

            if is_captcha_page(browser):
                logging.warning(f'Обнаружена капча на странице {url}')
                input("Решите капчу вручную в браузере и нажмите Enter для продолжения...")

            try:
                WebDriverWait(browser, 15).until(EC.presence_of_element_located(teaser_locator))
            except TimeoutException:
                # Only the wait timing out means "nothing loaded"; Ctrl-C and
                # other errors are no longer swallowed here.
                logging.warning(f"Отзывы не загрузились на странице {url}")
                continue

            reviews = browser.find_elements(*teaser_locator)
            names = browser.find_elements(By.CSS_SELECTOR, 'div.user-info span[itemprop="name"]')
            dates = browser.find_elements(By.CSS_SELECTOR, 'div.review-postdate[itemprop="datePublished"]')
            ratings = browser.find_elements(By.CSS_SELECTOR, 'span[itemprop="reviewRating"] meta[itemprop="ratingValue"]')
            advantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-plus')
            disadvantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-minus')

            # NOTE(review): fields are matched to reviews by list position only;
            # a review lacking one of them would shift the later rows — verify.
            for i, review in enumerate(reviews):
                full_review = f"{review.text.strip()}\nДостоинства: {get_element_text(advantages, i)}\nНедостатки: {get_element_text(disadvantages, i)}"
                # get_attribute returns None when the attribute is absent.
                date = (dates[i].get_attribute('content') or '').strip() if i < len(dates) else ''
                rating = (ratings[i].get_attribute('content') or '').strip() if i < len(ratings) else ''
                results.append([
                    get_element_text(names, i),
                    date,
                    full_review.strip(),
                    rating
                ])

            # Randomised pause between page loads.
            time.sleep(random.uniform(5, 8))

    finally:
        browser.quit()

    save_to_csv('otzovik_samolet.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Otzovik (Самолет) (время: {time.time() - start:.2f} сек)')

In [16]:
def parse_novostroev_samolet(driver):
    """Scrape Samolet reviews from novostroev.ru and save them to ``novostroev_samolet.csv``.

    Follows the pagination links until no link to the next page exists.
    Each row holds author name, meta/date text, review text and a star
    count derived from the ``width: N%`` style of the ``.stars__full``
    element (``N // 20``), or ``None`` when that cannot be determined.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroev (Самолет)')
    start = time.time()
    url = 'https://novostroev.ru/zastroyshchiki/samolet-development/otzyvy/'
    driver.get(url)
    results = []
    page = 1

    while True:
        # Fixed pause so the current page can finish rendering.
        time.sleep(3)
        reviews = driver.find_elements(By.CSS_SELECTOR, '.reviews-list__item')

        for review in reviews:
            name = safe_text(review, '.review-n-2__name')
            date = safe_text(review, '.review-n-2__meta')
            review_text = safe_text(review, '.review-n-2__text')
            stars = None

            try:
                # get_attribute returns None when there is no style attribute.
                style = review.find_element(By.CSS_SELECTOR, '.stars__full').get_attribute("style") or ''
                match = re.search(r'width:\s*(\d+)%', style)
                if match:
                    stars = int(match.group(1)) // 20
            except NoSuchElementException:
                # The review has no star element; keep the rating empty.
                pass
            except Exception as e:
                logging.warning(f'Не удалось определить оценку отзыва: {e}')

            results.append([name, date, review_text, stars])

        try:
            # NOTE(review): `href*=` is a substring match, so "page=2" also
            # matches e.g. "page=20"; this relies on the wanted link coming
            # first in the document — confirm against the page markup.
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[href*="page={page + 1}"]')
            # Same scroll-then-click sequence as the MR Group parser for this site.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Report the failure so a truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1

    save_to_csv('novostroev_samolet.csv', results, ['Name', 'Date', 'Review', 'Stars'])
    logging.info(f'Завершено: Novostroev (Самолет) (время: {time.time() - start:.2f} сек)')

In [17]:
def parse_novostroy_a101(driver):
    """Scrape A101 reviews from novostroy-m.ru and save them to ``novostroy_a101.csv``.

    Reads the review list page by page (the loop allows at most 13 pages)
    and stores one row per review: author name, the ``content`` of the date
    meta tag, review text and the number of filled (yellow) stars.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroy (А101)')
    start = time.time()
    url = 'https://www.novostroy-m.ru/kompanii/a101_development/otzyvy'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []

    for page in range(1, 14):
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lh_24.fs_16[itemprop="reviewBody"]')
        names = driver.find_elements(By.CSS_SELECTOR, 'span[itemprop="name"]')
        dates = driver.find_elements(By.CSS_SELECTOR, 'meta[itemprop="datePublished"]')
        ratings = driver.find_elements(By.CSS_SELECTOR, 'div.pt_4.pb_4.f_s_0')

        # NOTE(review): the four lists are paired purely by position and zip()
        # stops at the shortest one, so an extra or missing match in any
        # selector would shift or drop rows — verify against the page markup.
        for review, name, date, rating in zip(reviews, names, dates, ratings):
            filled_stars = rating.find_elements(By.CSS_SELECTOR, 'svg.w_16.fl_l.d_b[fill="#ffcd00"]')
            results.append([
                name.text.strip(),
                # get_attribute returns None when the attribute is absent.
                (date.get_attribute('content') or '').strip(),
                review.text.strip(),
                len(filled_stars)
            ])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'li a[data-page="{page + 1}"]')
            # Centre the link in the viewport before clicking so other page
            # elements are less likely to sit on top of it.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(2)
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except ElementClickInterceptedException as e:
            # Stop paginating but still reach save_to_csv below, so the rows
            # collected so far are not lost.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break

    save_to_csv('novostroy_a101.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Novostroy (А101) (время: {time.time() - start:.2f} сек)')

In [18]:
def parse_forum_zhk_a101(driver):
    """Scrape A101 reviews from forum-zhk.com and save them to ``forum-zhk_a101.csv``.

    Follows the numbered pagination links until no link to the next page
    exists. Each row holds author name, date, review text and the ``value``
    of the review's ``score`` input; fields that are missing are stored as
    ``None``.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Forum-ZHK (А101)')
    start = time.time()
    url = 'https://forum-zhk.com/zastroischiki-msk/a101'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []
    page = 1

    while True:
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.reviews-item')

        for review in reviews:
            name = safe_text(review, 'span.reviews-item-name')
            date = safe_text(review, 'span.reviews-item-date')
            review_text = safe_text(review, 'div.reviews-item-text')
            try:
                rating = review.find_element(By.CSS_SELECTOR, 'input[name="score"]').get_attribute('value')
            except NoSuchElementException:
                rating = None

            results.append([name, date, review_text, rating])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page="{page + 1}"]')
            # Same scroll-then-click sequence as the MR Group / Donstroy
            # parsers for this site, so the link is in view before the click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Any other failure also ends the crawl, but it is reported so a
            # truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1
        time.sleep(2)

    save_to_csv('forum-zhk_a101.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Forum-ZHK (А101) (время: {time.time() - start:.2f} сек)')

In [19]:
def parse_otzovik_a101(driver):
    """Scrape A101 reviews from otzovik.com and save them to ``otzovik_a101.csv``.

    A separate Chrome session without the headless flag is started for this
    site, and the function pauses on ``input()`` when a captcha page is
    detected so it can be solved by hand. Listing pages 1-5 are visited;
    a page whose reviews do not appear within 15 seconds is skipped.

    Args:
        driver: Not used by this parser; accepted so that all parsers share
            the same call signature. It is neither navigated nor closed.
    """
    logging.info('Начало парсинга: Otzovik (А101)')
    start = time.time()

    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36")
    # Use a distinct name so the caller's `driver` is not shadowed.
    browser = webdriver.Chrome(options=options)

    base_url = 'https://otzovik.com/reviews/zastroyschik_a_101_russia_moscow/'
    teaser_locator = (By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]')
    results = []

    try:
        for page in range(1, 6):
            url = f"{base_url}{page}/"
            browser.get(url)

            if is_captcha_page(browser):
                logging.warning(f'Обнаружена капча на странице {url}')
                input("Решите капчу вручную в браузере и нажмите Enter для продолжения...")

            try:
                WebDriverWait(browser, 15).until(EC.presence_of_element_located(teaser_locator))
            except TimeoutException:
                # Only the wait timing out means "nothing loaded"; Ctrl-C and
                # other errors are no longer swallowed here.
                logging.warning(f"Отзывы не загрузились на странице {url}")
                continue

            reviews = browser.find_elements(*teaser_locator)
            names = browser.find_elements(By.CSS_SELECTOR, 'div.user-info span[itemprop="name"]')
            dates = browser.find_elements(By.CSS_SELECTOR, 'div.review-postdate[itemprop="datePublished"]')
            ratings = browser.find_elements(By.CSS_SELECTOR, 'span[itemprop="reviewRating"] meta[itemprop="ratingValue"]')
            advantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-plus')
            disadvantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-minus')

            # NOTE(review): fields are matched to reviews by list position only;
            # a review lacking one of them would shift the later rows — verify.
            for i, review in enumerate(reviews):
                full_review = f"{review.text.strip()}\nДостоинства: {get_element_text(advantages, i)}\nНедостатки: {get_element_text(disadvantages, i)}"
                # get_attribute returns None when the attribute is absent.
                date = (dates[i].get_attribute('content') or '').strip() if i < len(dates) else ''
                rating = (ratings[i].get_attribute('content') or '').strip() if i < len(ratings) else ''
                results.append([
                    get_element_text(names, i),
                    date,
                    full_review.strip(),
                    rating
                ])

            # Randomised pause between page loads.
            time.sleep(random.uniform(5, 8))

    finally:
        browser.quit()

    save_to_csv('otzovik_a101.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Otzovik (А101) (время: {time.time() - start:.2f} сек)')

In [20]:
def parse_novostroev_a101(driver):
    """Scrape A101 reviews from novostroev.ru and save them to ``novostroev_a101.csv``.

    Follows the pagination links until no link to the next page exists.
    Each row holds author name, meta/date text, review text and a star
    count derived from the ``width: N%`` style of the ``.stars__full``
    element (``N // 20``), or ``None`` when that cannot be determined.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroev (А101)')
    start = time.time()
    url = 'https://novostroev.ru/zastroyshchiki/a101-development/otzyvy/'
    driver.get(url)
    results = []
    page = 1

    while True:
        # Fixed pause so the current page can finish rendering.
        time.sleep(3)
        reviews = driver.find_elements(By.CSS_SELECTOR, '.reviews-list__item')

        for review in reviews:
            name = safe_text(review, '.review-n-2__name')
            date = safe_text(review, '.review-n-2__meta')
            review_text = safe_text(review, '.review-n-2__text')
            stars = None

            try:
                # get_attribute returns None when there is no style attribute.
                style = review.find_element(By.CSS_SELECTOR, '.stars__full').get_attribute("style") or ''
                match = re.search(r'width:\s*(\d+)%', style)
                if match:
                    stars = int(match.group(1)) // 20
            except NoSuchElementException:
                # The review has no star element; keep the rating empty.
                pass
            except Exception as e:
                logging.warning(f'Не удалось определить оценку отзыва: {e}')

            results.append([name, date, review_text, stars])

        try:
            # NOTE(review): `href*=` is a substring match, so "page=2" also
            # matches e.g. "page=20"; this relies on the wanted link coming
            # first in the document — confirm against the page markup.
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[href*="page={page + 1}"]')
            # Same scroll-then-click sequence as the MR Group parser for this site.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Report the failure so a truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1

    save_to_csv('novostroev_a101.csv', results, ['Name', 'Date', 'Review', 'Stars'])
    logging.info(f'Завершено: Novostroev (А101) (время: {time.time() - start:.2f} сек)')

In [21]:
def parse_novostroy_mr_group(driver):
    """Scrape MR Group reviews from novostroy-m.ru and save them to ``novostroy_mr_group.csv``.

    Reads the review list page by page (the loop allows at most 24 pages)
    and stores one row per review: author name, the ``content`` of the date
    meta tag, review text and the number of filled (yellow) stars.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroy (MR Group)')
    start = time.time()
    url = 'https://www.novostroy-m.ru/kompanii/mr_group/otzyvy'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []

    for page in range(1, 25):
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lh_24.fs_16[itemprop="reviewBody"]')
        names = driver.find_elements(By.CSS_SELECTOR, 'span[itemprop="name"]')
        dates = driver.find_elements(By.CSS_SELECTOR, 'meta[itemprop="datePublished"]')
        ratings = driver.find_elements(By.CSS_SELECTOR, 'div.pt_4.pb_4.f_s_0')

        # NOTE(review): the four lists are paired purely by position and zip()
        # stops at the shortest one, so an extra or missing match in any
        # selector would shift or drop rows — verify against the page markup.
        for review, name, date, rating in zip(reviews, names, dates, ratings):
            filled_stars = rating.find_elements(By.CSS_SELECTOR, 'svg.w_16.fl_l.d_b[fill="#ffcd00"]')
            results.append([
                name.text.strip(),
                # get_attribute returns None when the attribute is absent.
                (date.get_attribute('content') or '').strip(),
                review.text.strip(),
                len(filled_stars)
            ])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'li a[data-page="{page + 1}"]')
            # Centre the link in the viewport before clicking so other page
            # elements are less likely to sit on top of it.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(2)
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except ElementClickInterceptedException as e:
            # Stop paginating but still reach save_to_csv below, so the rows
            # collected so far are not lost.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break

    save_to_csv('novostroy_mr_group.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Novostroy (MR Group) (время: {time.time() - start:.2f} сек)')

In [22]:
def parse_forum_zhk_mr_group(driver):
    """Scrape MR Group reviews from forum-zhk.com and save them to ``forum-zhk_mr_group.csv``.

    Follows the numbered pagination links until no link to the next page
    exists. Each row holds author name, date, review text and the ``value``
    of the review's ``score`` input; fields that are missing are stored as
    ``None``.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Forum-ZHK (MR Group)')
    start = time.time()
    url = 'https://forum-zhk.com/zastroischiki-msk/mr-group'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []
    page = 1

    while True:
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.reviews-item')

        for review in reviews:
            name = safe_text(review, 'span.reviews-item-name')
            date = safe_text(review, 'span.reviews-item-date')
            review_text = safe_text(review, 'div.reviews-item-text')
            try:
                rating = review.find_element(By.CSS_SELECTOR, 'input[name="score"]').get_attribute('value')
            except NoSuchElementException:
                rating = None

            results.append([name, date, review_text, rating])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page="{page + 1}"]')
            # Bring the link into view before the native click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Any other failure also ends the crawl, but it is reported so a
            # truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1
        time.sleep(2)

    save_to_csv('forum-zhk_mr_group.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Forum-ZHK (MR Group) (время: {time.time() - start:.2f} сек)')
    

In [23]:
def parse_otzovik_mr_group(driver):
    """Scrape MR Group reviews from otzovik.com and save them to ``otzovik_mr_group.csv``.

    A separate Chrome session without the headless flag is started for this
    site, and the function pauses on ``input()`` when a captcha page is
    detected so it can be solved by hand. Only the first listing page is
    read.

    Args:
        driver: Not used by this parser; accepted so that all parsers share
            the same call signature. It is neither navigated nor closed.
    """
    logging.info('Начало парсинга: Otzovik (MR Group)')
    start = time.time()

    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36")
    # Use a distinct name so the caller's `driver` is not shadowed.
    browser = webdriver.Chrome(options=options)

    base_url = 'https://otzovik.com/reviews/zastroyschik_mr_group_russia_moscow/'
    teaser_locator = (By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]')
    results = []

    try:
        page = 1
        url = f"{base_url}{page}/"
        browser.get(url)

        if is_captcha_page(browser):
            logging.warning(f'Обнаружена капча на странице {url}')
            input("Решите капчу вручную в браузере и нажмите Enter для продолжения...")

        try:
            WebDriverWait(browser, 15).until(EC.presence_of_element_located(teaser_locator))
        except TimeoutException:
            # Only the wait timing out means "nothing loaded"; Ctrl-C and
            # other errors are no longer swallowed here. Execution continues
            # and the lookups below simply return empty lists.
            logging.warning(f"Отзывы не загрузились на странице {url}")

        reviews = browser.find_elements(*teaser_locator)
        names = browser.find_elements(By.CSS_SELECTOR, 'div.user-info span[itemprop="name"]')
        dates = browser.find_elements(By.CSS_SELECTOR, 'div.review-postdate[itemprop="datePublished"]')
        ratings = browser.find_elements(By.CSS_SELECTOR, 'span[itemprop="reviewRating"] meta[itemprop="ratingValue"]')
        advantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-plus')
        disadvantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-minus')

        # NOTE(review): fields are matched to reviews by list position only;
        # a review lacking one of them would shift the later rows — verify.
        for i, review in enumerate(reviews):
            full_review = f"{review.text.strip()}\nДостоинства: {get_element_text(advantages, i)}\nНедостатки: {get_element_text(disadvantages, i)}"
            # get_attribute returns None when the attribute is absent.
            date = (dates[i].get_attribute('content') or '').strip() if i < len(dates) else ''
            rating = (ratings[i].get_attribute('content') or '').strip() if i < len(ratings) else ''
            results.append([
                get_element_text(names, i),
                date,
                full_review.strip(),
                rating
            ])

        # Randomised pause before the browser is closed.
        time.sleep(random.uniform(5, 8))

    finally:
        browser.quit()

    save_to_csv('otzovik_mr_group.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Otzovik (MR Group) (время: {time.time() - start:.2f} сек)')

In [None]:
def parse_novostroev_mr_group(driver):
    """Scrape MR Group reviews from novostroev.ru and save them to ``novostroev_mr_group.csv``.

    Follows the pagination links until no link to the next page exists.
    Each row holds author name, meta/date text, review text and a star
    count derived from the ``width: N%`` style of the ``.stars__full``
    element (``N // 20``), or ``None`` when that cannot be determined.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroev (MR Group)')
    start = time.time()
    url = 'https://novostroev.ru/zastroyshchiki/mr-group/otzyvy/'
    driver.get(url)
    results = []
    page = 1

    while True:
        # Fixed pause so the current page can finish rendering.
        time.sleep(3)
        reviews = driver.find_elements(By.CSS_SELECTOR, '.reviews-list__item')

        for review in reviews:
            name = safe_text(review, '.review-n-2__name')
            date = safe_text(review, '.review-n-2__meta')
            review_text = safe_text(review, '.review-n-2__text')
            stars = None

            try:
                # get_attribute returns None when there is no style attribute.
                style = review.find_element(By.CSS_SELECTOR, '.stars__full').get_attribute("style") or ''
                match = re.search(r'width:\s*(\d+)%', style)
                if match:
                    stars = int(match.group(1)) // 20
            except NoSuchElementException:
                # The review has no star element; keep the rating empty.
                pass
            except Exception as e:
                logging.warning(f'Не удалось определить оценку отзыва: {e}')

            results.append([name, date, review_text, stars])

        try:
            # NOTE(review): `href*=` is a substring match, so "page=2" also
            # matches e.g. "page=20"; this relies on the wanted link coming
            # first in the document — confirm against the page markup.
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[href*="page={page + 1}"]')
            # Bring the link into view before the native click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Report the failure so a truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1

    save_to_csv('novostroev_mr_group.csv', results, ['Name', 'Date', 'Review', 'Stars'])
    logging.info(f'Завершено: Novostroev (MR Group) (время: {time.time() - start:.2f} сек)')

In [25]:
def parse_novostroy_donstroy(driver):
    """Scrape Donstroy reviews from novostroy-m.ru and save them to ``novostroy_donstroy.csv``.

    Reads the review list page by page (the loop allows at most 10 pages)
    and stores one row per review: author name, the ``content`` of the date
    meta tag, review text and the number of filled (yellow) stars.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Novostroy (Донстрой)')
    start = time.time()
    url = 'https://www.novostroy-m.ru/kompanii/donstroy/otzyvy'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []

    for page in range(1, 11):
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lh_24.fs_16[itemprop="reviewBody"]')
        names = driver.find_elements(By.CSS_SELECTOR, 'span[itemprop="name"]')
        dates = driver.find_elements(By.CSS_SELECTOR, 'meta[itemprop="datePublished"]')
        ratings = driver.find_elements(By.CSS_SELECTOR, 'div.pt_4.pb_4.f_s_0')

        # NOTE(review): the four lists are paired purely by position and zip()
        # stops at the shortest one, so an extra or missing match in any
        # selector would shift or drop rows — verify against the page markup.
        for review, name, date, rating in zip(reviews, names, dates, ratings):
            filled_stars = rating.find_elements(By.CSS_SELECTOR, 'svg.w_16.fl_l.d_b[fill="#ffcd00"]')
            results.append([
                name.text.strip(),
                # get_attribute returns None when the attribute is absent.
                (date.get_attribute('content') or '').strip(),
                review.text.strip(),
                len(filled_stars)
            ])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'li a[data-page="{page + 1}"]')
            # Centre the link in the viewport before clicking so other page
            # elements are less likely to sit on top of it.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(2)
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except ElementClickInterceptedException as e:
            # Stop paginating but still reach save_to_csv below, so the rows
            # collected so far are not lost.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break

    save_to_csv('novostroy_donstroy.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Novostroy (Донстрой) (время: {time.time() - start:.2f} сек)')


In [26]:
def parse_forum_zhk_donstroy(driver):
    """Scrape Donstroy reviews from forum-zhk.com and save them to ``forum-zhk_donstroy.csv``.

    Follows the numbered pagination links until no link to the next page
    exists. Each row holds author name, date, review text and the ``value``
    of the review's ``score`` input; fields that are missing are stored as
    ``None``.

    Args:
        driver: Selenium WebDriver used for the whole crawl; it is not closed here.
    """
    logging.info('Начало парсинга: Forum-ZHK (Донстрой)')
    start = time.time()
    url = 'https://forum-zhk.com/zastroischiki-msk/donstroy'
    driver.get(url)
    # The implicit wait is a session-level setting, so one call before the loop is enough.
    driver.implicitly_wait(10)
    results = []
    page = 1

    while True:
        reviews = driver.find_elements(By.CSS_SELECTOR, 'div.reviews-item')

        for review in reviews:
            name = safe_text(review, 'span.reviews-item-name')
            date = safe_text(review, 'span.reviews-item-date')
            review_text = safe_text(review, 'div.reviews-item-text')
            try:
                rating = review.find_element(By.CSS_SELECTOR, 'input[name="score"]').get_attribute('value')
            except NoSuchElementException:
                rating = None

            results.append([name, date, review_text, rating])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page="{page + 1}"]')
            # Bring the link into view before the native click.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the following page: the last page has been read.
            break
        except Exception as e:
            # Any other failure also ends the crawl, but it is reported so a
            # truncated CSV is not mistaken for a complete one.
            logging.warning(f'Не удалось перейти на страницу {page + 1}: {e}')
            break
        page += 1
        time.sleep(2)

    save_to_csv('forum-zhk_donstroy.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Forum-ZHK (Донстрой) (время: {time.time() - start:.2f} сек)')
    

In [27]:
def parse_otzovik_donstroy(driver):
    """Scrape the first page of Donstroy reviews from otzovik.com into CSV.

    A separate Chrome session is started for this site (custom user agent,
    no ``--headless`` flag, so a captcha can be solved by hand when the
    prompt appears). That session is always closed before returning.

    Args:
        driver: Accepted for signature parity with the other parsers. It is
            not used and not closed by this function.

    Returns:
        None. Rows ``[name, date, review, rating]`` are written to
        ``otzovik_donstroy.csv``; values that cannot be read are stored as
        empty strings.

    NOTE(review): fields are paired by list index across six independent
    page-wide queries. If some review lacks one of the blocks (e.g. no
    "advantages" section), later rows may be misaligned - confirm against
    the page markup.
    """
    logging.info('Начало парсинга: Otzovik (Донстрой)')
    start = time.time()

    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36")
    # Own name for the private session so the caller's ``driver`` argument
    # is not rebound inside this function.
    browser = webdriver.Chrome(options=options)

    base_url = 'https://otzovik.com/reviews/kompaniya_donstroy_russia_moscow/'
    results = []

    def _content_at(elements, index):
        """Return the stripped ``content`` attribute of elements[index], or ''."""
        if index >= len(elements):
            return ''
        value = elements[index].get_attribute('content')
        # get_attribute yields None when the attribute is absent.
        return value.strip() if value else ''

    try:
        page = 1
        url = f"{base_url}{page}/"
        browser.get(url)

        if is_captcha_page(browser):
            logging.warning(f'Обнаружена капча на странице {url}')
            input("Решите капчу вручную в браузере и нажмите Enter для продолжения...")

        try:
            WebDriverWait(browser, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]'))
            )
        except TimeoutException:
            # Only the wait timing out is expected here; anything else
            # (including Ctrl+C) must not be swallowed.
            logging.warning(f"Отзывы не загрузились на странице {url}")

        reviews = browser.find_elements(By.CSS_SELECTOR, 'div.review-teaser[itemprop="description"]')
        names = browser.find_elements(By.CSS_SELECTOR, 'div.user-info span[itemprop="name"]')
        dates = browser.find_elements(By.CSS_SELECTOR, 'div.review-postdate[itemprop="datePublished"]')
        ratings = browser.find_elements(By.CSS_SELECTOR, 'span[itemprop="reviewRating"] meta[itemprop="ratingValue"]')
        advantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-plus')
        disadvantages = browser.find_elements(By.CSS_SELECTOR, 'div.review-minus')

        for i, review in enumerate(reviews):
            full_review = f"{review.text.strip()}\nДостоинства: {get_element_text(advantages, i)}\nНедостатки: {get_element_text(disadvantages, i)}"
            results.append([
                get_element_text(names, i),
                _content_at(dates, i),
                full_review.strip(),
                _content_at(ratings, i)
            ])

    finally:
        browser.quit()

    save_to_csv('otzovik_donstroy.csv', results, ['Name', 'Date', 'Review', 'Rating'])
    logging.info(f'Завершено: Otzovik (Донстрой) (время: {time.time() - start:.2f} сек)')

In [28]:
def parse_novostroev_donstroy(driver):
    """Scrape Donstroy reviews from novostroev.ru and save them to CSV.

    Reads every ``.reviews-list__item`` on the current page, then follows
    the first link whose ``href`` contains ``page=<next>`` and repeats.
    Pagination stops when that link is absent or cannot be clicked. The
    rows are written to ``novostroev_donstroy.csv``.

    Args:
        driver: Selenium WebDriver used for navigation and element lookup.

    Returns:
        None. Each row is ``[name, date, review_text, stars]``. ``stars`` is
        the integer width percentage of ``.stars__full`` divided by 20
        (so 100% gives 5), or ``None`` when it cannot be determined.
    """
    logging.info('Начало парсинга: Novostroev (Донстрой)')
    start = time.time()
    url = 'https://novostroev.ru/zastroyshchiki/donstroy/otzyvy/'
    driver.get(url)
    results = []
    page = 1

    while True:
        # Fixed pause before reading the page (presumably to let the review
        # list finish loading after navigation).
        time.sleep(3)
        reviews = driver.find_elements(By.CSS_SELECTOR, '.reviews-list__item')

        for review in reviews:
            name = safe_text(review, '.review-n-2__name')
            date = safe_text(review, '.review-n-2__meta')
            review_text = safe_text(review, '.review-n-2__text')

            try:
                style = review.find_element(By.CSS_SELECTOR, '.stars__full').get_attribute("style")
            except NoSuchElementException:
                # Review has no stars element: rating stays unknown.
                style = None

            # ``style`` is None when the attribute is absent; search an empty
            # string in that case so the result is simply "no match".
            match = re.search(r'width:\s*(\d+)%', style or '')
            stars = int(match.group(1)) // 20 if match else None

            results.append([name, date, review_text, stars])

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, f'a[href*="page={page + 1}"]')
            # Centre the link in the viewport before clicking it.
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
            time.sleep(1)
            next_button.click()
        except NoSuchElementException:
            # No link to the next page: the last page has been processed.
            break
        except Exception as exc:
            # Still best-effort (keep what was collected), but leave a trace
            # of why pagination ended early instead of stopping silently.
            logging.warning(f'Novostroev (Донстрой): не удалось перейти на страницу {page + 1}: {exc!r}')
            break

        page += 1

    save_to_csv('novostroev_donstroy.csv', results, ['Name', 'Date', 'Review', 'Stars'])
    logging.info(f'Завершено: Novostroev (Донстрой) (время: {time.time() - start:.2f} сек)')

In [28]:
# Runner cell for PIK: calls the four parse_*_pik functions (defined
# elsewhere in the notebook) one after another.
if __name__ == "__main__":
    logging.info("Запуск парсинга отзывов")
    # A single driver from setup_driver() is passed to every parser below.
    driver = setup_driver()
    try:
        parse_novostroy_pik(driver)
        parse_forum_zhk_pik(driver)
        parse_otzovik_pik(driver)
        parse_novostroev_pik(driver)

    finally:
        # Runs whether or not a parser raised, so the browser is always closed.
        driver.quit()
        logging.info("Парсинг завершён, браузер закрыт")

[2025-05-02 16:20:00] INFO - Запуск парсинга отзывов
[2025-05-02 16:20:00] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 16:20:00] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 16:20:00] INFO - Driver [/Users/sabinakalbieva/.wdm/drivers/chromedriver/mac64/135.0.7049.114/chromedriver-mac-arm64/chromedriver] found in cache
[2025-05-02 16:20:01] INFO - Начало парсинга: Novostroy (ПИК)
[2025-05-02 16:21:17] INFO - Сохранено 22 записей в "novostroy_pik.csv"
[2025-05-02 16:21:17] INFO - Завершено: Novostroy (ПИК) (время: 75.66 сек)
[2025-05-02 16:21:17] INFO - Начало парсинга: Forum-ZHK (ПИК)
[2025-05-02 16:24:30] INFO - Сохранено 128 записей в "forum-zhk_pik.csv"
[2025-05-02 16:24:30] INFO - Завершено: Forum-ZHK (ПИК) (время: 193.64 сек)
[2025-05-02 16:24:30] INFO - Начало парсинга: Otzovik (ПИК)
[2025-05-02 16:26:43] INFO - Сохранено 550 записей в "otzovik_pik.csv"
[2025-05-02 16:26:43] INFO - Завершено: Otzovik (ПИК) (время: 132.55 сек)
[2025-

In [105]:
# Runner cell for Samolet: calls the four parse_*_samolet functions (defined
# elsewhere in the notebook) one after another.
if __name__ == "__main__":
    logging.info("Запуск парсинга отзывов")
    # A single driver from setup_driver() is passed to every parser below.
    driver = setup_driver()
    try:
        parse_novostroy_samolet(driver)
        parse_forum_zhk_samolet(driver)
        parse_otzovik_samolet(driver)
        parse_novostroev_samolet(driver)

    finally:
        # Runs whether or not a parser raised, so the browser is always closed.
        driver.quit()
        logging.info("Парсинг завершён, браузер закрыт")

[2025-05-02 19:26:23] INFO - Запуск парсинга отзывов
[2025-05-02 19:26:23] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 19:26:23] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 19:26:24] INFO - Driver [/Users/sabinakalbieva/.wdm/drivers/chromedriver/mac64/136.0.7103.49/chromedriver-mac-arm64/chromedriver] found in cache
[2025-05-02 19:26:25] INFO - Начало парсинга: Novostroy (Самолет)
[2025-05-02 19:36:46] INFO - Сохранено 430 записей в "novostroy_samolet.csv"
[2025-05-02 19:36:46] INFO - Завершено: Novostroy (Самолет) — 430 отзывов за 621.58 сек.
[2025-05-02 19:36:46] INFO - Начало парсинга: Forum-ZHK (Самолет)
[2025-05-02 19:37:57] INFO - Сохранено 33 записей в "forum-zhk_samolet.csv"
[2025-05-02 19:37:57] INFO - Завершено: Forum-ZHK (Самолет) (время: 70.87 сек)
[2025-05-02 19:37:57] INFO - Начало парсинга: Otzovik (Самолет)
[2025-05-02 19:39:51] INFO - Сохранено 287 записей в "otzovik_samolet.csv"
[2025-05-02 19:39:51] INFO - Завершено: 

In [29]:
# Runner cell for A101: calls the four parse_*_a101 functions (defined
# elsewhere in the notebook) one after another.
if __name__ == "__main__":
    logging.info("Запуск парсинга отзывов")
    # A single driver from setup_driver() is passed to every parser below.
    driver = setup_driver()
    try:
        parse_novostroy_a101(driver)
        parse_forum_zhk_a101(driver)
        parse_otzovik_a101(driver)
        parse_novostroev_a101(driver)

    finally:
        # Runs whether or not a parser raised, so the browser is always closed.
        driver.quit()
        logging.info("Парсинг завершён, браузер закрыт")

[2025-05-02 22:25:59] INFO - Запуск парсинга отзывов
[2025-05-02 22:26:00] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 22:26:00] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 22:26:01] INFO - Driver [/Users/sabinakalbieva/.wdm/drivers/chromedriver/mac64/136.0.7103.49/chromedriver-mac-arm64/chromedriver] found in cache
[2025-05-02 22:26:01] INFO - Начало парсинга: Novostroy (А101)
[2025-05-02 22:28:20] INFO - Сохранено 103 записей в "novostroy_a101.csv"
[2025-05-02 22:28:20] INFO - Завершено: Novostroy (А101) (время: 138.46 сек)
[2025-05-02 22:28:20] INFO - Начало парсинга: Forum-ZHK (А101)
[2025-05-02 22:34:21] INFO - Сохранено 92 записей в "forum-zhk_a101.csv"
[2025-05-02 22:34:21] INFO - Завершено: Forum-ZHK (А101) (время: 361.52 сек)
[2025-05-02 22:34:21] INFO - Начало парсинга: Otzovik (А101)
[2025-05-02 22:36:10] INFO - Сохранено 172 записей в "otzovik_a101.csv"
[2025-05-02 22:36:10] INFO - Завершено: Otzovik (А101) (время: 108.10 се

In [65]:
# Runner cell for MR Group: calls the four parse_*_mr_group functions
# (defined elsewhere in the notebook) one after another.
if __name__ == "__main__":
    logging.info("Запуск парсинга отзывов")
    # A single driver from setup_driver() is passed to every parser below.
    driver = setup_driver()
    try:
        parse_novostroy_mr_group(driver)
        parse_forum_zhk_mr_group(driver)
        parse_otzovik_mr_group(driver)
        parse_novostroev_mr_group(driver)

    finally:
        # Runs whether or not a parser raised, so the browser is always closed.
        driver.quit()
        logging.info("Парсинг завершён, браузер закрыт")

[2025-05-02 17:24:30] INFO - Запуск парсинга отзывов
[2025-05-02 17:24:30] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 17:24:30] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 17:24:31] INFO - Driver [/Users/sabinakalbieva/.wdm/drivers/chromedriver/mac64/136.0.7103.49/chromedriver-mac-arm64/chromedriver] found in cache
[2025-05-02 17:24:31] INFO - Начало парсинга: Novostroy (MR Group)
[2025-05-02 17:27:18] INFO - Сохранено 170 записей в "novostroy_mr_group.csv"
[2025-05-02 17:27:18] INFO - Завершено: Novostroy (MR Group) (время: 166.12 сек)
[2025-05-02 17:27:18] INFO - Начало парсинга: Forum-ZHK (MR Group)
[2025-05-02 17:33:14] INFO - Сохранено 77 записей в "forum-zhk_mr_group.csv"
[2025-05-02 17:33:14] INFO - Завершено: Forum-ZHK (MR Group) (время: 356.60 сек)
[2025-05-02 17:33:14] INFO - Начало парсинга: Otzovik (MR Group)
[2025-05-02 17:33:44] INFO - Сохранено 29 записей в "otzovik_mr_group.csv"
[2025-05-02 17:33:44] INFO - Завершено: O

In [81]:
# Runner cell for Donstroy: calls the four parse_*_donstroy functions one
# after another.
if __name__ == "__main__":
    logging.info("Запуск парсинга отзывов")
    # A single driver from setup_driver() is passed to every parser below.
    # Note: parse_otzovik_donstroy starts and closes its own Chrome session
    # and does not use this driver.
    driver = setup_driver()
    try:
        parse_novostroy_donstroy(driver)
        parse_forum_zhk_donstroy(driver)
        parse_otzovik_donstroy(driver)
        parse_novostroev_donstroy(driver)

    finally:
        # Runs whether or not a parser raised, so the browser is always closed.
        driver.quit()
        logging.info("Парсинг завершён, браузер закрыт")

[2025-05-02 18:05:35] INFO - Запуск парсинга отзывов
[2025-05-02 18:05:35] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 18:05:36] INFO - Get LATEST chromedriver version for google-chrome
[2025-05-02 18:05:37] INFO - Driver [/Users/sabinakalbieva/.wdm/drivers/chromedriver/mac64/136.0.7103.49/chromedriver-mac-arm64/chromedriver] found in cache
[2025-05-02 18:05:38] INFO - Начало парсинга: Novostroy (Донстрой)
[2025-05-02 18:07:02] INFO - Сохранено 80 записей в "novostroy_donstroy.csv"
[2025-05-02 18:07:02] INFO - Завершено: Novostroy (Донстрой) (время: 84.03 сек)
[2025-05-02 18:07:02] INFO - Начало парсинга: Forum-ZHK (Донстрой)
[2025-05-02 18:09:35] INFO - Сохранено 70 записей в "forum-zhk_donstroy.csv"
[2025-05-02 18:09:35] INFO - Завершено: Forum-ZHK (Донстрой) (время: 152.70 сек)
[2025-05-02 18:09:35] INFO - Начало парсинга: Otzovik (Донстрой)
[2025-05-02 18:10:06] INFO - Сохранено 1 записей в "otzovik_donstroy.csv"
[2025-05-02 18:10:06] INFO - Завершено: Otzo