In [28]:
from bs4 import BeautifulSoup as bs
import pandas as pd
from pyppeteer import launch # type: ignore

In [29]:
async def parse_city_report(page, url):
    """Open one report page and extract its id, category and message text.

    Args:
        page: Browser page object exposing awaitable ``goto`` and ``content``.
        url: Address of the report page; its last path segment (ignoring a
            trailing slash) becomes the report id.

    Returns:
        dict with the keys ``'id'``, ``'type'`` and ``'message'``.
        ``'type'`` is the second ``/``-separated part of the ``og:title`` meta
        content; ``'message'`` is the text of the first message block whose
        title starts with ``'Сообщение'``. Either is ``None`` when not found.
    """
    await page.goto(url, {'waitUntil': 'networkidle2'})
    html = await page.content()
    soup = bs(html, 'html.parser')

    # Category is the second "/"-separated segment of the Open Graph title.
    category = None
    og_meta = soup.find('meta', {'property': 'og:title'})
    og_title = og_meta.get('content') if og_meta else None
    if og_title:
        segments = og_title.split('/')
        if len(segments) >= 2:
            category = segments[1].strip()

    # Only the first block titled "Сообщение..." is considered; if that block
    # has no text element the message stays None and later blocks are ignored.
    message = None
    for block in soup.select('div.problem-message'):
        heading = block.select_one('.problem-message__title')
        if not heading:
            continue
        if not heading.get_text(strip=True).startswith('Сообщение'):
            continue
        body = block.select_one('div.problem-message__text')
        message = body.get_text(' ', strip=True) if body else None
        break

    return {
        'id': url.rstrip('/').rsplit('/', 1)[-1],
        'type': category,
        'message': message,
    }

In [None]:
async def brows(pages=range(1, 2),
                executable_path=r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"):
    """Scrape public problem reports from the listing pages of gorod.gov.spb.ru.

    A browser is launched, each listing page in ``pages`` is opened, the links
    to individual reports are collected and every report is parsed with
    ``parse_city_report``. The browser is closed even if scraping fails.

    Args:
        pages: Iterable of listing page numbers to visit. Defaults to page 1
            only, matching the previous hard-coded ``range(1, 2)``.
        executable_path: Path of the browser binary passed to ``launch`` as
            ``executablePath``. When ``None`` the option is omitted so the
            launcher's own default is used.

    Returns:
        list of the values returned by ``parse_city_report``, one per distinct
        report link, in the order the links were first encountered.

    Raises:
        Any exception raised by the browser or by ``parse_city_report`` is
        propagated after the browser has been closed.
    """
    # JS run inside the listing page: keep only absolute links of the form
    # https://gorod.gov.spb.ru/problems/<digits>/ (loop-invariant, so built once).
    collect_links_js = '''() => {
            return Array.from(document.querySelectorAll('a[href^="/problems/"]'))
                .map(a => a.href)
                .filter(h => /^https:\\/\\/gorod\\.gov\\.spb\\.ru\\/problems\\/\\d+\\/$/.test(h));
        }'''

    launch_options = {'headless': False, 'args': ['--no-sandbox']}
    if executable_path is not None:
        launch_options['executablePath'] = executable_path

    browser = await launch(**launch_options)
    try:
        page = await browser.newPage()
        await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

        all_reports = []
        # Shared across listing pages so a report that shows up on more than
        # one page is scraped only once.
        seen = set()
        for p in pages:
            list_url = f'https://gorod.gov.spb.ru/problems/public/?page={p}'
            await page.goto(list_url, {'waitUntil': 'networkidle2'})
            links = await page.evaluate(collect_links_js)

            # The same tab is reused for the report pages; that is safe because
            # the links were already copied out of the listing page above.
            for link in links:
                if link in seen:
                    continue
                seen.add(link)
                all_reports.append(await parse_city_report(page, link))

        return all_reports
    finally:
        # Always release the browser process, including on errors.
        await browser.close()
# Run the scraper. A bare top-level `await` is not valid in a plain Python
# script; this line relies on the notebook kernel allowing it at cell level.
all_reports = await brows()
# Tabulate the scraped reports (brows() returns a list, one entry per report).
reports_df = pd.DataFrame(all_reports)
