# CAPTCHA‑aware Kinopoisk Scraper
This notebook demonstrates how to pause scraping when the website presents a CAPTCHA challenge and resume automatically after you solve it.

It uses **pyppeteer**. Make sure you have it installed:
```bash
pip install pyppeteer
```
⚠️ **Edit the `executablePath` to match your browser location** or remove the argument to let pyppeteer download Chromium automatically.

In [1]:
import asyncio
from pyppeteer import launch
from random import randint

# Optional: tweak these to be less detectable
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_7_10) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
]

In [2]:
CAPTCHA_MARKER = 'captcha'
LIST_ITEM      = '[data-test-id="movie-list-item"]'  # selector present on real page


In [3]:
async def wait_until_captcha_is_gone(page):
    """Pause while the tab shows a CAPTCHA, resume when it disappears."""
    print('🛑 CAPTCHA shown – please solve it in the browser window…')
    while CAPTCHA_MARKER in page.url:
        # Wait for the next navigation (user completed CAPTCHA)
        await page.waitForNavigation({'waitUntil': 'domcontentloaded'})
    # Ensure the real content is ready
    await page.waitForSelector(LIST_ITEM, timeout=0)
    print('✅ CAPTCHA solved, continuing…')

async def safe_goto(page, url):
    await page.goto(url, {'waitUntil': 'domcontentloaded'})
    if CAPTCHA_MARKER in page.url:
        await wait_until_captcha_is_gone(page)
    await page.waitForSelector(LIST_ITEM, timeout=0)


In [4]:

async def scrape_page(page, page_number):
    url = f'https://www.kinopoisk.ru/lists/movies/top250/?page={page_number}'
    print(f'Scraping {url}')
    await safe_goto(page, url)

    movie_elements = await page.querySelectorAll(LIST_ITEM)
    movies_data = []

    for el in movie_elements:
        # Strategy 1: try the known data-tid selector
        title_handle = await el.querySelector('[data-tid="63c89e5b"]')
        if title_handle:
            title_text = (await (await title_handle.getProperty('textContent')).jsonValue()).strip()
        else:
            # Fallback: grab the *entire* element's visible text
            title_text = (await page.evaluate('(e) => e.innerText', el)).split('\n')[0].strip()

        movies_data.append(title_text)

    # Random delay to mimic human behaviour and reduce CAPTCHA frequency
    await asyncio.sleep(randint(2, 5))
    return movies_data


In [5]:
async def scraper(max_pages=5):
    browser = await launch(
        headless=False,
        executablePath=r'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
        args=['--disable-blink-features=AutomationControlled']
    )
    page = await browser.newPage()
    page.setUserAgent(USER_AGENTS[randint(0, len(USER_AGENTS)-1)])

    all_movies = []
    for page_number in range(1, max_pages + 1):
        all_movies += await scrape_page(page, page_number)

    print(f'Total movies scraped: {len(all_movies)}')
    await browser.close()
    return all_movies


In [6]:
# Run the scraper (opens a non‑headless browser)
movies = await scraper(max_pages=5)
movies[:10]  # peek at first 10 results

  page.setUserAgent(USER_AGENTS[randint(0, len(USER_AGENTS)-1)])


Scraping https://www.kinopoisk.ru/lists/movies/top250/?page=1
🛑 CAPTCHA shown – please solve it in the browser window…
✅ CAPTCHA solved, continuing…
Scraping https://www.kinopoisk.ru/lists/movies/top250/?page=2
Scraping https://www.kinopoisk.ru/lists/movies/top250/?page=3
Scraping https://www.kinopoisk.ru/lists/movies/top250/?page=4
Scraping https://www.kinopoisk.ru/lists/movies/top250/?page=5
Total movies scraped: 250


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']