In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
from pyppeteer import launch # type: ignore
from random import randint

%matplotlib inline

In [112]:
# Pool of User-Agent strings; scraper() picks one of them at random.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/117.0.0.0 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_7_10) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/17.0 Safari/605.1.15'
    ),
]

# Substring looked for in the current page URL to detect a captcha page.
CAPTCHA_MARKER = 'captcha'

# CSS selector that is awaited after every navigation.
LIST_ITEM = '[data-test-id="movie-list-item"]'

In [None]:
async def wait_captcha(page):
    """Wait until the page has left the captcha URL and shows the list.

    Loops on navigations while ``CAPTCHA_MARKER`` is still part of
    ``page.url``, then waits for the ``LIST_ITEM`` selector to appear.

    Parameters
    ----------
    page
        Page object (pyppeteer-style) currently on a captcha URL.

    Returns
    -------
    None
    """
    print('🛑 Captcha shown')
    while CAPTCHA_MARKER in page.url:
        # timeout=0 disables the default navigation timeout: solving a captcha
        # can take arbitrarily long, and the selector wait below is already
        # unbounded, so the navigation wait must not raise TimeoutError first.
        await page.waitForNavigation({'waitUntil': 'domcontentloaded', 'timeout': 0})
    await page.waitForSelector(LIST_ITEM, timeout=0)
    print('✅ Captcha solved')

In [114]:
async def safe_goto(page, url):
    """Open ``url`` in ``page`` and return once a list item is present.

    If the URL the page ends up on contains ``CAPTCHA_MARKER``, control is
    handed to ``wait_captcha`` before the final selector wait.
    """
    nav_options = {'waitUntil': 'domcontentloaded'}
    await page.goto(url, nav_options)

    landed_on_captcha = CAPTCHA_MARKER in page.url
    if landed_on_captcha:
        await wait_captcha(page)

    # timeout=0: wait for the list item without a time limit.
    await page.waitForSelector(LIST_ITEM, timeout=0)

In [None]:
def _split_additional_info(text):
    """Split an additional-info line into ``(country, genre, director)``.

    Everything after the first "Режиссёр:" is the director; the part before
    it is split once on "•" into country and genre. Pieces that are absent
    are returned as ``None``.
    """
    text = text.strip()
    director = None
    if "Режиссёр:" in text:
        text, director_text = text.split("Режиссёр:", 1)
        director = director_text.strip()

    if "•" in text:
        country, genre = [part.strip() for part in text.split("•", 1)]
    else:
        country, genre = text.strip(), None
    return country, genre, director


async def _stripped_text(parent, selector):
    """Return the stripped ``textContent`` of the first match, or ``'None'``."""
    elem = await parent.querySelector(selector)
    if not elem:
        return 'None'
    text = await (await elem.getProperty('textContent')).jsonValue()
    return text.strip()


async def scrape_movie(page, page_number):
    """Scrape one page of the Kinopoisk top-250 list.

    Parameters
    ----------
    page
        Page object (pyppeteer-style) used for navigation and DOM queries.
    page_number
        Value substituted into the ``?page=`` query parameter.

    Returns
    -------
    list[dict]
        One dict per list item with the keys ``'id'``, ``'Название'``,
        ``'Год'``, ``'Рейтинг'``, ``'Страна'``, ``'Жанр'``, ``'Режиссёр'``.
        A missing id/title/year/rating is the string ``'None'``; a missing
        country/genre/director is ``None``.
    """
    url = f"https://www.kinopoisk.ru/lists/movies/top250/?page={page_number}"
    print(f"Scraping: {url}")
    await safe_goto(page, url)

    movies_data = []
    # Same selector that safe_goto() waited for.
    for movie in await page.querySelectorAll(LIST_ITEM):
        # NOTE(review): the "__K161e" suffix looks like a build-specific hash
        # and may change on the site — confirm before relying on it.
        link = await movie.querySelector('.base-movie-main-info_link__K161e')
        href = await (await link.getProperty('href')).jsonValue() if link else None
        # A missing link, or a link without 3+ digits, yields the 'None'
        # placeholder instead of raising AttributeError on `.group`.
        id_match = re.search(r'(\d{3,})', href) if href else None
        movie_id = id_match.group(1) if id_match else 'None'

        title = await _stripped_text(movie, '[class^="styles_mainTitle__"]')

        secondary_text = await _stripped_text(
            movie, '[class^="desktop-list-main-info_secondaryTitleSlot__"]')
        year_match = re.search(r'\b(19|20)\d{2}\b', secondary_text)
        year = year_match.group(0) if year_match else 'None'

        rating = await _stripped_text(movie, '[class^="styles_kinopoiskValuePositive__"]')

        country = genre = director = None
        info_elems = await movie.querySelectorAll(
            '[class^="desktop-list-main-info_additionalInfo__"]')
        if info_elems:
            # Only the first additional-info element is parsed.
            info_text = await (await info_elems[0].getProperty('textContent')).jsonValue()
            country, genre, director = _split_additional_info(info_text)

        movies_data.append({
            'id': movie_id,
            'Название': title,
            'Год': year,
            'Рейтинг': rating,
            'Страна': country,
            'Жанр': genre,
            'Режиссёр': director
        })

    return movies_data

In [None]:
async def scraper(
    page_numbers=range(1, 6),
    executable_path=r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
    output_path="movies.csv",
):
    """Scrape the requested list pages into a DataFrame and save it as CSV.

    Parameters
    ----------
    page_numbers : iterable of int, default ``range(1, 6)``
        Page numbers passed one by one to ``scrape_movie``.
    executable_path : str
        Browser binary handed to ``launch`` as ``executablePath``.
    output_path : str, default ``"movies.csv"``
        Destination of the CSV export (written without the index).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``Название``, ``Год``, ``Рейтинг``, ``Страна``,
        ``Жанр``, ``Режиссёр``. ``id`` and ``Год`` use the nullable ``Int64``
        dtype and ``Рейтинг`` is float; values that cannot be parsed as
        numbers (such as the ``'None'`` placeholder) become missing values.
    """
    browser = await launch({
        "headless": False,
        "executablePath": executable_path,
        "args": ["--disable-blink-features=AutomationControlled"]
    })
    try:
        page = await browser.newPage()
        await page.setUserAgent(USER_AGENTS[randint(0, len(USER_AGENTS) - 1)])

        all_movies = []
        for page_number in page_numbers:
            all_movies.extend(await scrape_movie(page, page_number))
    finally:
        # Close the browser even when a page fails, so no process is leaked.
        await browser.close()

    # Explicit columns keep the frame well-formed when nothing was scraped.
    columns = ["id", "Название", "Год", "Рейтинг", "Страна", "Жанр", "Режиссёр"]
    df = pd.DataFrame(all_movies, columns=columns)

    # errors="coerce" turns unparsable placeholders into missing values
    # instead of raising ValueError the way a plain astype(int/float) does.
    df["id"] = pd.to_numeric(df["id"], errors="coerce").astype("Int64")
    df["Название"] = df["Название"].astype(str)
    df["Год"] = pd.to_numeric(df["Год"], errors="coerce").astype("Int64")
    df["Рейтинг"] = pd.to_numeric(df["Рейтинг"], errors="coerce").astype(float)
    df["Страна"] = df["Страна"].astype(str)
    df["Жанр"] = df["Жанр"].astype(str)
    df["Режиссёр"] = df["Режиссёр"].astype(str)

    df.to_csv(output_path, index=False, header=True)
    return df

In [117]:
# Run the scraper and keep the resulting DataFrame. Top-level `await` is not
# valid in a plain Python script; it works here because IPython/Jupyter allows it.
movies = await scraper()

Scraping: https://www.kinopoisk.ru/lists/movies/top250/?page=1
Scraping: https://www.kinopoisk.ru/lists/movies/top250/?page=2
Scraping: https://www.kinopoisk.ru/lists/movies/top250/?page=3
Scraping: https://www.kinopoisk.ru/lists/movies/top250/?page=4
Scraping: https://www.kinopoisk.ru/lists/movies/top250/?page=5


In [None]:
// JavaScript (not Python): reads the DOM of the current document.
// NOTE(review): presumably meant to be pasted into the browser devtools console
// on a ratings page rather than executed by the notebook kernel — confirm.
//
// Collects one { href, rating } record per `tr.rating_item` row. A row that
// lacks the profile link or the rating cell yields null for that field instead
// of aborting the whole collection with a TypeError.
const votes = Array.from(document.querySelectorAll('tr.rating_item')).map(row => {
  const profileLink = row.querySelector('.profile_name a[href^="/user/"]');
  const ratingCell = row.querySelector('.comm-title table td');
  return {
    href:   profileLink ? profileLink.getAttribute('href') : null,
    rating: ratingCell ? ratingCell.textContent.trim() : null
  };
});

votes