In [1]:
import asyncio
import logging
import re

try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

import cProfile
import pstats
from functools import wraps
from io import StringIO

import aiohttp
import orjson
from cachetools import TTLCache
from lxml import etree

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

resolver = aiohttp.resolver.AsyncResolver(nameservers=["8.8.8.8", "8.8.4.4"])
conn = aiohttp.TCPConnector(ssl=False, limit=100, resolver=resolver)

PROXY = "http://swed:Artiffu$i0nSwed587@193.188.21.11:7357"
SUBTITLE_REGEX = re.compile(
    r'<text start="([\d.]+)" dur="([\d.]+)".*?>(.*?)</text>', re.DOTALL
)
JSON_REGEX = re.compile(r"ytInitialPlayerResponse\s*=\s*({.+?});", re.DOTALL)

subtitle_cache = TTLCache(maxsize=1000, ttl=3600)

def async_profiler(func):
    @wraps(func)
    async def wrapper(*args, **kwargs):
        pr = cProfile.Profile()
        pr.enable()
        result = await func(*args, **kwargs)
        pr.disable()
        s = StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
        ps.print_stats()
        logger.info(f"Профиль для {func.__name__}:\n{s.getvalue()}")
        return result
    return wrapper

In [2]:
@async_profiler
async def create_session():
    return aiohttp.ClientSession(
        connector=conn,
        trust_env=True,
        timeout=aiohttp.ClientTimeout(
            total=10, connect=5, sock_connect=5, sock_read=5
        ),
    )


@async_profiler
async def fetch(session, url):
    try:
        async with session.get(url, proxy=PROXY) as response:
            return await response.text()
    except Exception as e:
        logger.error(f"Ошибка при запросе {url}: {str(e)}")
        return None


@async_profiler
async def extract_json_and_subtitle_url(html_content):
    start = html_content.find("ytInitialPlayerResponse = ") + 26
    end = html_content.find("};", start) + 1
    if start > 25 and end > 0:
        try:
            json_data = orjson.loads(html_content[start:end])
            captions = (
                json_data.get("captions", {})
                .get("playerCaptionsTracklistRenderer", {})
                .get("captionTracks", [])
            )
            for caption in captions:
                if caption.get("languageCode") == "ru":
                    return caption.get("baseUrl")
        except orjson.JSONDecodeError as e:
            logger.error(f"Ошибка при парсинге JSON: {str(e)}")
    return None


@async_profiler
async def parse_subtitles(xml_content):
    if not xml_content:
        return []
    root = etree.fromstring(xml_content.encode("utf-8"))
    return [
        {
            "start": float(text.get("start")),
            "duration": float(text.get("dur")),
            "text": text.text.strip()
            .replace("&amp;", "&")
            .replace("&lt;", "<")
            .replace("&gt;", ">"),
        }
        for text in root.findall(".//text")
    ]


@async_profiler
async def get_russian_subtitles(video_id):
    try:
        async with await create_session() as session:
            url = f"https://www.youtube.com/watch?v={video_id}"
            html_content = await fetch(session, url)
            if not html_content:
                return None, False

            subtitle_url = await extract_json_and_subtitle_url(html_content)
            if not subtitle_url:
                return None, False

            xml_content, is_auto_generated = await asyncio.gather(
                fetch(session, subtitle_url),
                asyncio.to_thread(lambda: "kind=asr" in subtitle_url),
            )

            subtitles = await parse_subtitles(xml_content)
            return subtitles, is_auto_generated

    except Exception as e:
        logger.error(
            f"Неожиданная ошибка при обработке видео {video_id}: {str(e)}"
        )

    return None, False


In [3]:
subtitles, is_auto_generated = await get_russian_subtitles("Ajy1lS9qJbs")

f"Автоматически сгенерированы: {is_auto_generated}" if subtitles else "Не удалось получить субтитры для видео"

2024-08-14 14:23:47,247 - ERROR - Неожиданная ошибка при обработке видео Ajy1lS9qJbs: Another profiling tool is already active
2024-08-14 14:23:47,256 - INFO - Профиль для get_russian_subtitles:
         812 function calls (787 primitive calls) in 0.002 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.001    0.000 /home/michael/github/youtube/.micromamba/envs/default/lib/python3.12/asyncio/base_events.py:1909(_run_once)
        1    0.000    0.000    0.001    0.001 /home/michael/github/youtube/.micromamba/envs/default/lib/python3.12/asyncio/events.py:86(_run)
        1    0.000    0.000    0.001    0.001 {method 'run' of '_contextvars.Context' objects}
        1    0.000    0.000    0.001    0.001 /home/michael/github/youtube/.micromamba/envs/default/lib/python3.12/site-packages/tornado/ioloop.py:742(_run_callback)
        1    0.000    0.000    0.001    0.001 /home/michael/github/youtube/.

'Не удалось получить субтитры для видео'