In [96]:
import requests
from bs4 import BeautifulSoup
import time
import random

anime_urls = []
num_pages = 400  # Максимум страниц
expected_links_count = 100  # Ожидается 100 ссылок (50 аниме, каждое представлено 2 раза)
max_attempts = 3  # Максимальное число попыток

# Используем сессию, чтобы сохранять куки и заголовки
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
                  'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.9'
})

for page in range(num_pages):
    limit = page * 50
    list_page_url = f"https://myanimelist.net/topanime.php?limit={limit}"
    attempt = 0
    links = []
    
    while attempt < max_attempts:
        response = session.get(list_page_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all("a", class_="hoverinfo_trigger")
        
        if len(links) < expected_links_count:
            print(f"Страница {page+1}: найдено только {len(links)} ссылок. Ожидание для повторной загрузки...")
            time.sleep(3 + random.uniform(0, 2))  # увеличиваем задержку и добавляем случайный элемент
            attempt += 1
        else:
            break

    # Если на странице не нашлось ожидаемое количество ссылок, можно продолжить или логировать проблему
    if len(links) < expected_links_count:
        print(f"Страница {page+1}: недостаточное количество ссылок, продолжаем дальше.")

    for i in range(1, len(links), 2):
        url = links[i].get("href")
        anime_urls.append(url)

print("Общее количество собранных ссылок:", len(anime_urls))


Общее количество собранных ссылок: 20000


In [99]:
len(anime_urls)

20000

In [101]:
with open("anime_urls.txt", "w", encoding="utf-8") as f:
    for url in anime_urls:
        f.write(url + "\n")

In [109]:
import requests
import time
import random
import re
from bs4 import BeautifulSoup
from dateutil import parser as date_parser

def parse_anime_info(html):
    soup = BeautifulSoup(html, 'html.parser')
    info = {}
    
    # 1. Anime Name (animeTitle): <h1 class="title-name h1_bold_none">
    title_tag = soup.find("h1", class_="title-name h1_bold_none")
    info["animeTitle"] = title_tag.text.strip() if title_tag else None

    # Получаем блок с информацией слева
    leftside_div = soup.find("div", class_="leftside")
    if leftside_div:
        spaceit_divs = leftside_div.find_all("div", class_="spaceit_pad")
        # 2. Anime Type (animeType): 3-й <div> с классом spaceit_pad (индекс 2)
        info["animeType"] = spaceit_divs[2].text.strip() if len(spaceit_divs) >= 3 else None
        
        # 3. Number of episodes (animeNumEpisode): 4-й <div> (индекс 3)
        if len(spaceit_divs) >= 4:
            num_ep_text = spaceit_divs[3].text.strip()
            m = re.search(r'(\d+)', num_ep_text)
            info["animeNumEpisode"] = int(m.group(1)) if m else None
        else:
            info["animeNumEpisode"] = None
        
        # 4. Release and End Dates (releaseDate, endDate): 6-й <div> (индекс 5)
        if len(spaceit_divs) >= 6:
            date_text = spaceit_divs[5].text.strip()
            parts = date_text.split(" to ")
            if len(parts) == 2:
                try:
                    info["releaseDate"] = date_parser.parse(parts[0].strip())
                except Exception:
                    info["releaseDate"] = None
                # Если вторая часть – знак вопроса, оставляем None
                if parts[1].strip() == "?":
                    info["endDate"] = None
                else:
                    try:
                        info["endDate"] = date_parser.parse(parts[1].strip())
                    except Exception:
                        info["endDate"] = None
            else:
                info["releaseDate"] = info["endDate"] = None
        else:
            info["releaseDate"] = info["endDate"] = None
        
        # 5. Number of members (animeNumMembers): 15-й <div> (индекс 14), далее <span class="numbers members">
        if len(spaceit_divs) >= 15:
            members_span = spaceit_divs[14].find("span", class_="numbers members")
            if members_span:
                try:
                    info["animeNumMembers"] = int(members_span.text.strip().replace(",", ""))
                except Exception:
                    info["animeNumMembers"] = None
            else:
                info["animeNumMembers"] = None
        else:
            info["animeNumMembers"] = None
    else:
        info["animeType"] = None
        info["animeNumEpisode"] = None
        info["releaseDate"] = None
        info["endDate"] = None
        info["animeNumMembers"] = None

    # 6. Score (animeScore): <span itemprop="ratingValue">
    rating_span = soup.find("span", itemprop="ratingValue")
    try:
        info["animeScore"] = float(rating_span.text.strip()) if rating_span else None
    except:
        info["animeScore"] = None

    # 7. Users (animeUsers): <span itemprop="ratingCount">
    rating_count_span = soup.find("span", itemprop="ratingCount")
    try:
        info["animeUsers"] = int(rating_count_span.text.strip().replace(",", "")) if rating_count_span else None
    except:
        info["animeUsers"] = None

    # 8. Rank (animeRank): <span class="numbers ranked">
    rank_span = soup.find("span", class_="numbers ranked")
    try:
        rank_text = rank_span.text.strip().replace("#", "").replace(",", "") if rank_span else ""
        info["animeRank"] = int(rank_text) if rank_text.isdigit() else None
    except:
        info["animeRank"] = None

    # 9. Popularity (animePopularity): <span class="numbers popularity">
    popularity_span = soup.find("span", class_="numbers popularity")
    try:
        info["animePopularity"] = int(popularity_span.text.strip().replace(",", "")) if popularity_span else None
    except:
        info["animePopularity"] = None

    # 10. Synopsis (animeDescription): <p itemprop="description">
    synopsis_p = soup.find("p", itemprop="description")
    info["animeDescription"] = synopsis_p.text.strip() if synopsis_p else None

    # 11. Related Anime (animeRelated): из <td class="pb24">
    related_td = soup.find("td", class_="pb24")
    related_set = set()
    if related_td:
        for a in related_td.find_all("a", href=True):
            text = a.text.strip()
            if text:
                related_set.add(text)
    info["animeRelated"] = list(related_set)

    # 12. Characters (animeCharacters): из <div class="detail-characters-list clearfix">, все <h3 class="h3_characters_voice_actors">
    characters_list = []
    detail_div = soup.find("div", class_="detail-characters-list clearfix")
    if detail_div:
        for h3 in detail_div.find_all("h3", class_="h3_characters_voice_actors"):
            text = h3.text.strip()
            if text:
                characters_list.append(text)
    info["animeCharacters"] = characters_list

    # 13. Voices (animeVoices): в том же detail_div, найти все <td> с классами "va-t ar pl4 pr4"
    voices_list = []
    if detail_div:
        for td in detail_div.find_all("td", class_=lambda x: x and all(cls in x.split() for cls in ["va-t", "ar", "pl4", "pr4"])):
            text = td.text.strip()
            if text:
                voices_list.append(text)
    info["animeVoices"] = voices_list

    # 14. Staff (animeStaff): из detail_div, извлечь все <a> (имя и роль)
    staff_list = []
    if detail_div:
        for a in detail_div.find_all("a", href=True):
            name = a.text.strip()
            role = a.get("title", "").strip()
            if name:
                staff_list.append([name, role])
    info["animeStaff"] = staff_list

    return info

# Предположим, что список anime_urls уже заполнен (например, 20000 URL)
# Пример: anime_urls = ["https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood", ...]
# Здесь он должен быть получен на предыдущем шаге.

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
                  'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.9'
})

anime_info_list = []
total_urls = len(anime_urls)

for i, url in enumerate(anime_urls[:2]):
    try:
        response = session.get(url)
        if response.status_code == 200:
            anime_info = parse_anime_info(response.text)
            anime_info_list.append(anime_info)
            print(f"Обработан {i+1}/{total_urls}: {url}")
        else:
            print(f"Ошибка при запросе {url} - статус код: {response.status_code}")
    except Exception as e:
        print(f"Ошибка при обработке {url}: {e}")
    
    # Задержка для предотвращения блокировки (случайная задержка между 1 и 2 секундами)
    time.sleep(random.uniform(1, 2))

print("Общее количество обработанных аниме:", len(anime_info_list))
anime_info_list

Обработан 1/20000: https://myanimelist.net/anime/52991/Sousou_no_Frieren
Обработан 2/20000: https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood
Общее количество обработанных аниме: 2


[{'animeTitle': 'Sousou no Frieren',
  'animeType': "English: Frieren: Beyond Journey's End",
  'animeNumEpisode': None,
  'releaseDate': None,
  'endDate': None,
  'animeNumMembers': None,
  'animeScore': 9.31,
  'animeUsers': 602416,
  'animeRank': None,
  'animePopularity': None,
  'animeDescription': 'During their decade-long quest to defeat the Demon King, the members of the hero\'s party—Himmel himself, the priest Heiter, the dwarf warrior Eisen, and the elven mage Frieren—forge bonds through adventures and battles, creating unforgettable precious memories for most of them.\n\r\nHowever, the time that Frieren spends with her comrades is equivalent to merely a fraction of her life, which has lasted over a thousand years. When the party disbands after their victory, Frieren casually returns to her "usual" routine of collecting spells across the continent. Due to her different sense of time, she seemingly holds no strong feelings toward the experiences she went through.\n\r\nAs the 