In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import re
from dateutil import parser as date_parser


In [None]:
anime_urls = []
num_pages = 400  # Количество страниц
expected_links_count = 100  # Ожидаемое количество <a> тегов (по 2 на аниме, 50 аниме)
max_attempts = 3  # Максимальное число попыток, если найдено меньше ожидаемого количества ссылок

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
                  'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.9'
})

for page in range(num_pages):
    limit = page * 50
    list_page_url = f"https://myanimelist.net/topanime.php?limit={limit}"
    attempt = 0
    links = []
    
    # Если найдено меньше ожидаемого количества ссылок, пробуем ещё раз с увеличенной задержкой
    while attempt < max_attempts:
        response = session.get(list_page_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all("a", class_="hoverinfo_trigger")
        
        if len(links) < expected_links_count:
            print(f"Страница {page+1}: найдено только {len(links)} ссылок. Ожидание для повторной загрузки...")
            time.sleep(3 + random.uniform(0, 2))
            attempt += 1
        else:
            break
    
    if len(links) < expected_links_count:
        print(f"Страница {page+1}: недостаточное количество ссылок, продолжаем дальше.")
    
    # Берём каждую вторую ссылку (индексы 1, 3, 5, ...) чтобы избежать дублей
    for i in range(1, len(links), 2):
        url = links[i].get("href")
        anime_urls.append(url)
    
    time.sleep(1 + random.uniform(0, 1))

# Сохраняем все ссылки в файл "anime_urls.txt" (каждая ссылка с новой строки)
with open("anime_urls2.txt", "w", encoding="utf-8") as f:
    for url in anime_urls:
        f.write(url + "\n")

print("Общее количество собранных ссылок:", len(anime_urls))

In [3]:
# Чтение файла и сохранение ссылок в список
with open("anime_urls.txt", "r", encoding="utf-8") as file:
    anime_links = [line.strip() for line in file if line.strip()]
len(anime_links)

20000

In [5]:
import re
from dateutil import parser as date_parser
from bs4 import BeautifulSoup

def extract_fields(text):
    fields = {}
    
    # 1. Anime Type
    m_type = re.search(r"Type:\s*([^\n]+)", text)
    fields["animeType"] = m_type.group(1).strip() if m_type else None
    
    # 2. Number of episodes
    m_episodes = re.search(r"Episodes:\s*(\d+)", text)
    fields["animeNumEpisode"] = int(m_episodes.group(1)) if m_episodes else None
    
    # 3. Aired: извлекаем строку и парсим даты
    m_aired = re.search(r"Aired:\s*([^\n]+)", text)
    if m_aired:
        aired_str = m_aired.group(1).strip()
        if " to " in aired_str:
            release_str, end_str = aired_str.split(" to ", 1)
            try:
                fields["releaseDate"] = date_parser.parse(release_str.strip(), fuzzy=True)
            except Exception:
                fields["releaseDate"] = None
            try:
                fields["endDate"] = date_parser.parse(end_str.strip(), fuzzy=True)
            except Exception:
                fields["endDate"] = None
        else:
            try:
                fields["releaseDate"] = date_parser.parse(aired_str.strip(), fuzzy=True)
            except Exception:
                fields["releaseDate"] = None
            fields["endDate"] = None
    else:
        fields["Aired"] = None
        fields["releaseDate"] = None
        fields["endDate"] = None

    # 4. Score
    m_score = re.search(r"Score:\s*([\d\.]+)", text)
    fields["animeScore"] = float(m_score.group(1)) if m_score else None
    
    # 5. Users (число пользователей, оценивших аниме)
    m_users = re.search(r"scored by\s*([\d,]+)", text)
    if m_users:
        try:
            fields["animeUsers"] = int(m_users.group(1).replace(",", ""))
        except Exception:
            fields["animeUsers"] = None
    else:
        fields["animeUsers"] = None
        
    # 6. Rank
    m_rank = re.search(r"Ranked:\s*#(\d+)", text)
    fields["animeRank"] = int(m_rank.group(1)) if m_rank else None
    
    # 7. Popularity
    m_pop = re.search(r"Popularity:\s*#(\d+)", text)
    fields["animePopularity"] = int(m_pop.group(1)) if m_pop else None
    
    # 8. Number of members
    m_members = re.search(r"Members:\s*([\d,]+)", text)
    if m_members:
        try:
            fields["animeNumMembers"] = int(m_members.group(1).replace(",", ""))
        except Exception:
            fields["animeNumMembers"] = None
    else:
        fields["animeNumMembers"] = None

    # Создаем временный объект BeautifulSoup для поиска по DOM
    temp_soup = BeautifulSoup(text, "html.parser")
    
    # 9. Synopsis (animeDescription): извлекаем текст из <p itemprop="description">
    synopsis_tag = temp_soup.find_all("p")
    fields["animeDescription"] = synopsis_tag.get_text(strip=True) if synopsis_tag else None
    
    # 10. Related Anime (animeRelated): ищем все <a> внутри <td class="pb24"> и оставляем уникальные значения
    related_td = temp_soup.find("td", class_="pb24")
    related_set = set()
    if related_td:
        for a in related_td.find_all("a", href=True):
            a_text = a.get_text(strip=True)
            if a_text:
                related_set.add(a_text)
    fields["animeRelated"] = list(related_set)
    
    # 11. Characters и 12. Voices:
    # Извлекаем все <td class="borderClass">. Нечётные элементы – Characters, чётные – Voices.
    tds = temp_soup.find_all("div", class_="detail-characters-list clearfix")
    animeCharacters = []
    animeVoices = []
    for i, td in enumerate(tds):
        if i % 2 == 0:
            animeCharacters.append(td.get_text(strip=True))
        else:
            animeVoices.append(td.get_text(strip=True))
    fields["animeCharacters"] = animeCharacters
    fields["animeVoices"] = animeVoices

    # 13. Staff (animeStaff): извлекаем информацию о персонале (имя и роль)
    animeStaff = []
    staff_container = temp_soup.find("div", class_="staff-section")
    if staff_container:
        for row in staff_container.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) >= 2:
                name = cols[0].get_text(strip=True)
                role = cols[1].get_text(strip=True)
                animeStaff.append([name, role])
    fields["animeStaff"] = animeStaff
    
    return fields


In [7]:
def extract_additional_fields(soup):
    additional = {}
    
    # 9. Synopsis (animeDescription): извлекаем текст из <p itemprop="description">
    synopsis_tag = soup.find("p", itemprop="description")
    additional["animeDescription"] = synopsis_tag.get_text(strip=True) if synopsis_tag else None
    
    # 10. Related Anime (animeRelated): ищем контейнер <div class="related-entries">
    # внутри него находим все <div class="title"> и извлекаем текст и href из вложенного <a>
    related_div = soup.find("div", class_="related-entries")
    related_dict = {}
    if related_div:
        title_divs = related_div.find_all("div", class_="title")
        for div in title_divs:
            a_tag = div.find("a", href=True)
            if a_tag:
                title_text = a_tag.get_text(strip=True)
                href = a_tag.get("href")
                if title_text and href and title_text not in related_dict:
                    related_dict[title_text] = href
    additional["animeRelated"] = [{"title": title, "href": href} for title, href in related_dict.items()]
    
    # 11. Characters: находим контейнер <div class="detail-characters-list clearfix">
    # и внутри него все <h3 class="h3_characters_voice_actors">
    animeCharacters = []
    detail_div = soup.find("div", class_="detail-characters-list clearfix")
    if detail_div:
        h3_tags = detail_div.find_all("h3", class_="h3_characters_voice_actors")
        for h3 in h3_tags:
            text_val = h3.get_text(strip=True)
            if text_val:
                animeCharacters.append(text_val)
    if animeCharacters == []:
        animeCharacters = None
    additional["animeCharacters"] = animeCharacters
        
    # 12. Voices: в том же контейнере detail_div ищем все <td> с классом, содержащим "va-t", "ar", "pl4", "pr4"
    animeVoices = []
    if detail_div:
        voice_tds = detail_div.find_all("td", class_=lambda x: x and all(cls in x.split() for cls in ["va-t", "ar", "pl4", "pr4"]))
        for td in voice_tds:
            # Извлекаем текст из вложенных <a>
            for a in td.find_all("a", href=True):
                text_val = a.get_text(strip=True)
                if text_val:
                    animeVoices.append(text_val)
    if animeVoices == []:
        animeVoices = None
    additional["animeVoices"] = animeVoices

    # 13. Staff (animeStaff): теперь ищем все <a> внутри того же контейнера detail_div
    animeStaff = []
    if detail_div:
        for a in detail_div.find_all("a", href=True):
            name = a.get_text(strip=True)
            if name:
                animeStaff.append(name)
    if animeStaff == []:
        animeStaff = None
    additional["animeStaff"] = animeStaff
    
    return additional


In [11]:
import re
import time
import random
from dateutil import parser as date_parser
from bs4 import BeautifulSoup
import requests

# Настраиваем сессию с заголовками, имитирующими браузер
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/107.0.5304.110 Safari/537.36")
}
session = requests.Session()
session.headers.update(headers)

data = []  # Список для хранения собранных данных

for link in anime_links[:]:
    try:
        response = session.get(link)
        if response.status_code == 200:
            # Создаем объект BeautifulSoup для всей страницы
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Извлекаем название аниме
            title_tag = soup.find("h1", class_="title-name h1_bold_none")
            anime_title = title_tag.get_text(strip=True) if title_tag else "Нет названия"
            
            # Для базовых характеристик используем объединенный текст из всех <div class="spaceit_pad">
            divs = soup.find_all("div", class_="spaceit_pad")
            combined_text = "\n".join(div.get_text(separator=" ", strip=True) for div in divs)
            fields = extract_fields(combined_text)
            
            # Извлекаем дополнительные поля с использованием DOM (soup)
            additional = extract_additional_fields(soup)
            
            # Собираем итоговую запись
            record = {
                "url": link,
                "animeTitle": anime_title,
            }
            record.update(fields)
            record.update(additional)
            data.append(record)
        else:
            print(f"Ошибка загрузки {link} - статус код: {response.status_code}")
    except Exception as e:
        print(f"Ошибка при обработке {link}: {e}")


In [29]:
df2 = pd.DataFrame(data)

In [31]:
df2.shape

(20000, 16)

In [33]:
df2

Unnamed: 0,url,animeTitle,animeType,animeNumEpisode,releaseDate,endDate,animeScore,animeUsers,animeRank,animePopularity,animeNumMembers,animeDescription,animeRelated,animeCharacters,animeVoices,animeStaff
0,https://myanimelist.net/anime/52991/Sousou_no_...,Sousou no Frieren,TV,28.0,2023-09-29,2024-03-22,9.31,604590.0,1,160,1037287,During their decade-long quest to defeat the D...,"[{'title': 'Sousou no Frieren 2nd Season', 'hr...","[Frieren, Fern, Stark, Himmel, Übel, Flamme, E...","[Tanezaki, Atsumi, Ichinose, Kana, Kobayashi, ...","[Frieren, Tanezaki, Atsumi, Fern, Ichinose, Ka..."
1,https://myanimelist.net/anime/5114/Fullmetal_A...,Fullmetal Alchemist: Brotherhood,TV,64.0,2009-04-05,2010-07-04,9.10,2196752.0,2,3,3483954,After a horrific alchemy experiment goes wrong...,"[{'title': 'Fullmetal Alchemist', 'href': 'htt...","[Elric, Edward, Elric, Alphonse, Mustang, Roy,...","[Park, Romi, Kugimiya, Rie, Miki, Shinichiro, ...","[Elric, Edward, Park, Romi, Elric, Alphonse, K..."
2,https://myanimelist.net/anime/9253/Steins_Gate,Steins;Gate,TV,24.0,2011-04-06,2011-09-14,9.07,1450196.0,3,14,2668679,Eccentric scientist Rintarou Okabe has a never...,[{'title': 'Steins;Gate: Oukoubakko no Porioma...,"[Okabe, Rintarou, Makise, Kurisu, Shiina, Mayu...","[Miyano, Mamoru, Imai, Asami, Hanazawa, Kana, ...","[Okabe, Rintarou, Miyano, Mamoru, Makise, Kuri..."
3,https://myanimelist.net/anime/60022/One_Piece_...,One Piece Fan Letter,TV Special,1.0,2024-10-20,NaT,9.06,69269.0,4,2343,91711,Although the golden age of piracy is about to ...,[{'title': 'One Piece Novel: Mugiwara Stories'...,"[Girl, Marine Older Brother, Monkey D., Luffy,...","[Kikuchi, Kokoro, Kase, Yasuyuki, Tanaka, Mayu...","[Girl, Kikuchi, Kokoro, Marine Older Brother, ..."
4,https://myanimelist.net/anime/38524/Shingeki_n...,Shingeki no Kyojin Season 3 Part 2,TV,10.0,2019-04-29,2019-07-01,9.05,1671420.0,5,21,2408638,Seeking to restore humanity's diminishing hope...,"[{'title': 'Shingeki no Kyojin Season 3', 'hre...","[Levi, Yeager, Eren, Ackerman, Mikasa, Arlert,...","[Kamiya, Hiroshi, Kaji, Yuuki, Ishikawa, Yui, ...","[Levi, Kamiya, Hiroshi, Yeager, Eren, Kaji, Yu..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,https://myanimelist.net/anime/53102/Chunkun,Chunkun,Movie,1.0,2018-07-01,NaT,,,20005,23742,116,"On a spring day, the heroine falls asleep in h...",[],,,
19996,https://myanimelist.net/anime/43824/Chuntian_L...,Chuntian Li De Xiaotian Shu,Movie,1.0,1992-03-02,NaT,,,20006,24674,99,No synopsis information has been added to this...,[],,,
19997,https://myanimelist.net/anime/54428/Chuuchuu,Chuuchuu,Movie,1.0,2015-02-02,NaT,,,20007,22685,146,A man was born. He met a woman whom he fell in...,[],,,
19998,https://myanimelist.net/anime/24927/Chuuchuu_B...,Chuuchuu Banban,Movie,1.0,1970-03-17,NaT,,,20008,18619,372,No synopsis information has been added to this...,[],"[Professor, Muller, Buck, Doragorou, Bun]","[Matsushima, Minori, Sakamoto, Shinpei, Ootake...","[Professor, Matsushima, Minori, Muller, Buck, ..."


In [23]:
df2.to_csv("anime.csv", index=False)
