In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
df_list = []

In [3]:
import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import tqdm

soup_lock = asyncio.Lock()  # Acts like your original thread-safe soup usage

def scrape_anime_detail_sync(html, page_url):
    soup = BeautifulSoup(html, "html.parser")
    detail_block = soup.find("div", class_="detail")
    if not detail_block:
        return None

    genres, country, premiered, date_aired, broadcast, duration, status, rating, votes, studios, producers = [], "nan", "nan", "nan", "nan", "nan", "nan", np.nan, "nan", [], []

    for div in detail_block.find_all("div"):
        text = div.text.strip()
        if text.startswith("Genres:"):
            genres = [a.text.strip() for a in div.find("span").find_all("a")]
        elif text.startswith("Country:"):
            country = div.find("span").find("a").text.strip()
        elif text.startswith("Premiered:"):
            premiered = div.find("span").text.strip()
        elif text.startswith("Date aired:"):
            date_aired = div.find("span").text.strip()
        elif text.startswith("Broadcast:"):
            broadcast = div.find("span").text.strip()
        elif text.startswith("Duration:"):
            duration = div.find("span").text.strip()
        elif text.startswith("Status:"):
            status = div.find("span").text.strip()
        elif text.startswith("MAL:") or text.startswith("Scores:"):
            span = div.find("span")
            if span:
                contents = span.contents
                if len(contents) > 0:
                    rating = contents[0].strip() if isinstance(contents[0], str) else contents[0].text.strip()
                if len(contents) > 1:
                    votes_text = contents[1].strip() if isinstance(contents[1], str) else contents[1].text.strip()
                    # Extract number only from votes string, e.g. 'by 13,298 users' -> 13298
                    votes_match = re.search(r'(\d[\d,]*)', votes_text)
                    votes = votes_match.group(1).replace(',', '') if votes_match else "nan"
        elif text.startswith("Studios:"):
            studios = [a.text.strip() for a in div.find("span").find_all("a")]
        elif text.startswith("Producers:"):
            producers = [a.text.strip() for a in div.find("span").find_all("a")]

    return {
        "URL": "https://animekai.bz" + page_url,
        "Genres": genres,
        "Country": country,
        "premiered": premiered,
        "Release Date": date_aired,
        "Broadcast": broadcast,
        "EP1 duration": duration,
        "Status": status,
        "Rating": rating,
        "Votes": votes,
        "Studio": studios,
        "Producer": producers
    }

async def scrape_anime_detail_combined(session, page_url, en_title, jp_title, sub, dub, ep, typ, tag, headers):
    try:
        async with session.get("https://animekai.bz" + page_url, headers=headers) as r:
            html = await r.text()

        detail = scrape_anime_detail_sync(html, page_url)
        if detail:
            detail.update({
                "Eng Title": en_title,
                "Jap Title": jp_title,
                "eng sub": sub,
                "eng dub": dub,
                "episode": ep,
                "type": typ,
                "18+": tag
            })
        return detail
    except Exception as e:
        print(f"Failed to scrape {page_url}: {e}")
        return None

async def process_page(i):
    url = f"https://animekai.bz/browser?keyword=&type%5B%5D=movie&type%5B%5D=tv&type%5B%5D=ova&type%5B%5D=ona&type%5B%5D=special&sort=title_az&page={i}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
    animekai_webpage = requests.get(url, headers=headers).text
    soup2 = BeautifulSoup(animekai_webpage, "html.parser")

    en_titles = [tag.text.strip() for tag in soup2.find_all('a', class_="title")]
    jp_titles = [i.get("data-jp", "np.nan").strip() for i in soup2.find_all("a", class_="title")]

    sub_list = [
        int(tag.find("span", class_="sub").text) if tag.find("span", class_="sub") and tag.find("span", class_="sub").text.isdigit() else "nan"
        for tag in soup2.find_all("div", class_="aitem")
    ]
    dub_list = [
        int(tag.find("span", class_="dub").text) if tag.find("span", class_="dub") and tag.find("span", class_="dub").text.isdigit() else np.nan
        for tag in soup2.find_all("div", class_="aitem")
    ]
    episodes_and_type = [
        [span.find("b").text.strip() for span in tag.find_all("span") if span.find("b")]
        for tag in soup2.find_all("div", class_="aitem")
    ]

    ep_list, type_list = [], []
    for entry in episodes_and_type:
        if len(entry) == 2:
            ep_val = int(entry[0]) if entry[0].isdigit() else np.nan
            type_val = entry[1]
        elif len(entry) == 1:
            ep_val = np.nan
            type_val = entry[0]
        else:
            ep_val = np.nan
            type_val = np.nan
        ep_list.append(ep_val)
        type_list.append(type_val)

    tag_list = [bool(tag.find(class_="adult")) for tag in soup2.find_all("div", class_="tags")]
    page_urls = [block.find("a", class_="poster").get("href") for block in soup2.find_all("div", class_="aitem")]

    anime_data = zip(en_titles, jp_titles, sub_list, dub_list, ep_list, type_list, tag_list, page_urls)
    results = []

    async with aiohttp.ClientSession() as session:
        tasks = [
            scrape_anime_detail_combined(session, url, en, jp, sub, dub, ep, typ, tag, headers)
            for en, jp, sub, dub, ep, typ, tag, url in anime_data
        ]
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc=f"Details on page {i}", colour="blue", leave=False, position=0):
            result = await f
            if result:
                results.append(result)

    return pd.DataFrame(results)

async def main():
    df_list = []
    for i in tqdm(range(1, 394), desc="Scraping pages", position=1, colour="green"):
        df = await process_page(i)
        df_list.append(df)
    final_df = pd.concat(df_list).reset_index(drop=True)
    
    final_df["Rating"] = pd.to_numeric(final_df["Rating"], errors="coerce").astype("float16")
    final_df["Release Date"] = pd.to_datetime(final_df["Release Date"], format="%Y-%m-%d", errors="coerce")  
    
    final_df.to_csv("all_anime.csv")
    
# Run it
import nest_asyncio
nest_asyncio.apply()  # Needed if running inside Jupyter
asyncio.run(main())  


                                                                                                                       2.43it/s][0m[A
                                                                                                                       7.23it/s][0m[A
                                                                                                                       6.11it/s][0m[A
                                                                                                                       1.02it/s][0m[A
                                                                                                                       4.33it/s][0m[A
                                                                                                                       1.08it/s][0m[A
                                                                                                                       4.53it/s][0m[A
                                               

Failed to scrape /watch/onigiri-ni-naritai-cogimyun-pvqq: 'NoneType' object has no attribute 'text'


                                                                                                                       
                                                                                                                       2.36it/s][0m[A
                                                                                                                       8.92it/s][0m[A
                                                                                                                       5.47it/s][0m[A
                                                                                                                       0.46it/s][0m[A
                                                                                                                       3.64it/s][0m[A
                                                                                                                       7.04it/s][0m[A
                                                                

In [4]:
df = pd.read_csv("all_anime.csv")
# Replace common placeholders with np.nan
df.replace({
    "": np.nan,
    "?": np.nan,
    "Unknown 2025": np.nan,
    "by ? users": np.nan,
    "[]": np.nan
}, inplace=True)
df.to_csv("all_anime.csv")