In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def _parse_information(text):
    """Split the text of one ``div.information`` element into its fields.

    Args:
        text: Whitespace-stripped text of the element, one field per line
            (first line: type and episode count, then date, then members).

    Returns:
        Tuple ``(episodes, date, members)`` of strings. A field that cannot
        be located is returned as ``"N/A"``.
    """
    # The episode count is whatever sits between the last "(" and the first
    # " eps", e.g. "TV (64 eps)" -> "64".
    if " eps" in text:
        episodes = text.split(" eps")[0].split("(")[-1].strip()
    else:
        episodes = "N/A"

    # Blank lines are discarded before indexing. Otherwise a stray empty line
    # would shift the date/members positions, and an empty third line would
    # make ``split()[0]`` raise IndexError.
    fields = [line.strip() for line in text.split("\n") if line.strip()]
    date = fields[1] if len(fields) > 1 else "N/A"
    members = fields[2].split()[0] if len(fields) > 2 else "N/A"
    return episodes, date, members


def scrape_url(url, timeout=30):
    """Download one ranking page and return its columns as parallel lists.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        Dict with the keys ``'rank'``, ``'title'``, ``'score'``,
        ``'episodes'``, ``'date'``, ``'members'`` and ``'links'``, each
        mapping to a list of strings. ``'title'`` and ``'links'`` are built
        from the same table cells and therefore always have equal length.
        The other columns come from independent selectors, so their lengths
        can still differ if the page markup is irregular.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning empty columns.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # get_text(strip=True) already removes surrounding whitespace.
    anime_rank = [cell.get_text(strip=True) for cell in soup.select('td.rank.ac')]
    anime_score = [cell.get_text(strip=True) for cell in soup.select('td.score.ac.fs14')]

    # Title and link are taken from the same cell in one pass so the two
    # columns stay aligned row by row. A cell may hold several anchors (some
    # without text); the title is the first anchor that has text, the link is
    # the href of the first anchor.
    anime_title = []
    links = []
    for cell in soup.select('td.title.al.va-t.word-break'):
        anchors = cell.select('a')
        anchor_texts = (anchor.get_text(strip=True) for anchor in anchors)
        anime_title.append(next((text for text in anchor_texts if text), "N/A"))
        if anchors:
            links.append(anchors[0].get('href', "No Link Available"))
        else:
            links.append("No Link Available")

    # Episode count, date and member count all live in one text block.
    anime_eps = []
    anime_date = []
    anime_members = []
    for block in soup.select('div.information'):
        episodes, date, members = _parse_information(
            block.get_text(separator=" ").strip()
        )
        anime_eps.append(episodes)
        anime_date.append(date)
        anime_members.append(members)

    data = {
        'rank': anime_rank,
        'title': anime_title,
        'score': anime_score,
        'episodes': anime_eps,
        'date': anime_date,
        'members': anime_members,
        'links': links
    }

    return data

In [3]:
# The four ranking pages to scrape. The `limit` query values step by 50,
# presumably the zero-based offset of each page's first entry (TODO confirm).
url_1, url_2, url_3, url_4 = (
    'https://myanimelist.net/topanime.php?limit=0',
    'https://myanimelist.net/topanime.php?limit=50',
    'https://myanimelist.net/topanime.php?limit=100',
    'https://myanimelist.net/topanime.php?limit=150',
)

# Scrape the pages in order; each result is a dict of column lists.
data_url_1, data_url_2, data_url_3, data_url_4 = (
    scrape_url(page_url) for page_url in (url_1, url_2, url_3, url_4)
)

# Concatenate every column across the four pages, page 1 first. The keys of
# the first page decide which columns exist; a page missing a key adds nothing.
combined_data = {
    key: [
        value
        for page in (data_url_1, data_url_2, data_url_3, data_url_4)
        for value in page.get(key, [])
    ]
    for key in data_url_1
}

In [None]:
# Target number of rows; shorter columns are padded up to this length.
expected_count = 200


def pad_list(lst, length, default_value=None):
    """Return a new list: ``lst`` followed by enough ``default_value`` entries to reach ``length``.

    A list that already has ``length`` or more items comes back as an
    unchanged copy; nothing is ever truncated and ``lst`` is not mutated.
    """
    shortfall = length - len(lst)
    # range() of a non-positive number is empty, so no filler is produced then.
    filler = [default_value for _ in range(shortfall)]
    return lst + filler

# Columns of the scraped table, in the order they are printed below.
columns = ('rank', 'title', 'score', 'episodes', 'date', 'members', 'links')

# Pad every column to one common length so the rows can be zipped here and
# the columns can later be assembled into a table. pad_list never truncates,
# so the target is the longest column whenever one exceeds expected_count;
# padding only to expected_count would leave the columns unequal in that case.
target_length = max(
    [expected_count] + [len(combined_data[column]) for column in columns]
)
for column in columns:
    combined_data[column] = pad_list(combined_data[column], target_length)

# Padded link slots hold None; show the same placeholder scrape_url uses for
# a missing link instead.
combined_data['links'] = [
    "No Link Available" if link is None else link
    for link in combined_data['links']
]

# Print one comma-separated line per row for a quick visual check.
for rank, title, score, eps, date, members, link in zip(
    *(combined_data[column] for column in columns)
):
    print(f'{rank}, {title}, {score}, {eps}, {date}, {members}, {link}')

In [None]:
# Map each output header to the scraped column that feeds it; the dict order
# is the column order of the resulting table.
header_to_column = {
    'Rank': 'rank',
    'Title': 'title',
    'Total Episode': 'episodes',
    'Release Date': 'date',
    'Members': 'members',
    'Score': 'score',
    'Link': 'links',
}
df = pd.DataFrame(
    {header: combined_data[column] for header, column in header_to_column.items()}
)

# Preview the first rows before writing the file.
print(df.head())

# index=False keeps the positional row index out of the CSV.
df.to_csv('anime.csv', index=False)