In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [65]:
# Page to scrape: MyAnimeList's top-manga ranking, filtered with type=manga
url = "https://myanimelist.net/topmanga.php?type=manga"

In [66]:
# Send a GET request with a browser-like User-Agent header
headers = {"User-Agent": "Mozilla/5.0"}
# Without a timeout, requests waits indefinitely if the server never answers
response = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page into an empty table
response.raise_for_status()

In [67]:
# Parse the response body into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(response.text, 'html.parser')

In [68]:
# Show the parsed document to inspect its structure (the output is the full page HTML)
soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html class="appearance-none" lang="en">
<head>
<link crossorigin="anonymous" href="//www.googletagmanager.com/" rel="preconnect"/>
<link crossorigin="anonymous" href="https://cdn.myanimelist.net" rel="preconnect"/>
<title>
  Top Manga - Top Manga - MyAnimeList.net
</title>
<meta content="Browse the highest-ranked manga on MyAnimeList, the internet's largest manga database. Find the top manga, novels, one-shots and more!" name="description"/>
<meta content="anime, myanimelist, anime news, manga" name="keywords"/>
<link href="https://myanimelist.net/topmanga.php?type=manga&amp;limit=50" rel="next"/>
<meta content="en_US" property="og:locale"/><meta content="360769957454434" property="fb:app_id"/><meta content="MyAnimeList.net" property="og:site_name"/><meta content="summary" name="twitter:card"/><meta content="@myanimelist" name="twitter:site"/><meta content=" 

In [69]:
# One empty list per output column; the scraping loop appends to each of them
titles, ranks, scores, authors = [], [], [], []

In [70]:
# Start from empty lists so that re-running this cell rebuilds the columns
# instead of appending a second copy of every row.
ranks, titles, scores, authors = [], [], [], []


def text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag is missing."""
    return tag.text.strip() if tag is not None else "N/A"


for row in soup.select(".ranking-list"):
    # Every field uses the same "N/A" fallback, so a row lacking one element
    # can neither raise AttributeError nor leave the lists with different lengths.
    ranks.append(text_or_na(row.select_one(".top-anime-rank-text")))
    titles.append(text_or_na(row.select_one(".manga_h3 a")))
    scores.append(text_or_na(row.select_one(".score-label")))
    # Despite the list's name, this element holds the volumes / dates / members
    # text that is split into separate columns later in the notebook.
    authors.append(text_or_na(row.select_one(".information.di-ib.mt4")))

In [71]:
# Assemble the scraped lists into one table, one column per list
df = pd.DataFrame(
    dict(zip(("Rank", "Title", "Score", "Author"), (ranks, titles, scores, authors)))
)

In [72]:
# Preview the first rows of the scraped table; "Author" still holds the raw info text
df.head()

Unnamed: 0,Rank,Title,Score,Author
0,1,Berserk,9.47,Manga (? vols)\n Aug 1989 - \n 7...
1,2,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.32,Manga (24 vols)\n Jan 2004 - Apr 2011\n...
2,3,Vagabond,9.26,Manga (37 vols)\n Sep 1998 - May 2015\n...
3,4,One Piece,9.22,Manga (? vols)\n Jul 1997 - \n 6...
4,5,Monster,9.16,Manga (18 vols)\n Dec 1994 - Dec 2001\n...


In [73]:
# Function to extract volumes, release date, and members count
def extract_info(info):
    """Split one raw information string into volumes, release date and members.

    Parameters
    ----------
    info : str
        Text containing, in any layout, a "(N vols)" or "(? vols)" marker,
        a "Mon YYYY" start date optionally followed by " - Mon YYYY", and a
        "<count> members" figure.

    Returns
    -------
    tuple of (str, str, str)
        ``(volumes, release_date, members)``. Each field is the matched text,
        or ``"Unknown"`` when its pattern is not found. ``members`` is returned
        exactly as matched, including any thousands separators.
    """
    volumes_match = re.search(r'\((\d+|\?) vols\)', info)
    # The end date is optional, so an open range such as "Aug 1989 - "
    # yields only the start date.
    date_match = re.search(r'([A-Za-z]+ \d{4}(?: - [A-Za-z]+ \d{4})?)', info)
    # Take the whole run of digits and commas before " members"; a fixed
    # 1-3 digit prefix would truncate an unseparated count ("1234" -> "234").
    members_match = re.search(r'(\d[\d,]*) members', info)

    volumes = volumes_match.group(1) if volumes_match else "Unknown"
    release_date = date_match.group(1) if date_match else "Unknown"
    members = members_match.group(1) if members_match else "Unknown"

    return volumes, release_date, members

In [74]:
# df['Author'] holds the raw information text; parse it into three columns.
# Building the frame from a list of tuples with explicit columns and index
# avoids creating one Series per row and fixes the result's shape up front
# rather than leaving it to apply's inference.
info_columns = ['Volumes', 'Release Date', 'Members']
parsed_info = pd.DataFrame(
    [extract_info(text) for text in df['Author']],
    columns=info_columns,
    index=df.index,
)

# Replace the raw column with the parsed ones (appended after the existing columns)
df = pd.concat([df.drop(columns=['Author']), parsed_info], axis=1)

In [75]:
# Preview the table after "Author" was replaced by Volumes / Release Date / Members
df.head()

Unnamed: 0,Rank,Title,Score,Volumes,Release Date,Members
0,1,Berserk,9.47,?,Aug 1989,735627
1,2,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.32,24,Jan 2004 - Apr 2011,285336
2,3,Vagabond,9.26,37,Sep 1998 - May 2015,413484
3,4,One Piece,9.22,?,Jul 1997,650050
4,5,Monster,9.16,18,Dec 1994 - Dec 2001,262585


In [80]:
# Save the table to top_mangas_mal.csv; index=False leaves the row index out of the file
df.to_csv("top_mangas_mal.csv", index=False)