# 1.1. Get the list of animes   
We start from the list of animes to include in your corpus of documents. In particular, we focus on the top animes ever list. From this list we want to collect the url associated to each anime in the list. The list is long and splitted in many pages. We ask you to retrieve only the urls of the animes listed in the first 400 pages (each page has 50 animes so you will end up with 20000 unique anime urls).

The output of this step is a .txt file whose single line corresponds to an anime's url.

#1.2. Crawl animes    
Once you get all the urls in the first 400 pages of the list, you:

Download the html corresponding to each of the collected urls.
After you collect a single page, immediately save its html in a file. In this way, if your program stops, for any reason, you will not lose the data collected up to the stopping point. More details in Important (2).
Organize the entire set of downloaded html pages into folders. Each folder will contain the htmls of the animes in page 1, page 2, ... of the list of animes.


In [2]:
# Install BeautifulSoup, this will be needed to crawl the web
!pip3 install beautifulsoup4



In [121]:
# Import asyncio, this will be needed to perform asynchronous operations
import asyncio
# HTTP Requests library
import requests
from bs4 import BeautifulSoup
# Importing multiprocessing to assign operations to threadpools
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool
# Importing this to create necessary directories
import pathlib
from pathlib import Path

In [122]:
'''
Defining the amount of cores available for the process to use. If this slows your machine too much, hardcode it. 
EX: AVAILABLE_CORES = 4
'''
AVAILABLE_CORES = multiprocessing.cpu_count()
print(AVAILABLE_CORES)

8


In [112]:
def fetch_urls_in_page(page, pages):
    soup = BeautifulSoup(page.content, "html.parser")
    animeUrls = soup.find_all("a", class_="hoverinfo_trigger fl-l ml12 mr8", id=lambda x: x and x.startswith('#area'), href=True)
    animeUrls = [a['href'] for a in animeUrls]
    pages[pages.index(page)] = animeUrls

def fetch_page(page_num, pages):
    pages[page_num] = requests.get(f"https://myanimelist.net/topanime.php{'?limit={}'.format(50*page_num) if(page_num > 0) else ''}")

In [113]:
pages = [None] * 400
pages_num = range(0,400)
pool = ThreadPool(AVAILABLE_CORES)

# Prepare the pages needed to find all the urls
pool.map(lambda num : fetch_page(num, pages), pages_num)   
print("Done fetching the pages!")
print("Going to fetch urls")
pool.map(lambda page : fetch_urls_in_page(page, pages), pages)

Done fetching the pages!
Going to fetch urls


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [123]:
def fetch_anime_and_parse_html(url, folder, index):
    # Get current page
    req = requests.get(url)
    if(req.status_code != 200) : 
        raise Exception(f"My anime list has closed the connection.\nComplete the captcha and restart the process.\nCurrent Page was : {index}")
    # Define page's absolute destination path
    _directory_path = f"{pathlib.Path().resolve()}/dataset/page_{folder}"
    # Check if current page's destination folder exists... if not, create it!
    Path(_directory_path).mkdir(parents=True, exist_ok=True)
    # Write the html file in the destination directory.
    with open(f"{_directory_path}/article_{index}.html", 'w') as file:
        file.write(req.text)
    

def fetch_animes_and_save_file(urls, folderNumber):
    pool = ThreadPool(AVAILABLE_CORES)
    pool.map(lambda url : fetch_anime_and_parse_html(url, folderNumber, (50*(folderNumber-1)) + urls.index(url) +1), urls)
    

In [None]:
# Fetch animes for every requested page

'''
Here we fetch and save animes in html files. 
Starting_page defines from which page you want to resume the process. (It works as an index)

EX: 
    to start from scratch:
        starting_page = 0
    if you want to start from the 10th page:
        starting_page = 9
    if you want to set 200 as an upper bound:
        last_page = 199   
'''
starting_page = 76
last_page = len(resuming_from)
resuming_from = pages[starting_page:]
for i in range(0, last_page) : 
    fetch_animes_and_save_file(resuming_from[i], starting_page+i+1)