# 1.1. Get the list of animes   
We start from the list of animes to include in your corpus of documents. In particular, we focus on the top animes ever list. From this list we want to collect the url associated to each anime in the list. The list is long and splitted in many pages. We ask you to retrieve only the urls of the animes listed in the first 400 pages (each page has 50 animes so you will end up with 20000 unique anime urls).

The output of this step is a .txt file whose single line corresponds to an anime's url.

#1.2. Crawl animes    
Once you get all the urls in the first 400 pages of the list, you:

Download the html corresponding to each of the collected urls.
After you collect a single page, immediately save its html in a file. In this way, if your program stops, for any reason, you will not lose the data collected up to the stopping point. More details in Important (2).
Organize the entire set of downloaded html pages into folders. Each folder will contain the htmls of the animes in page 1, page 2, ... of the list of animes.


In [None]:
!git clone https://github.com/S4b3/ADM-HW3-Group1.git

Cloning into 'ADM-HW3-Group1'...
remote: Enumerating objects: 38664, done.[K
remote: Counting objects: 100% (38664/38664), done.[K
remote: Compressing objects: 100% (17730/17730), done.[K
remote: Total 38664 (delta 20973), reused 38614 (delta 20929), pack-reused 0[K
Receiving objects: 100% (38664/38664), 118.05 MiB | 22.82 MiB/s, done.
Resolving deltas: 100% (20973/20973), done.
Checking out files: 100% (38251/38251), done.


In [None]:
# Install BeautifulSoup, this will be needed to crawl the web
#!pip3 install beautifulsoup4
#!pip3 install tqdm
#!pip3 install nltk



In [1]:
# Import asyncio, this will be needed to perform asynchronous operations
import asyncio
# HTTP Requests library
import requests
from bs4 import BeautifulSoup
# Importing multiprocessing to assign operations to threadpools
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool
# Importing this to create necessary directories
import pathlib
from pathlib import Path
from datetime import datetime
import re
import csv
from tqdm import tqdm
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import word_tokenize
from joblib import Parallel, delayed
import json 

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Valentino\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Valentino\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
'''
Defining the amount of cores available for the process to use. If this slows your machine too much, hardcode it. 
EX: AVAILABLE_CORES = 4
'''
AVAILABLE_CORES = multiprocessing.cpu_count()
print(AVAILABLE_CORES)

# Initializing ThreadPools 
pool = ThreadPool(AVAILABLE_CORES)

12


In [None]:
'''
This function performs an HTTP Get Request to MyAnimeList and places its results in a given array.
Params: 
    [index] : Simply the page index. Sets up the url for pagination and defines where the page will be placed inside [destination_array]
    [destination_array] : where the retrieved page will be stored. The result will be placed in index [index]
'''
def fetch_page(index, destination_array):
    destination_array[index] = requests.get(f"https://myanimelist.net/topanime.php{'?limit={}'.format(50*index) if(index > 0) else ''}")
    
'''
Finds all URL contained in a MyAnimeList top animes page, then substitutes them to the starting page inside [pages] array.
Params: 
    [page]  : MyAnimeList's Top Animes HTML Page
    [pages] : Array containing all the pages. 
'''
def fetch_urls_in_page(page, pages):
    # Defining an html parser
    soup = BeautifulSoup(page.content, "html.parser")
    # Find all URLs
    animeUrls = soup.find_all("a", class_="hoverinfo_trigger fl-l ml12 mr8", id=lambda x: x and x.startswith('#area'), href=True)
    animeUrls = [a['href'] for a in animeUrls]
    # Substitues starting page with its URLs
    pages[pages.index(page)] = animeUrls



In [None]:
# Defining pages variables based on how many pages we want to retrieve
pages = [None] * 400
pages_num = range(0,400)

# Crawl Top Animes pages 
pool.map(lambda num : fetch_page(num, pages), pages_num)   
print("Done fetching the pages!")

print("Going to fetch urls")
# Scraping all URLs present in the crawled pages
pool.map(lambda page : fetch_urls_in_page(page, pages), pages)
print("Done fetching urls")

Done fetching the pages!
Going to fetch urls
Done fetching urls


In [None]:
'''
Performs a GET Request on a given [url] and saves its results as an HTML inside a folder called "page_[folder]".
The HTML file will be named "article_[index].html"
'''
def fetch_anime_and_parse_html(url, folder, index):
    # Get current page
    req = requests.get(url)
    # MyAnimeList might refuse to respond to large amount of requests, if this happens, we need to stop the process
    if(req.status_code != 200) : 
        raise Exception(f"My anime list has closed the connection.\nComplete the captcha and restart the process.\nCurrent Page was : {index}")
    # Define page's absolute destination path
    _directory_path = f"{pathlib.Path().resolve()}/dataset/page_{folder}"
    # Check if current page's destination folder exists... if not, create it!
    Path(_directory_path).mkdir(parents=True, exist_ok=True)
    # Write the html file in the destination directory.
    with open(f"{_directory_path}/article_{index}.html", 'w') as file:
        file.write(req.text)
    

'''
Assigns fetching to all available threads and calls (fetch_anime_and_parse_html) with given [folderNumber]
'''
def fetch_animes_and_save_file(urls, folderNumber):
    pool = ThreadPool(AVAILABLE_CORES)
    pool.map(lambda url : fetch_anime_and_parse_html(url, folderNumber, (50*(folderNumber-1)) + urls.index(url) +1), urls)
    

In [None]:
# Fetch animes for every requested page

'''
Here we fetch and save animes in html files. 
Starting_page defines from which page you want to resume the process. (It works as an index)

EX: 
    to start from scratch:
        starting_page = 0
    if you want to start from the 10th page:
        starting_page = 9
    if you want to set 200 as an upper bound:
        last_page = 199   
'''
starting_page = 384
last_page = len(pages)
pages_to_process = pages[starting_page:]
for i in range(0, len(pages_to_process)) : 
    fetch_animes_and_save_file(pages_to_process[i], starting_page+i+1)

1.3 Parse downloaded pages
At this point, you should have all the html documents about the animes of interest and you can start to extract the animes informations. The list of information we desire for each anime and their format is the following:

Anime Name (to save as animeTitle): String.  
Anime Type (to save as animeType): String.   
Number of episode (to save as animeNumEpisode): Integer.   
Release and End Dates of anime (to save as releaseDate and endDate): Convert both release and end date into datetime format.     
Number of members (to save as animeNumMembers): Integer.    
Score (to save as animeScore): Float.   
Users (to save as animeUsers): Integer    
Rank (to save as animeRank): Integer.    
Popularity (to save as animePopularity): Integer.    
Synopsis (to save as animeDescription): String.    
Related Anime (to save as animeRelated): Extract all the related animes, but only keep unique       values and those that have a hyperlink associated to them. List of strings.        
Characters (to save as animeCharacters): List of strings.         
Voices (to save as animeVoices): List of strings.     
Staff (to save as animeStaff): Include the staff  name and their responsibility/task in a list of lists.     

In [5]:
## Defining classes for each argument:
def extract_element_from_html(html, html_tag, class_name="", attrs= {}) :
  # title class_name
  soup = BeautifulSoup(html, "html.parser")
  # Find given content
  content = soup.find(html_tag, class_=class_name, attrs= attrs)
  # print(f"Found {html_tag}: {content}")
  return content

def extract_element_from_information_content_by_span_text(html, span_text) :
  # title class_name
  soup = BeautifulSoup(html, "html.parser")
  # Find given gontent
  pads = soup.find_all("div", class_="spaceit_pad", )
  for el in pads :
    span = el.find('span')
    if(span != None and span.text == span_text):
      a = el.find('a')
      if(a != None): 
        return a.text
      contents = el.contents
      if(len(contents) >= 2): 
        return contents[2].strip("\n ")
  return ""

def extract_related_animes(html):
  soup = BeautifulSoup(html, "html.parser")
  subtag = soup.find("table", "anime_detail_related_anime")
  #print(f"Found subtag {subtag}")
  related_animes = []
  if(subtag != None): 
    for el in subtag.find_all("a", href=True):
      #print(el)
      text = el.text
      if(text not in related_animes):
        related_animes.append(text)
  return related_animes


def extract_text_list_from_soup_and_class_names(soup, html_tag, class_name):
  tag_list = soup.find_all(html_tag, class_name)
  output = []
  for el in tag_list:
    text = el.text
    if(text not in output):
      output.append(text)
  return output 

def extract_soups_tag_list(html, html_tag, class_name):
  soup = BeautifulSoup(html, "html.parser")
  output = soup.find_all(html_tag, class_name)
  #print(len(output))
  return output

def parseDate(date, formats, file_path):
  for fmt in formats:
    try:
        return datetime.strptime(date, fmt)
    except ValueError:
        pass
  print(f"No valid date format found for : {date} on {file_path}")
  return ""

def extract_informations_from_anime_html(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        file_path = str(file_path)
        try:
            html = f.read()
        except:
            print("Exception reading html")
        animeTitle = extract_element_from_html(html, "h1", "title-name h1_bold_none")
        animeTitle = "" if animeTitle == None else animeTitle.text

        animeType = extract_element_from_information_content_by_span_text(html, "Type:")
        animeNumEpisode = extract_element_from_information_content_by_span_text(html, "Episodes:")
        rel_and_end_dates = extract_element_from_information_content_by_span_text(html, "Aired:")

        dates = rel_and_end_dates.split(" to ")
        date_formats = ["%b %d, %Y", "%Y", "%b %Y"]

        releaseDate = ""
        if (dates[0] != None) :
          releaseDate = parseDate(dates[0], date_formats, file_path)


        endDate = ""
        if (len(dates) >= 2 and dates[1] != None) :
          endDate = parseDate(dates[1], date_formats, file_path)

        animeNumMembers = ""
        try : 
          animeNumMembers = int(extract_element_from_html(html, "span", "numbers members").text.split()[1].replace(',', ''))
        except Exception as e :
          pass
          #print(f"animeNumMembers - {e} on {file_path}");

        animeScore = ""
        try:
          animeScore = float(extract_element_from_html(html, "div", "score-label").text)
        except Exception as e :
          pass
          #print(f"animeScore - {e} on {file_path}");
        animeUsers = ""
        try: 
          animeUsers = int(extract_element_from_html(html, "div", "fl-l score").get('data-user').split()[0].replace(',',''))
        except Exception as e :
          pass
          #print(f"animeUsers - {e} on {file_path}");
        animeRank = ""
        try: 
          animeRank = int(extract_element_from_html(html, "span", "numbers ranked").text.split()[1].replace('#', '').replace(',',''))
        except Exception as e :
          pass
          #print(f"animeRank - {e} on {file_path}");

        animePopularity = ""
        try:
          animePopularity = int(extract_element_from_html(html, "span", "numbers popularity").text.split()[1].replace('#', '').replace(',',''))
        except Exception as e :
          pass
          #print(f"animePopularity - {e} on {file_path}");
        animeDescription = ""
        try:
          animeDescription = extract_element_from_html(html, "p", "", {"itemprop": "description"}).text
        except Exception as e :
          pass
          #print(f"animeDescription - {e} on {file_path}");
        animeRelated = extract_related_animes(html)
        char_voices_staff_table = extract_soups_tag_list(html, "div", "detail-characters-list clearfix")

        animeCharacters = []
        try: 
          animeCharacters = extract_text_list_from_soup_and_class_names(char_voices_staff_table[0], "h3", "h3_characters_voice_actors")
        except Exception as e :
          pass
          #print(f"animeCharacters {e} on {file_path}")

        animeVoices = []
        try: 
          animeVoices = extract_text_list_from_soup_and_class_names(char_voices_staff_table[0], "td", "va-t ar pl4 pr4")
          animeVoices = [voice.strip('\n').split('\n')[0] for voice in animeVoices]
        except Exception as e :
          pass
          #print(f"animeVoices {e} on {file_path}") 

        animeStaff = []
        try: 
          animeStaff = extract_text_list_from_soup_and_class_names(char_voices_staff_table[1], "td", "borderClass")
          animeStaff = [re.split('\n+', staff) for staff in list(filter(None, [staff.strip('\n') for staff in animeStaff]))]
        except Exception as e :
          pass
          #print(f"animeStaff {e} on {file_path}") 

        article_i = re.findall(re.compile('[0-9]+'), file_path.split('/n')[-1])[-1]
        inherited_name = f"anime_{article_i}.tsv"
        #print(inherited_name)
        Path("./tsv_dataset").mkdir(parents=True, exist_ok=True)

        with open('./tsv_dataset/{}'.format(inherited_name), 'wt', encoding="utf-8") as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            tsv_writer.writerow(['animeTitle','animeType','animeNumEpisode','releaseDate','endDate','animeNumMembers','animeScore',
                                 'animeUsers','animeRank','animePopularity','animeDescription','animeRelated','animeCharacters','animeVoices','animeStaff'])
            tsv_writer.writerow([animeTitle, animeType, animeNumEpisode, releaseDate, endDate, animeNumMembers, animeScore,
                                 animeUsers, animeRank, animePopularity, animeDescription, animeRelated, animeCharacters, animeVoices, animeStaff])



In [6]:
#502

extract_informations_from_anime_html('./dataset/page_1/article_1.html')

In [7]:
# find all files into directories
matches = pathlib.Path("./dataset").glob("**/*.html")
# multiprocess parsing every html into a tsv
result = Parallel(n_jobs=AVAILABLE_CORES)(delayed(extract_informations_from_anime_html)(path) for path in tqdm(matches))
#pool.map(extract_informations_from_anime_html, matches)

19124it [21:05, 15.11it/s]


# 2. Search Engine   
Now, we want to create two different Search Engines that, given as input a query, return the animes that match the query.   

First, you must pre-process all the information collected for each anime by:

**Removing stopwords**   
**Removing punctuation**   
**Stemming**   
**Anything else you think it's needed**   

For this purpose, you can use the nltk library.

In [53]:
def preprocess_string(input_string, stemmer = EnglishStemmer(), tokenizer = word_tokenize ) :
    if not input_string :
        return ''
    # define stopwords
    stop_words = set(stopwords.words('english'))
    # define punctuation
    punctuation = string.punctuation
    translation_table = str.maketrans('', '', punctuation)
    output = []
    for token in [t.lower() for t in tokenizer(input_string)]:
        #print(f"Processing token : {token}")
        #print("removing punctuation")
        token = token.translate(translation_table)
        try :
            if token == '' or token in stop_words:
              #print("token was a stopword, continuing.")
              continue
        except Exception as e:
            print(f"{e} thrown while using stop_words")
        #print("token was NOT a stopword")
        #print(f"token after punctuation removal: {token}")
        if stemmer:
            #print("Stemming token")
            token = stemmer.stem(token)
            #print(f"token after stemming was {token}")
        output.append(token)
        #print(output)
    return ' '.join(output)
      
def preprocess_tsv(file_path):
    file_name = str(file_path).split('\\')[-1]
    with open(file_path, 'r', newline='', encoding="utf-8") as f:
        Path("./preprocessed_dataset").mkdir(parents=True, exist_ok=True)
        output = {}
        tsv = csv.reader(f, delimiter='\t')
        
        columns = next(tsv)
        next(tsv)
        data = next(tsv)
        for i in range(len(columns)) :
            if(columns[i] not in ['releaseDate', 'endDate']) :
                output[columns[i]] = preprocess_string(data[i])
            else :
                output[columns[i]] = data[i]
        print(file_name)
        print(file_name.split('.tsv')[0])
        with open('./preprocessed_dataset/{}.json'.format(file_name.split('/')[-1].split('.')[0]), 'w', encoding="utf-8") as out_file:
            json.dump(output, out_file)
        
                
#         for row in tsv :
#             if(row) :
#                 tsv_value = ''
#                 if(len(row) > 1):
#                     if (row[0] not in ['releaseDate', 'endDate']) :
#                         tsv_value = preprocess_row(row[1])
#                     else :
#                         tsv_value = row[1]
#                 output[row[0]] = tsv_value

            
        
        


In [54]:
preprocess_tsv('./tsv_dataset/anime_1.tsv')

./tsv_dataset/anime_1.tsv
./tsv_dataset/anime_1


In [None]:
tsv_matches = pathlib.Path("./tsv_dataset").glob("*.tsv")
result = pool.map(preprocess_tsv, tsv_matches)