## 1. Import libraries and locate selenium webdriver


In [1]:
## load libraries

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup, SoupStrainer
import requests
import time, os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

chromedriver = "C:/Users/weiji/chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver


In [2]:
#define function to direct selenium to click on the links within the page
def hover(self):
    wd = webdriver.connection
    element = wd.find_element_by_link_text(self.locator)
    hov = ActionChains(wd).move_to_element(element)
    hov.perform()

How are MyAnimeList scores calculated?

All scores given in the database are calculated as a weighted score.

Weighted Score = (v / (v + m)) * S + (m / (v + m)) * C

S = Average score for the anime/manga
v = Number users giving a score for the anime/manga †
m = Minimum number of scored users required to get a calculated score
C = The mean score across the entire Anime/Manga database
† Note that v does not correspond to the "number of scored users" as seen on the database page. Scores from users who have not viewed 1/5 of the series upon its completion are not included. Scores given from illegitimate accounts created to sway votes are also not included in the scoring algorithm.

Not Yet Aired entries have no score and will display N/A. Entries that do not meet the minimum number of scored users will also not display a calculated score.

Top Anime/Manga Rankings

The "Top Upcoming" and "Most Popular" rankings are ordered by the number of users who have added the entry to their list. All other Top Anime and Top Manga rankings are ordered by weighted score, as calculated above. Please note that while R18+ entries calculate a weighted score, they are excluded from the rankings.

## 1a. Load top anime pages

In [3]:
#this function will load the top anime pages

def MAL_loadtopanime(page_num):
    '''
    Load the top anime from MAL based on page_num, 50 per page
    So X = (page_num - 1) * 50 + 1) to (page_num * 50)
    Returns a soup of the list of X top anime on page_num

    Complexity: O(requests.get or BeautifulSoup)
    '''
    limit = (page_num - 1) * 50
    url = "https://myanimelist.net/topanime.php?limit=" + str(limit)
    response = requests.get(url)
    if response.status_code != 200:
        print('Enountered', response.status_code, 'error while reading page', page_num , 'of MAL Top Anime')
    else:
        return BeautifulSoup(response.text, 'lxml').find_all(class_="detail")
        #return BeautifulSoup(response.text, 'lxml', parse_only=SoupStrainer(class_=["detail", "hoverinfo_trigger"]))

In [4]:
#test to retrieve the first entry in top anime
soup_mal_top50 = MAL_loadtopanime(1)

## 1b. Retrieve header and URL from top animes

In [5]:
#this function will read an entry within the top anime and retrieve the header and URL

def MAL_initEntry_top(soup):
    '''
    Takes the soup of an entry in the MAL top 50
    Returns a dictionay entry with Title and URL of the anime
    '''
    entry = {}
    entry['Title'] = soup.find(class_="hoverinfo_trigger").text
    #entry['URL'] = soup.find(class_="hoverinfo_trigger").get('href')
    #entry['URL'] = [a['href'] for a in soup.find_all('a', href = True) if a.text]
   
    #entry['URL'] = [a['href'] for a in soup.find(class_="hoverinfo_trigger") if a.text]
    
    #entry['URL'] = [a['href'] for a in soup.select('.hoverinfo_trigger a')]
    
    #for a in soup.select('.hoverinfo_trigger a'):
        #entry['URL'] = a['href']
    
    for a in soup.find(class_="hoverinfo_trigger"):
        entry['URL'] = a['href']
    
    return entry

#soup.find_all(href=True)

In [6]:
#testing to see if MAL_initEntry_top function works
for soup in soup_mal_top50:
    number1 = MAL_initEntry_top(soup)
    break

print(number1['Title'])
print(number1['URL'])

Fullmetal Alchemist: Brotherhood
https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood


In [7]:
#this function is used to obtain the entries

def MAL_retrieveEntry(entry):
    response = requests.get(entry['URL'])
    if response.status_code != 200:
        print('Encountered ' + str(response.status_code) + ' error reading ' + entry['Title'])
        return -1
    else:
        return BeautifulSoup(response.text, 'html5', parse_only=SoupStrainer(id='content'))

In [8]:
number1

{'Title': 'Fullmetal Alchemist: Brotherhood',
 'URL': 'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood'}

In [9]:
soup_mal_number1 = MAL_retrieveEntry(number1)




In [10]:
#check the elements and class of the sidebar
number1sidebar = soup_mal_number1

## 1c. Retrieve data from sidebar

In [11]:
#inspect the class of the sidebar data
number1sidebar.find_all(class_="dark_text")

[<span class="dark_text">English:</span>,
 <span class="dark_text">Synonyms:</span>,
 <span class="dark_text">Japanese:</span>,
 <span class="dark_text">Type:</span>,
 <span class="dark_text">Episodes:</span>,
 <span class="dark_text">Status:</span>,
 <span class="dark_text">Aired:</span>,
 <span class="dark_text">Premiered:</span>,
 <span class="dark_text">Broadcast:</span>,
 <span class="dark_text">Producers:</span>,
 <span class="dark_text">Licensors:</span>,
 <span class="dark_text">Studios:</span>,
 <span class="dark_text">Source:</span>,
 <span class="dark_text">Genres:</span>,
 <span class="dark_text">Duration:</span>,
 <span class="dark_text">Rating:</span>,
 <span class="dark_text">Score:</span>,
 <span class="dark_text">Ranked:</span>,
 <span class="dark_text">Popularity:</span>,
 <span class="dark_text">Members:</span>,
 <span class="dark_text">Favorites:</span>]

In [12]:
#this function will scrape the data for the sidebar
def MAL_retrieveSidebar(anime_dict, soup):
    '''
    Returns anime_dict with additional raw data for the sidebar of the anime entry 
    '''
    headers = soup.find_all(class_="dark_text")
    #headers = soup.find(class_="dark_text")
    
    for header in headers:
        column_name = header.text.strip()[:-1]
        # print(column_name) # error-checking
        entry = header.next_sibling.strip()
        # print(entry) # error-checking
        if(entry.strip() == "None found,"): # no entries
            entry = []
        elif(entry == ""):
            # special case for score
            if(column_name == 'Score'):
                entry = header.findNext().text
            else:
                entry_soup = header.findNext('a')
                # print(entry_soup.text, " ||| ", entry_soup.findNext('a'), " ||| ", entry_soup.findNext().name) # error-checking
                
                # create a list of items if more than one entry; signifed by the plural in column_name
                if(column_name[-1] != 's'):
                    entry = entry_soup.text
                else:
                    entry = [entry_soup.text]
                    while(entry_soup.findNext().name != 'div'):
                        entry_soup = entry_soup.findNext('a')
                        entry.append(entry_soup.text)
        anime_dict[column_name] = entry
        # print('---')
    return anime_dict

In [13]:
number1 = MAL_retrieveSidebar(number1, soup_mal_number1)

In [14]:
number1

{'Title': 'Fullmetal Alchemist: Brotherhood',
 'URL': 'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood',
 'English': 'Fullmetal Alchemist: Brotherhood',
 'Synonyms': 'Hagane no Renkinjutsushi: Fullmetal Alchemist, Fullmetal Alchemist (2009), FMA, FMAB',
 'Japanese': '鋼の錬金術師 FULLMETAL ALCHEMIST',
 'Type': 'TV',
 'Episodes': '64',
 'Status': 'Finished Airing',
 'Aired': 'Apr 5, 2009 to Jul 4, 2010',
 'Premiered': 'Spring 2009',
 'Broadcast': 'Sundays at 17:00 (JST)',
 'Producers': ['Aniplex',
  'Square Enix',
  'Mainichi Broadcasting System',
  'Studio Moriken'],
 'Licensors': ['Funimation', 'Aniplex of America'],
 'Studios': ['Bones'],
 'Source': 'Manga',
 'Genres': ['Action',
  'Military',
  'Adventure',
  'Comedy',
  'Drama',
  'Magic',
  'Fantasy',
  'Shounen'],
 'Duration': '24 min. per ep.',
 'Rating': 'R - 17+ (violence & profanity)',
 'Score': '9.21',
 'Ranked': '#1',
 'Popularity': '#3',
 'Members': '2,185,475',
 'Favorites': '178,517'}

In [15]:
#this function will scrape the duration of the anime episode
def MAL_ppDuration(duration_entry):
    '''
    Takes in the raw duration entry in the form (xx min.) (xx hr.) (per ep)
    Returns total number of minutes per episode
    '''
    duration_array = duration_entry.split()
    duration_norm = 0
    if 'min.' in duration_array:
        duration_norm += int(duration_array[duration_array.index('min.') - 1])
    if 'hr.' in duration_array:
        duration_norm += 60 * int(duration_array[duration_array.index('hr.') - 1])
    return duration_norm

In [16]:
#this function will scrape the time period data of the episode
def MAL_ppAired(anime_dict):
    '''
    Takes in the original anime sidebar dictionary
    Return the dictionary with 'Started' and 'Ended' columns added in Timestamp format
    '''
    def MAL_toDatetime(date_string):
        try:
            try:
                return pd.to_datetime(aired_array[0], format="%b %d, %Y")
            except:
                try:
                    return pd.to_datetime(aired_array[0], format="%b, %Y")
                except:
                    return pd.to_datetime(aired_array[0], format="%Y")
        except:
            return np.nan

    aired_array = anime_dict['Aired'].split(' to ')
    anime_dict['Started'] = MAL_toDatetime(aired_array[0])
    if len(aired_array) > 1:
        anime_dict['Ended'] = MAL_toDatetime(aired_array[1])
    return anime_dict

In [17]:
#this function will do some basic text preprocessing of the data from the sidebar
def remove_commas(input_str):
    return int(input_str.strip().replace(',',''))

def MAL_ppSidebar(sidebar_dict):
    '''
    Postprocessing of MAL sidebar
    Take in the unprocessed sidebar dictionary
    Return the processed sidebar dictionary
    '''

    if(sidebar_dict['Episodes'] == 'Unknown'):
        sidebar_dict['Episodes'] = np.nan
    else:
        sidebar_dict['Episodes'] = int(sidebar_dict['Episodes'])
    sidebar_dict['Duration'] = MAL_ppDuration(sidebar_dict['Duration'])
    sidebar_dict = MAL_ppAired(sidebar_dict)
    sidebar_dict['Members'] = remove_commas(sidebar_dict['Members'])
    sidebar_dict['Favorites'] = remove_commas(sidebar_dict['Favorites'])

    return sidebar_dict

In [18]:
number1 = MAL_ppSidebar(number1)


## 1d. Retrieve data from top bar

In [19]:
#this function will retrieve the data from the top bar, which are the scores and voters
def MAL_retrieveTopbar(anime_dict, soup):
    '''
    Retrieves the score and voters from the topbar of the anime entry
    Return the modified anime_dict
    '''

    topbar = soup.find(class_='anime-detail-header-stats')
    
    if(anime_dict['Score'] == 'N/A'):
        anime_dict['Score'] = np.nan 
        anime_dict['Voters'] = np.nan # no score means no one voted
    else:
        anime_dict['Score'] = float(topbar.find(class_='score-label').text)
        anime_dict['Voters'] = remove_commas(topbar.find(class_='score').get('data-user').split()[0])

    return anime_dict

In [20]:
#check whether the function to scrape top bar data works
number1 = MAL_retrieveTopbar(number1, soup_mal_number1)
print(number1['Score'])
print(number1['Voters'])

9.21
1325568


## 1e. Retrieve related data

In [21]:
#this function will scrape the data from the related table

def MAL_retrieveRelated(anime_dict, soup):
    '''
    Retrieves related anime from the anime entry
    Returns the modified anime_dict
    '''
    related = soup.find(class_='anime_detail_related_anime')
    if(related is None): # nothing to add, so return
        return anime_dict
    related_rows = related.find_all('tr')
    for item in related_rows:
        header = item.find('td').text.strip()[:-1]
        entry = item.find('td').find_next().text.strip()
        anime_dict[header] = [item.strip() for item in entry.split(',')]
    
    return anime_dict

In [22]:
#combine the related data - with adaptation, alternative version, side story and spin off

number1 = MAL_retrieveRelated(number1, soup_mal_number1)
number1

{'Title': 'Fullmetal Alchemist: Brotherhood',
 'URL': 'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood',
 'English': 'Fullmetal Alchemist: Brotherhood',
 'Synonyms': 'Hagane no Renkinjutsushi: Fullmetal Alchemist, Fullmetal Alchemist (2009), FMA, FMAB',
 'Japanese': '鋼の錬金術師 FULLMETAL ALCHEMIST',
 'Type': 'TV',
 'Episodes': 64,
 'Status': 'Finished Airing',
 'Aired': 'Apr 5, 2009 to Jul 4, 2010',
 'Premiered': 'Spring 2009',
 'Broadcast': 'Sundays at 17:00 (JST)',
 'Producers': ['Aniplex',
  'Square Enix',
  'Mainichi Broadcasting System',
  'Studio Moriken'],
 'Licensors': ['Funimation', 'Aniplex of America'],
 'Studios': ['Bones'],
 'Source': 'Manga',
 'Genres': ['Action',
  'Military',
  'Adventure',
  'Comedy',
  'Drama',
  'Magic',
  'Fantasy',
  'Shounen'],
 'Duration': 24,
 'Rating': 'R - 17+ (violence & profanity)',
 'Score': 9.21,
 'Ranked': '#1',
 'Popularity': '#3',
 'Members': 2185475,
 'Favorites': 178517,
 'Started': Timestamp('2009-04-05 00:00:00'),
 '

In [23]:
#this function will combine the different functions for scraping the data frm the different parts of the page
#it will include the sidebar, topbar and related data

def MAL_createdict_top(soup):
    '''
    Takes in the soup of top anime and returns a dictionary list 
    '''
    mal_top = []
    for anime_soup in soup:

        # proceses the entry in the top 50 page(s) and gets the related page
        mal_entry = MAL_initEntry_top(anime_soup) 
        soup_mal_entry = MAL_retrieveEntry(mal_entry)
        
        # processes the anime page
        mal_entry = MAL_retrieveSidebar(mal_entry, soup_mal_entry)
        mal_entry = MAL_ppSidebar(mal_entry)
        mal_entry = MAL_retrieveTopbar(mal_entry, soup_mal_entry)
        mal_entry = MAL_retrieveRelated(mal_entry, soup_mal_entry)

        mal_top.append(mal_entry)
    
    return mal_top

In [24]:
mal_top50 = MAL_createdict_top(soup_mal_top50)

In [25]:
mal_df = pd.DataFrame(mal_top50)


In [26]:
mal_df.head(20)

Unnamed: 0,Title,URL,English,Synonyms,Japanese,Type,Episodes,Status,Aired,Premiered,...,Alternative version,Side story,Spin-off,Prequel,Alternative setting,Sequel,Other,Summary,Character,Parent story
0,Fullmetal Alchemist: Brotherhood,https://myanimelist.net/anime/5114/Fullmetal_A...,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64,Finished Airing,"Apr 5, 2009 to Jul 4, 2010",Spring 2009,...,[Fullmetal Alchemist],"[Fullmetal Alchemist: Brotherhood Specials, Fu...",[Fullmetal Alchemist: Brotherhood - 4-Koma The...,,,,,,,
1,Shingeki no Kyojin: The Final Season,https://myanimelist.net/anime/40028/Shingeki_n...,Attack on Titan Final Season,"Shingeki no Kyojin Season 4, Attack on Titan S...",進撃の巨人 The Final Season,TV,16,Currently Airing,"Dec 7, 2020 to ?",Winter 2021,...,,,,[Shingeki no Kyojin Season 3 Part 2],,,,,,
2,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,Steins;Gate,,STEINS;GATE,TV,24,Finished Airing,"Apr 6, 2011 to Sep 14, 2011",Spring 2011,...,[Steins;Gate: Kyoukaimenjou no Missing Link - ...,,,,"[ChäoS;HEAd, Robotics;Notes, ChäoS;Child, Occu...",[Steins;Gate: Oukoubakko no Poriomania],[Steins;Gate: Soumei Eichi no Cognitive Comput...,,,
3,Gintama°,https://myanimelist.net/anime/28977/Gintama°,Gintama Season 4,Gintama' (2015),銀魂°,TV,51,Finished Airing,"Apr 8, 2015 to Mar 30, 2016",Spring 2015,...,,[Gintama°: Umai-mono wa Atomawashi ni Suru to ...,,[Gintama Movie 2: Kanketsu-hen - Yorozuya yo E...,,[Gintama.],,,,
4,Hunter x Hunter (2011),https://myanimelist.net/anime/11061/Hunter_x_H...,Hunter x Hunter,HxH (2011),HUNTER×HUNTER（ハンター×ハンター）,TV,148,Finished Airing,"Oct 2, 2011 to Sep 24, 2014",Fall 2011,...,"[Hunter x Hunter, Hunter x Hunter: Original Vi...","[Hunter x Hunter Movie 1: Phantom Rouge, Hunte...",,,,,,,,
5,Shingeki no Kyojin Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,Attack on Titan Season 3 Part 2,,進撃の巨人 Season3 Part.2,TV,10,Finished Airing,"Apr 29, 2019 to Jul 1, 2019",Spring 2019,...,,,,[Shingeki no Kyojin Season 3],,[Shingeki no Kyojin: The Final Season],,[Shingeki no Kyojin: Chronicle],,
6,Gintama',https://myanimelist.net/anime/9969/Gintama,Gintama Season 2,Gintama (2011),銀魂',TV,51,Finished Airing,"Apr 4, 2011 to Mar 26, 2012",Spring 2011,...,[Gintama: Yorinuki Gintama-san on Theater 2D],,,[Gintama],,[Gintama': Enchousen],,,[SKET Dance],
7,Ginga Eiyuu Densetsu,https://myanimelist.net/anime/820/Ginga_Eiyuu_...,Legend of the Galactic Heroes,"LoGH, LotGH, Gin'eiden, GinEiDen, Heldensagen ...",銀河英雄伝説,OVA,110,Finished Airing,"Jan 8, 1988 to Mar 17, 1997",,...,[Ginga Eiyuu Densetsu: Die Neue These - Kaikou...,"[Ginga Eiyuu Densetsu Gaiden, Ginga Eiyuu Dens...",,[Ginga Eiyuu Densetsu: Arata Naru Tatakai no O...,,,,,,
8,Gintama': Enchousen,https://myanimelist.net/anime/15417/Gintama__E...,Gintama: Enchousen,"Gintama' (2012), Gintama' Overdrive, Kintama",銀魂' 延長戦,TV,13,Finished Airing,"Oct 4, 2012 to Mar 28, 2013",Fall 2012,...,,[Gintama': Futon ni Haitte kara Buki Nokoshi n...,,[Gintama'],,[Gintama Movie 2: Kanketsu-hen - Yorozuya yo E...,,,,
9,3-gatsu no Lion 2nd Season,https://myanimelist.net/anime/35180/3-gatsu_no...,March Comes In Like A Lion 2nd Season,Sangatsu no Lion Second Season,3月のライオン 第2シリーズ,TV,22,Finished Airing,"Oct 14, 2017 to Mar 31, 2018",Fall 2017,...,,,,[3-gatsu no Lion],,,[Lion meets HachiClo],,,


In [31]:

# do some testing for the top 500 entry before starting on for loop

mal_top500_soup = MAL_loadtopanime(10)
mal_top500_dict = MAL_createdict_top(mal_top500_soup)
mal_top500_df = pd.DataFrame(mal_top500_dict)




## 2. Web scraping function

In [28]:
#the loop below will scrape each page of the top anime with delay of 2 mins

import time

counter = 1
start_reading = False

print("loading MAL list...", flush=True)

filename = './data/dataset' #input file directory for data to be stored here
filename_part = filename + '_pt' + str(counter)

if not os.path.exists(filename_part + '.csv'):
    start_reading = True
    print("Loading part", counter, flush=True)

    mal_soup = MAL_loadtopanime(counter)
    mal_dict = MAL_createdict_top(mal_soup)
    mal_df = pd.DataFrame(mal_dict)

    mal_df.to_csv(filename_part + '.csv')
    time.sleep(120)

end_of_animelist = False
while not end_of_animelist:
    counter += 1
    filename_part = filename + '_pt' + str(counter)
    if not os.path.exists(filename_part + '.csv'):
        if not start_reading:
            start_reading = True
            print("Resuming loading part", counter, flush=True)

        mal_soup = MAL_loadtopanime(counter)

        if mal_soup is not None:
            print("Loading part", counter, flush=True)

            mal_dict = MAL_createdict_top(mal_soup)
            mal_df = pd.DataFrame(mal_dict)
            mal_df.to_csv(filename_part + '.csv')
            time.sleep(120)

        else:
            end_of_animelist = True
            print("Done.")

mal_df_list = []
for i in range(1, counter):
    filename_part = filename + '_pt' + str(i)
    mal_df_list.append(pd.read_csv(filename_part + '.csv'))

mal_df = pd.concat(mal_df_list, ignore_index=True)
mal_df.to_csv(filename + '.csv')


loading MAL list...
Resuming loading part 351
Enountered 504 error while reading page 351 of MAL Top Anime
Done.


In [29]:
mal_df_list = []
for i in range(1, counter):
    filename_part = filename + '_pt' + str(i)
    mal_df_list.append(pd.read_csv(filename_part + '.csv'))

mal_df = pd.concat(mal_df_list, ignore_index=True)
mal_df.to_csv(filename + '_full.csv')