# MAL Loop

## 1: Import, define functions

In [1]:
import time
import json
from bs4 import BeautifulSoup, Tag, NavigableString
import requests

In [2]:
def tagstringextract(x):
    """Return the whitespace-trimmed text held by a BeautifulSoup node.

    A NavigableString is trimmed directly, a Tag contributes the result of
    its ``get_text()``, and a list is processed element by element with the
    trimmed pieces concatenated without a separator. Any other input raises
    ``Exception``.
    """
    if isinstance(x, list):
        pieces = map(tagstringextract, x)
        return ''.join(pieces)

    if isinstance(x, NavigableString):
        text = x
    elif isinstance(x, Tag):
        text = x.get_text()
    else:
        raise Exception('Detected neither Tag nor NavigableString')

    # strip() removes leading and trailing whitespace in one pass
    return text.strip()

`malkey` extracts the primary key from the url. As an example, `malkey('https://myanimelist.net/anime/1/Cowboy_Bebop')` returns `[1]`. Special consideration is given to the different Tag types that MyAnimeList uses.

In [3]:
def malkey(x):
    """Return the primary key[s] found in url string[s] or link Tag[s].

    Parameters
    ----------
    x : str, list, or Tag
        * str  -- a url; every path segment made only of decimal digits is
          converted to ``int``.
        * list -- a list of any accepted inputs.
        * Tag  -- either a Tag of length 1 (the ``href`` is read from its
          single child when that child is a Tag, otherwise from the Tag
          itself) or a Tag with several children, each non-empty child Tag
          contributing its first key.

    Returns
    -------
    list
        For a str, a list of ints. For a list, ``[]`` when it is empty, the
        result for the single element when it has one element, and a list
        of per-element results otherwise. For a Tag, a list of ints.

    Raises
    ------
    TypeError
        If ``x`` (or the single child of a Tag) is of an unsupported type.
        ``TypeError`` is a subclass of ``Exception``, so existing
        ``except Exception`` handlers keep working.
    """
    if isinstance(x, str):
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits and makes int() fail
        return [int(s) for s in x.split('/') if s.isdecimal()]

    if isinstance(x, list):
        if not x:
            # nothing to extract; previously this fell through to x[0]
            return []
        if len(x) > 1:
            return [malkey(y) for y in x]
        return malkey(x[0])

    if isinstance(x, Tag):
        if len(x) == 1:
            child = x.contents[0]
            if isinstance(child, Tag):
                # wrapper around a single link: the link carries the href
                return malkey(child['href'])
            if isinstance(child, NavigableString):
                # x is itself the link and its only child is the link text
                return malkey(x['href'])
            raise TypeError('Detected neither Tag nor NavigableString')
        # several children: keep the first key of every non-empty child Tag
        return [malkey(y)[0] for y in x if isinstance(y, Tag) and len(y) > 0]

    raise TypeError('Detected neither str, list, nor Tag')

`chunks` is used to extract the Related Anime table information in pairs.

In [7]:
def chunks(l, n):
    """Generate consecutive slices of ``l``, each at most ``n`` items long.

    ``l`` must support ``len()`` and slicing; the final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

## 2: Load url list

In [8]:
# read the list of anime page urls, stored on disk as JSON
with open('urllist.txt', encoding='utf8') as urlfile:
    urllist = json.load(urlfile)

For testing, we will use a small portion of the total MAL sitemap.

In [9]:
# for testing, keep only the first ten urls
urllist = urllist[:10]
urllist

['https://myanimelist.net/anime/1/Cowboy_Bebop',
 'https://myanimelist.net/anime/5/Cowboy_Bebop__Tengoku_no_Tobira',
 'https://myanimelist.net/anime/6/Trigun',
 'https://myanimelist.net/anime/7/Witch_Hunter_Robin',
 'https://myanimelist.net/anime/8/Bouken_Ou_Beet',
 'https://myanimelist.net/anime/15/Eyeshield_21',
 'https://myanimelist.net/anime/16/Hachimitsu_to_Clover',
 'https://myanimelist.net/anime/17/Hungry_Heart__Wild_Striker',
 'https://myanimelist.net/anime/18/Initial_D_Fourth_Stage',
 'https://myanimelist.net/anime/19/Monster']

### Prepare empty lists for loop

Global lists, variables to use/append inside the loop.

In [10]:
# accumulators that the scraping loop appends to / increments
mallist, failedlist, sequelkeys = [], [], []
failedcount = 0

# loop settings
maxindex = len(urllist)   # number of urls, used in the progress report
minlooptime = 0.10        # minimum number of seconds each iteration must take

The minimum loop time is intentionally set too low in order to test the error handling. To avoid HTTP 429 (Too Many Requests) responses, requests should come at least 2 seconds apart.

## 3: Loop

In [11]:
def _throttle(started, min_seconds):
    """Sleep until at least ``min_seconds`` have passed since ``started``.

    The remaining time is computed once and only slept when it is positive,
    so ``time.sleep`` is never handed a negative value (which raises
    ``ValueError``) when the clock advances between the check and the call.
    """
    remaining = min_seconds - (time.time() - started)
    if remaining > 0:
        time.sleep(remaining)


# Fetch every url in ``urllist``. A successful page adds one row to
# ``mallist`` and its related-anime entries to ``sequelkeys``; a failed
# request adds the url to ``failedlist`` and increments ``failedcount``.
for idx, url in enumerate(urllist):
    # start timer
    timestart = time.time()

    try:
        page = requests.get(url, timeout=5)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        # request errors and HTTP error statuses (e.g. 429): record and move on
        failedcount += 1
        print(e)
        print('Current Failed Count: ' + str(failedcount))
        failedlist.append(url)

        _throttle(timestart, minlooptime)
        continue

    # get primary key, start new row
    animekey = malkey(url)[0]
    newrow = {'key': animekey}

    # create soup
    anime = BeautifulSoup(page.content, 'html.parser')

    # get title; convert the NavigableString to a plain str so the stored row
    # does not keep a reference to the whole parse tree of this page
    titlestring = anime.select('.h1 span')[0].string
    title = {'Title': titlestring if titlestring is None else str(titlestring)}
    newrow.update(title)

    # get sidebar information: keep the divs that, once bare newline children
    # are dropped, start with a <span> label followed by at least one value
    sidebar = [a for a in
               [[x for x in entry if x != '\n'] for entry in anime.select('.js-scrollfix-bottom div')]
               if len(a) > 1 and a[0].name == 'span']
    sidebardict = {entry[0].get_text(): tagstringextract(entry[1:]) for entry in sidebar}
    newrow.update(sidebardict)

    # add to global list
    mallist.append(newrow)

    # related: cells are taken in (label, links) pairs; each linked key
    # becomes one [animekey, label, relatedkey] entry
    relatedanime = anime.select('.anime_detail_related_anime .borderClass')
    relatedlist = [[animekey, alpha[0].text, y] for alpha in chunks(relatedanime, 2) for y in malkey(alpha[1])]

    sequelkeys += relatedlist

    # pause if too fast
    _throttle(timestart, minlooptime)
    print('Processed anime [key = %s], %d/%s, %.2f%%: in %.4f seconds.'
          % (animekey, idx + 1, maxindex, 100 * (idx + 1) / maxindex, time.time() - timestart))


Processed anime [key = 1], 1/10, 10.00%: in 1.1201 seconds.
Processed anime [key = 5], 2/10, 20.00%: in 1.0241 seconds.
429 Client Error: Too Many Requests for url: https://myanimelist.net/anime/6/Trigun
Current Failed Count: 1
Processed anime [key = 7], 4/10, 40.00%: in 0.9321 seconds.
429 Client Error: Too Many Requests for url: https://myanimelist.net/anime/8/Bouken_Ou_Beet
Current Failed Count: 2
Processed anime [key = 15], 6/10, 60.00%: in 0.9941 seconds.
Processed anime [key = 16], 7/10, 70.00%: in 1.0431 seconds.
Processed anime [key = 17], 8/10, 80.00%: in 0.9021 seconds.
Processed anime [key = 18], 9/10, 90.00%: in 2.2582 seconds.
Processed anime [key = 19], 10/10, 100.00%: in 1.1011 seconds.


## 4: Check output

In [14]:
# preview the first two scraped rows
mallist[:2]

[{'key': 1,
  'Title': 'Cowboy Bebop',
  'English:': 'Cowboy Bebop',
  'Japanese:': 'カウボーイビバップ',
  'Type:': 'TV',
  'Episodes:': '26',
  'Status:': 'Finished Airing',
  'Aired:': 'Apr 3, 1998 to Apr 24, 1999',
  'Premiered:': 'Spring 1998',
  'Broadcast:': 'Saturdays at 01:00 (JST)',
  'Producers:': 'Bandai Visual',
  'Licensors:': 'Funimation,Bandai Entertainment',
  'Studios:': 'Sunrise',
  'Source:': 'Original',
  'Genres:': 'Action,Adventure,Comedy,Drama,Sci-Fi,Space',
  'Duration:': '24 min. per ep.',
  'Rating:': 'R - 17+ (violence & profanity)',
  'Score:': "8.821(scored by436,438users)1\n        indicates a weighted score. Please note that 'Not yet aired' titles are excluded.",
  'Ranked:': "#2922\n    based on the top anime page. Please note that 'Not yet aired' and 'R18+' titles are excluded.",
  'Popularity:': '#38',
  'Members:': '868,482',
  'Favorites:': '46,832'},
 {'key': 5,
  'Title': 'Cowboy Bebop: Tengoku no Tobira',
  'English:': 'Cowboy Bebop: The Movie',
  'Synony

In [16]:
# preview the first ten [animekey, relation, relatedkey] entries
sequelkeys[:10]

[[1, 'Adaptation:', 173],
 [1, 'Adaptation:', 174],
 [1, 'Side story:', 5],
 [1, 'Side story:', 17205],
 [1, 'Summary:', 4037],
 [5, 'Parent story:', 1],
 [15, 'Adaptation:', 43],
 [15, 'Side story:', 1317],
 [15, 'Side story:', 6418],
 [16, 'Adaptation:', 1009]]

In [17]:
# urls whose request failed during the loop
failedlist

['https://myanimelist.net/anime/6/Trigun',
 'https://myanimelist.net/anime/8/Bouken_Ou_Beet']

## 5: Save files

Save the raw data, the list of failed attempts, and the related-anime keys to file. The cell below is commented out; uncomment it to write the files.

In [51]:
# with open('animedataraw.txt', 'w', encoding='utf8') as filehandle:  
#     json.dump(mallist, filehandle)
# with open('failedlist.txt', 'w', encoding='utf8') as filehandle:  
#     json.dump(failedlist, filehandle)
# with open('sequelkeys.txt', 'w', encoding='utf8') as filehandle:  
#     json.dump(sequelkeys, filehandle)