In [11]:
import requests
import re
from bs4 import BeautifulSoup
import time
import json

# Scrapes the MAL recent users page in order to capture usernames
def get_usernames():
    """Scrape the MAL recent-users page for links to user profiles.

    Returns:
        A list-like result set of BeautifulSoup ``<a>`` tags whose ``href``
        matches ``/profile/...``, or ``None`` if the page could not be
        fetched. Callers read the username from each tag's ``.text``.
    """
    url = "https://myanimelist.net/users.php"
    try:
        # A timeout keeps a stalled connection from hanging the scraper forever.
        page = requests.get(url, timeout=30)
        # Treat HTTP error statuses as a failed scrape rather than parsing an error page.
        page.raise_for_status()
    except requests.RequestException:
        return None

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed alongside BeautifulSoup.
    soup = BeautifulSoup(page.text, "html.parser")
    return soup.find_all("a", {'href': re.compile(r'\/profile\/.*')})

In [12]:
def clean(anime_list, flags):
    '''
    Clean the json information into a list containing tuples of the format
    (anime_id, watched_percentage, score).

    watched_percentage is calculated as num_watched_episodes / anime_num_episodes.
    Note: An unscored anime is calculated as a 0 by MAL.

    Args:
        anime_list: iterable of dict-like anime entries decoded from the
            user's anime list json.
        flags: collection of anime statuses to keep; entries whose 'status'
            is not in flags are skipped, which filters anime by the viewing
            status of the user.

    Returns:
        A list of (anime_id, watched_percentage, score) tuples, one for each
        entry that was kept, in the original order.
    '''
    cleaned_list = []
    for anime in anime_list:
        # Skips anime that don't have one of the given statuses
        if anime.get('status') not in flags:
            continue

        anime_id = anime.get('anime_id', None)

        # A missing or null watched count is treated as nothing watched
        num_watched = anime.get('num_watched_episodes', 0) or 0
        num_episodes = anime.get('anime_num_episodes', 1)
        # If an anime is on MAL but has not been released yet, num_episodes
        # will be 0; a null episode count is handled the same way so that one
        # odd entry cannot raise and discard the whole list.
        if not num_episodes:
            watched_percentage = 0
        else:
            watched_percentage = num_watched / num_episodes

        score = anime.get('score', 0)

        cleaned_list.append((anime_id, watched_percentage, score))

    return cleaned_list

In [14]:
# Scrapes each user's anime list
def get_anime_lists(users, flags, delay=15):
    """Download and clean the anime list of every user in ``users``.

    Args:
        users: iterable of objects whose ``.text`` is a MAL username
            (e.g. the profile link tags returned by ``get_usernames``).
        flags: collection of anime statuses to keep; passed on to ``clean``.
        delay: seconds to sleep after each user to comply with the rate
            limiting. Defaults to 15.

    Returns:
        One flat list with the cleaned entries of all users whose list could
        be fetched and parsed. Users that fail are skipped (best effort).
    """
    merged_anime_list = []
    for user in users:
        try:
            user_url = "https://myanimelist.net/animelist/" + user.text + "/load.json?status=7&offset=0"
            user_page = requests.get(user_url, timeout=30)

            user_json = json.loads(user_page.text)
            # A successful response is expected to be a json array of entries.
            # Anything else is presumably an error payload (e.g. a private
            # list), so skip the user explicitly.
            if not isinstance(user_json, list):
                continue

            # Cleans the json information into a list
            merged_anime_list.extend(clean(user_json, flags))

        except Exception:
            # Best effort: a failed request, invalid json or a malformed
            # entry skips this user. Unlike a bare except, this lets
            # KeyboardInterrupt and SystemExit stop the scrape.
            continue

        finally:
            # Sleep to comply with the rate limiting, including after failures
            time.sleep(delay)

    return merged_anime_list

In [15]:
def scrape(flags=(1, 2, 3, 4, 6)):
    """Scrape the recent users and gather their cleaned anime lists.

    Args:
        flags: anime statuses to keep. The default keeps every MAL status in
            use: 1 (watching), 2 (completed), 3 (on hold), 4 (dropped) and
            6 (plan to watch).

    Returns:
        The merged list of (anime_id, watched_percentage, score) tuples, or
        an empty list if the usernames could not be scraped.
    """
    users = get_usernames()
    if users is None:
        return []

    # get_anime_lists requires the status filter to be passed explicitly
    merged_anime_list = get_anime_lists(users, flags)

    #TODO: write to csv
    return merged_anime_list
    
    
        

In [16]:
# Watching - status = 1
# Completed - status = 2
# On Hold - status = 3
# Dropped - status = 4
# status = 5 appears to be unused by MAL. TODO: decide whether to include it in the default flags anyway.
# Plan to watch - status = 6

#TODO: write wrapper function that executes scrape every 5 minutes?
#https://stackoverflow.com/questions/474528/what-is-the-best-way-to-repeatedly-execute-a-function-every-x-seconds-in-python
