In [9]:
from bs4 import BeautifulSoup
import csv
import json
import os
import requests
import re
import sched
import sys
import time

# Output CSV, resolved against the directory the notebook is running in
file = os.path.join(os.path.realpath('.'), "data.csv")

# Number of finished scheduler runs; run_task increments it after every run
counter = 0

def get_usernames():
    """Scrape the MAL recent-users page and return the profile links on it.

    Returns:
        The BeautifulSoup result set of ``<a>`` tags whose ``href`` matches
        ``/profile/...`` (each tag's ``.text`` is used downstream as the
        username), or ``None`` if the page could not be fetched.
    """
    url = "https://myanimelist.net/users.php"
    try:
        # A timeout keeps a stalled connection from hanging the scheduler
        page = requests.get(url, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them
        page.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled here, so Ctrl-C and
        # programming errors are no longer silently swallowed
        print("Could not fetch {}: {}".format(url, exc))
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(page.text, "html.parser")
    return soup.find_all("a", {'href': re.compile(r'\/profile\/.*')})

In [2]:
def clean(anime_list):
    '''
    Clean the json information into a list containing tuples of the format
    (anime_id, status, watched_percentage, score)

    Status:
    • Watching - status = 1
    • Completed - status = 2
    • On Hold - status = 3
    • Dropped - status = 4
    • status = 5 seems to be unused
    • Plan to watch - status = 6

    watched_percentage is calculated as num_watched_episodes / anime_num_episodes
    (a fraction, not multiplied by 100). It is 0 when the episode count is
    0 or null.
    Note: An unscored anime is calculated as a 0 by MAL

    Args:
        anime_list: iterable of dicts, one per anime entry, as decoded from
            the user's anime list json.

    Returns:
        A list with one (anime_id, status, watched_percentage, score) tuple
        per entry, in input order. Missing keys fall back to
        anime_id=None, status=0, score=0.
    '''
    cleaned_list = []
    for anime in anime_list:
        anime_id = anime.get('anime_id', None)
        anime_status = anime.get('status', 0)
        # `or 0` also covers a key that is present but null in the json
        num_watched = anime.get('num_watched_episodes') or 0
        num_episodes = anime.get('anime_num_episodes', 1)

        # If an anime is on MAL but has not been released yet, num_episodes
        # will be 0; a null count is treated the same way instead of raising
        if not num_episodes:
            watched_percentage = 0
        else:
            watched_percentage = num_watched / num_episodes
        score = anime.get('score', 0)
        cleaned_list.append((anime_id, anime_status, watched_percentage, score))
    return cleaned_list

In [3]:
def get_anime_lists(users, delay=15):
    """Scrape and clean the anime list of every user in ``users``.

    Args:
        users: iterable of objects whose ``.text`` attribute is a MAL
            username (e.g. the anchor tags returned by ``get_usernames``).
        delay: seconds to sleep after each request to comply with the rate
            limiting. Defaults to the previously hard-coded 15.

    Returns:
        One flat list of (anime_id, status, watched_percentage, score)
        tuples covering every user whose list could be read. Users whose
        list cannot be fetched or parsed are reported and skipped.
    """
    merged_anime_list = []
    for user in users:
        try:
            user_url = "https://myanimelist.net/animelist/" + user.text + "/load.json?status=7&offset=0"
            # A timeout keeps one stalled connection from blocking the run
            user_page = requests.get(user_url, timeout=30)
            user_json = json.loads(user_page.text)

            # A readable list decodes to a json array; anything else (such
            # as an error object for a private list) is rejected explicitly
            if not isinstance(user_json, list):
                raise ValueError("unexpected response (HTTP {})".format(user_page.status_code))

            # clean() finishes before extend(), so a list that fails midway
            # contributes no partial rows
            merged_anime_list.extend(clean(user_json))
        except Exception as exc:
            # Best effort: one unreadable user must not abort the batch.
            # Catching Exception rather than using a bare except lets
            # Ctrl-C / kernel interrupt stop the scraper.
            print("Skipping {}: {}".format(getattr(user, 'text', user), exc))

        # Sleep to comply with the rate limiting. Placed after the try block
        # so the pause also happens when the request itself failed.
        time.sleep(delay)
    return merged_anime_list

In [4]:
def scrape():
    """Fetch the recent users' cleaned anime lists and append them to the csv at ``file``.

    Does nothing when ``get_usernames`` returns ``None``.
    """
    users = get_usernames()
    if users is None:
        return

    rows = get_anime_lists(users)
    # Append mode keeps the rows written by earlier runs
    with open(file, 'a', newline='') as fp:
        csv.writer(fp, delimiter=',').writerows(rows)

In [5]:
# Total number of runs before the task stops rescheduling itself:
# one week's worth of five-minute intervals (7 * 24 * 12)
MAX_RUNS = 2016
# Seconds between the end of one run and the start of the next
RUN_INTERVAL_SECONDS = 300


def run_task():
    """Run one scrape, then reschedule this task until MAX_RUNS is reached.

    Any ``Exception`` raised by ``scrape`` is printed rather than propagated,
    so one failed run does not stop the schedule. The global ``counter`` is
    incremented after every run, successful or not, and the next run is
    queued on the global scheduler ``s``.
    """
    global counter
    try:
        scrape()
        print("done")
    except Exception as e:
        print(e)
    finally:
        counter += 1
        # No `return` inside `finally`: it would silently discard an
        # in-flight exception such as KeyboardInterrupt
        if counter < MAX_RUNS:
            s.enter(RUN_INTERVAL_SECONDS, 1, run_task)

In [None]:
# Start a fresh csv holding only the header row ('w' truncates an existing file)
with open(file, 'w', newline='') as fp:
    csv.writer(fp, delimiter=',').writerow(
        ["anime_id", "status", "watched_percentage", "score"]
    )

# Queue the first run_task call 60 seconds from now, then block in s.run()
# until no scheduled events remain
s = sched.scheduler(time.time, time.sleep)
s.enter(60, 1, run_task)
s.run()