In [1]:
from bs4 import BeautifulSoup
import csv
import datetime
#import json
import logging
import os
import requests
import re
import sched
import sys
import time
import timeit

# Module-level state shared by the cells below.
# Number of scheduled scrape runs started so far (incremented by run_task)
counter = 0
# Usernames whose anime list has already been requested
usernames_set = set()
# Output csv: data.csv inside the directory that '.' resolves to,
# i.e. the current working directory of the kernel/script
file = os.path.join(os.path.realpath('.'), "data.csv")

In [2]:
def get_usernames():
    """Scrape the MAL recent users page in order to capture usernames.

    Returns:
        The list of ``<a>`` tags whose ``href`` matches ``/profile/...``
        (callers read the username from each tag's ``.text``), or ``None``
        when the page could not be downloaded or parsed.
    """
    logging.debug("Getting users list")
    url = "https://myanimelist.net/users.php"
    try:
        # Without a timeout a stalled connection would block the scheduler forever
        page = requests.get(url, timeout=30)
        # Fail loudly on HTTP errors instead of parsing an error page
        page.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed (and bs4 does not emit a warning)
        soup = BeautifulSoup(page.text, "html.parser")
        users = soup.find_all("a", {'href': re.compile(r'\/profile\/.*')})
    except Exception as e:
        logging.error("Error getting users list:{}".format(str(e)))
        return None
    logging.debug("Finished getting users list")
    return users

In [3]:
def clean(anime_list, user):
    '''
    Function to clean the json information into a list containing tuples of the format
    (user, anime_id, status, watched_percentage, score)

    Status:
    • Watching - status = 1
    • Completed - status = 2
    • On Hold - status = 3
    • Dropped - status = 4
    • status = 5 seems to be unused
    • Plan to watch - status = 6

    watched_percentage is calculated as num_watched_episodes / anime_num_episodes
    Note: An unscored anime is calculated as a 0 by MAL

    Args:
        anime_list: iterable of dict-like entries (the decoded json of one user's list)
        user: object whose ``.text`` attribute is the username

    Returns:
        A list with one (user, anime_id, status, watched_percentage, score)
        tuple per entry of anime_list, in the same order.
    '''
    username = user.text
    logging.debug("Cleaning anime list for {}".format(username))
    cleaned_list = []
    for anime in anime_list:
        anime_id = anime.get('anime_id', None)
        anime_status = anime.get('status', 0)
        score = anime.get('score', 0)
        # A key that is present but null would otherwise break the division below
        num_watched = anime.get('num_watched_episodes', 0) or 0
        num_episodes = anime.get('anime_num_episodes', 1)

        # If an anime is on MAL but has not been released yet, num_episodes will be 0
        # (a null episode count is treated the same way)
        if not num_episodes:
            watched_percentage = 0
        else:
            watched_percentage = num_watched / num_episodes
        cleaned_list.append((username, anime_id, anime_status, watched_percentage, score))
    logging.info("Finished cleaning anime list for {}".format(username))
    return cleaned_list

In [4]:
def get_anime_lists(users):
    """Scrape each user's anime list and merge the cleaned rows.

    Args:
        users: iterable of objects whose ``.text`` attribute is a username
            (the tags returned by get_usernames()).

    Returns:
        A single list holding the tuples produced by clean() for every user
        whose list could be downloaded. Users already present in
        usernames_set, and users whose request or parsing fails, contribute
        no rows.
    """
    # usernames_set is only mutated (never rebound), so no `global` is needed
    logging.debug("Getting anime lists")
    merged_anime_list = []
    for user in users:
        username = user.text

        # A link without text would produce a malformed list URL
        if not username.strip():
            logging.debug("Skipping profile link without a username")
            continue

        # Checks to see if the user's list has already been scraped
        if username in usernames_set:
            logging.debug("Repeated user:{}".format(username))
            continue
        usernames_set.add(username)

        # If a user's anime list is private (or the request fails for any other
        # reason), an exception is raised below and the user is skipped
        try:
            # NOTE(review): only offset=0 is requested; if the endpoint pages its
            # results, entries beyond the first page are not collected - confirm
            user_url = "https://myanimelist.net/animelist/" + username + "/load.json?status=7&offset=0"
            try:
                # Without a timeout a stalled connection would block the whole run
                user_page = requests.get(user_url, timeout=30)
            finally:
                # Sleep to comply with the rate limiting, even when the request failed
                time.sleep(5)
            user_page.raise_for_status()

            # Cleans the json information into a list
            user_json = user_page.json()
            anime_list = clean(user_json, user)
            merged_anime_list.extend(anime_list)
        except Exception as e:
            logging.info("Failed getting anime list for {}:{}".format(username, str(e)))
            continue
    logging.debug("Finished getting anime lists")
    return merged_anime_list

In [5]:
def scrape():
    """Run one scrape pass: fetch the user list, download the anime lists
    and append the cleaned rows to the csv at `file`.

    Nothing is written when get_usernames() returns None. The wall-clock
    duration of the pass is logged at INFO level either way.
    """
    logging.debug("Starting to scrape")
    began = timeit.default_timer()
    profile_links = get_usernames()
    if profile_links is not None:
        logging.debug("Starting to append to csv")
        rows = get_anime_lists(profile_links)
        # newline='' lets the csv module control the line endings itself
        with open(file, 'a', newline='') as csv_file:
            csv.writer(csv_file, delimiter=',').writerows(rows)
        logging.debug("Finished appending to csv")
    elapsed = timeit.default_timer() - began
    logging.info("Runtime of scrape() was {} seconds".format(elapsed))
    logging.debug("Finished scraping")

In [6]:
# Schedule parameters: 1512 runs with a 400 second delay after each run
# (1512 * 400 s = 604800 s, i.e. 7 days of delays in total)
MAX_RUNS = 1512
RUN_DELAY_SECONDS = 400


def run_task():
    '''
    Helper function for sched
    Currently set to run 1512 times, with the next run scheduled 400 seconds
    after the previous run has finished
    See Data Collection.pdf for more information

    Each call logs the run counter, performs one scrape() (any exception is
    logged and swallowed so that the schedule keeps going), increments the
    counter and, until MAX_RUNS is reached, re-enters itself on the
    scheduler `s`.
    '''
    global counter
    logging.info("Counter = {}".format(counter))
    try:
        scrape()
    except Exception as e:
        logging.exception(str(e))
    counter += 1
    if counter >= MAX_RUNS:
        return
    # The delay is counted from now, so the effective period is the delay plus
    # the time the scrape itself took
    s.enter(RUN_DELAY_SECONDS, 1, run_task)

In [None]:
# Logging: one log file per start of this cell, named after the UTC start time
# and placed in the current working directory.
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now" formats identically.
timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%m_%d_%Y-%H_%M_%S")
log_filepath = os.path.join(os.path.realpath('.'), "{}.log".format(timestamp))
# level=INFO means the logging.debug() calls in the functions above are not recorded
logging.basicConfig(
    filename = log_filepath,
    filemode = 'w',
    format = "%(asctime)s-%(levelname)s-%(message)s",
    level = logging.INFO)


# (Re)create the output csv with only the header row; scrape() appends the data rows.
# newline='' lets the csv module control the line endings itself
with open(file, 'w', newline='') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(["user", "anime_id", "status", "watched_percentage", "score"])

# First run starts after 60 seconds; run_task re-schedules itself afterwards.
# s.run() blocks until the scheduler queue is empty.
s = sched.scheduler(time.time, time.sleep)
s.enter(60, 1, run_task)
s.run()