# Anime Analysis by Ben Osborn and OsbornAI

## This project consists of scraping and creating a dataset containing information about every anime listed on MyAnimeList. This data is analysed, and a model is created to predict each anime's rating based on its page's features

### Imports

In [92]:
import requests
from bs4 import BeautifulSoup
import time
import csv

### Scraping and dataset creation

#### Parses through the labels from the soup elements

In [5]:
# Change name to clean label
def parseLabel(element):
    """Return the value part of a "Label: value" element as a clean string.

    The element's text is cut at every ':'; the piece before the first colon
    (the label) is discarded. Each remaining piece has its newlines removed and
    its surrounding whitespace stripped, and the pieces are joined with single
    spaces. Text without any ':' therefore yields an empty string, and a colon
    inside the value (e.g. a clock time) comes back as a space.
    """
    _label, *value_parts = element.text.split(':')
    cleaned_parts = [part.replace('\n', '').strip() for part in value_parts]
    return " ".join(cleaned_parts)

#### Parses through the list soup elements

In [85]:
def parseList(element):
    """Return the text of every 'a' tag found in element, joined by ", ".

    An element without any 'a' tags yields an empty string.
    """
    # TODO: check that 'a' tags exist for every single page and developer.
    link_texts = (anchor.text for anchor in element.find_all('a'))
    return ", ".join(link_texts)

#### Scrapes the page from the show and returns a row of data

In [102]:
def _parseAired(aired_raw):
    """Convert the cleaned "Aired" text into (aired_start, aired_end) strings.

    Each date is expected in the form 'Apr 5 2009' once commas are removed and
    is re-formatted as 'dd-mm-YYYY'. When only one date is listed, aired_end is
    returned as an empty string instead of raising IndexError.

    Raises:
        ValueError: if a date does not match the '%b %d %Y' format.
    """
    dates = [
        time.strftime('%d-%m-%Y', time.strptime(date.strip().replace(',', ''), '%b %d %Y'))
        for date in aired_raw.split("to")
    ]
    aired_start = dates[0]
    aired_end = dates[1] if len(dates) > 1 else ''
    return aired_start, aired_end


def _parseBroadcast(broadcast_raw):
    """Convert cleaned broadcast text such as 'Sundays at 17 00 (JST)' to 'Sunday 17:00'.

    Raises:
        ValueError: if the text has no ' at ' separator or the day/time parts
            do not match the '%A %H %M' format.
    """
    # Split on the space-padded separator only: a bare 'at' also occurs inside
    # day names such as "Saturdays", which used to break the parse.
    day_part, separator, time_part = broadcast_raw.partition(' at ')
    if not separator:
        raise ValueError(f"Unrecognised broadcast text: {broadcast_raw!r}")

    day = day_part.strip()[:-1]      # drop the plural 's' ("Sundays" -> "Sunday")
    clock = time_part.strip()[:5]    # keep "HH MM", dropping the trailing "(JST)" part
    return time.strftime('%A %H:%M', time.strptime(f"{day} {clock}", '%A %H %M'))


def createRow(url, field_names):
    """Scrape a single show page and return its data as a dict.

    Args:
        url: address of the show page to download.
        field_names: keys for the returned dict, in the same order as the
            values collected below (18 entries).

    Returns:
        A dict mapping each entry of field_names to the scraped string value.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an error status.
        ValueError: if the aired or broadcast text is not in the expected form.
        AssertionError: if field_names does not have one entry per value.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape, and
    # the status check stops an error page from being parsed as a show page.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')

    # Fall back to an empty description when the tag is not on the page.
    description_tag = soup.find('p', itemprop='description')
    description = description_tag.text.replace('\n', '') if description_tag is not None else ''

    side_panel = soup.find('td', class_='borderClass')
    side_panel_subdiv = side_panel.find('div')
    side_panel_divs = side_panel_subdiv.find_all('div')

    # NOTE(review): every field below is picked by its position in the side
    # panel. A recorded run of this notebook failed with broadcast text
    # ('Wednesdays at 02 05 (JST)') reaching the aired-date parser, which
    # suggests these positions shift between pages - consider looking the
    # divs up by their label text instead.
    name = parseLabel(side_panel_divs[7])
    show_type = parseLabel(side_panel_divs[10])
    episodes = parseLabel(side_panel_divs[11])
    status = parseLabel(side_panel_divs[12])

    aired_start, aired_end = _parseAired(parseLabel(side_panel_divs[13]))

    # parseLabel replaces ':' with a space, so "17:00" arrives here as "17 00".
    broadcast_time = _parseBroadcast(parseLabel(side_panel_divs[15]))

    producers = parseList(side_panel_divs[16])
    licensors = parseList(side_panel_divs[17])
    studios = parseList(side_panel_divs[18])
    source = parseLabel(side_panel_divs[19])
    genres = parseList(side_panel_divs[20])

    # Only the leading token of these two labels is kept.
    episode_length = parseLabel(side_panel_divs[21]).split(' ')[0]
    rating = parseLabel(side_panel_divs[22]).split(' ')[0]

    # The first span is skipped; the remaining span texts are joined.
    score_spans = side_panel_divs[23].find_all('span')
    score_and_scorers = ", ".join([part.text for part in score_spans][1:])

    # Strip thousands separators from the counters.
    members = parseLabel(side_panel_divs[28]).replace(',', '')
    favourites = parseLabel(side_panel_divs[29]).replace(',', '')

    ret_list = (
        name, show_type, episodes, status, aired_start, aired_end, broadcast_time,
        producers, licensors, studios, source, genres, episode_length, rating,
        score_and_scorers, members, favourites, description,
    )

    assert(len(ret_list) == len(field_names)), f"Length of fieldname's should match be of size {len(ret_list)}"

    return dict(zip(field_names, ret_list))
    
# createRow("https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood", ['name', 'type', 'episodes', 'status', 'aired_start', 'aired_end', 
#                       'broadcast_time', 'producers', 'licensors', 'studios', 'source',
#                       'genres', 'episode_length', 'rating', 'score_and_scorers', 'members',
#                       'favourites', 'description'])

#### Go through the specified number of ranking pages, scrape the information for each show, and store the rows in a CSV file

In [106]:
# Multithread this to make it faster

def genDataset(pages_to_scrape, csv_filename, start_page=0):
    """Scrape the top-anime ranking pages and store one CSV row per show.

    Args:
        pages_to_scrape: exclusive upper bound of the ranking page indices.
        csv_filename: path of the CSV file to write.
        start_page: first ranking page index to scrape. A value above 0 is
            treated as resuming an earlier run: rows are appended to the
            existing file instead of overwriting it.

    Raises:
        requests.RequestException: if a ranking page cannot be downloaded.
        Any exception raised by createRow for an individual show page.
    """
    import os

    fieldnames = ['name', 'type', 'episodes', 'status', 'aired_start', 'aired_end',
                  'broadcast_time', 'producers', 'licensors', 'studios', 'source',
                  'genres', 'episode_length', 'rating', 'score_and_scorers', 'members',
                  'favourites', 'description']

    # When resuming, keep the rows already scraped; only a fresh or empty file
    # gets a header line.
    resuming = start_page > 0
    needs_header = (
        not resuming
        or not os.path.exists(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    # Explicit UTF-8 so non-ASCII text in the scraped fields is written the
    # same way regardless of the platform's default encoding.
    with open(csv_filename, 'a' if resuming else 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        for i in range(start_page, pages_to_scrape):
            url_page = f"https://myanimelist.net/topanime.php?limit={i*50}"
            req_list = requests.get(url_page, timeout=30)
            req_list.raise_for_status()
            soup_list = BeautifulSoup(req_list.content, 'html.parser')
            shows = soup_list.find_all('tr', class_='ranking-list')

            for show in shows:
                anchor = show.find('a')
                if anchor is None:
                    # Ranking row without a link: nothing to scrape.
                    continue
                link = anchor.get('href')
                data_row = createRow(link, fieldnames)
                # Previously the row was only printed and the CSV stayed empty.
                writer.writerow(data_row)
                print(data_row)

In [107]:
# Run the scraper over ranking pages 0-9 (start_page defaults to 0).
genDataset(10, 'mal-data-12-11-2020.csv')

{'name': 'Fullmetal Alchemist Brotherhood', 'type': 'TV', 'episodes': '64', 'status': 'Finished Airing', 'aired_start': '05-04-2009', 'aired_end': '04-07-2010', 'broadcast_time': 'Sunday 17:00', 'producers': 'Aniplex, Square Enix, Mainichi Broadcasting System, Studio Moriken', 'licensors': 'Funimation, Aniplex of America', 'studios': 'Bones', 'source': 'Manga', 'genres': 'Action, Military, Adventure, Comedy, Drama, Magic, Fantasy, Shounen', 'episode_length': '24', 'rating': 'R', 'score_and_scorers': '9.22, 1226238', 'members': '2022089', 'favourites': '167577', 'description': '"In order for something to be obtained, something of equal value must be lost."\rAlchemy is bound by this Law of Equivalent Exchange—something the young brothers Edward and Alphonse Elric only realize after attempting human transmutation: the one forbidden act of alchemy. They pay a terrible price for their transgression—Edward loses his left leg, Alphonse his physical body. It is only by the desperate sacrifice 

ValueError: time data 'Wednesdays at 02 05 (JST)' does not match format '%b %d %Y'