# Anime Analysis by Ben Osborn and OsbornAI

## This project consists of the scraping and creation of a dataset containing information about all anime's listed on MyAnimeList. This data is analysed, and a model is created to predict the anime's rating based on the pages features

### Imports

In [92]:
import requests
from bs4 import BeautifulSoup
import time
import csv

### Scraping and dataset creation

#### Parses through the labels from the soup elements

In [5]:
# Change name to clean label
def parseLabel(element):
    string = element.text
    
    split_colens = string.split(':')
    removed_label = split_colens[1:]
    
    for i, label in enumerate(removed_label):
        removed_label[i] = label.replace('\n', '').strip()
    
    joined = " ".join(removed_label)
    
    return joined

#### Parses through the list soup elements

In [85]:
def parseList(element): # Have to check that a tags exist for ever single page and developer
    ret_list = [a.text for a in element.find_all('a')]
    
    return ", ".join(ret_list)

#### Define the field names globally

In [153]:
field_names = ['name', 'show_type', 'episodes', 'status', 'aired_start', 'aired_end', 
                   'broadcast_time', 'producers', 'licensors', 'studios', 'source',
                   'genres', 'episode_length', 'rating', 'score_and_scorers', 'members',
                   'favorites', 'description']

#### Scrapes the page from the show and returns a row of data

In [176]:
def createRow(url):
    global field_names
    
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')

    side_panel = soup.find('td', class_='borderClass')
    side_panel_subdiv = side_panel.find('div')
    side_panel_divs = side_panel_subdiv.find_all('div')
    
    ret_dict = {field_name: 'NaT' for field_name in field_names}
    
    ret_dict['description'] = soup.find('p', itemprop='description').text # --------- Check this one
    
    for panel in side_panel_divs:
        split = str(panel.text.split(':')[0].strip())
        if split == "English":
            print(parseLabel(panel))
            ret_dict['name'] = parseLabel(panel)
        if split == "Type":
            ret_dict['show_type'] = parseLabel(panel)
        if split == "Episodes":
            ret_dict['episodes'] = parseLabel(panel)
        if split == "Status":
            ret_dict['status'] = parseLabel(panel)
        if split == "Aired":
            aired_raw = parseLabel(panel)
#             Its in here because it goes to '?'
            aired = [time.strftime('%d-%m-%Y', time.strptime(date.strip().replace(',', ''), '%b %d %Y')) for date in aired_raw.split(' to ')]
            ret_dict['aired_start'] = aired[0]
            if len(aired) == 2:
                ret_dict['aired_end'] = aired[1]
        if split == "Broadcast":
            broadcast_time_raw = parseLabel(panel)
            broadcast_time_split = [element.strip() for element in broadcast_time_raw.split(' at ')]
            if len(broadcast_time_split) == 2:
                broadcast_time_split[0] = broadcast_time_split[0][:-1]
                broadcast_time_split[1] = broadcast_time_split[1][:5]
                broadcast_time_joined = " ".join(broadcast_time_split)
                ret_dict['broadcast_time'] = time.strftime('%A %H:%M', time.strptime(broadcast_time_joined, '%A %H %M')) # Good (Possibly unnecessary)
        if split == "Producers":
            ret_dict['producers'] = parseList(panel)
        if split == "Licensors":
            ret_dict['licensors'] = parseList(panel)
        if split == "Studios":
            ret_dict['studios'] = parseList(panel)
        if split == "Source":
            ret_dict['source'] = parseLabel(panel)
        if split == "Genres":
            ret_dict['genres'] = parseList(panel)
        if split == "Duration":
            ret_dict['episode_length'] = parseLabel(panel).split(' ')[0]
        if split == "Rating":
            ret_dict['rating'] = parseLabel(panel).split(' ')[0]
        if split == "Score":
            ret_dict['score_and_scorers'] = ", ".join([part.text for part in panel.find_all('span')][1:])
        if split == "Members":
            ret_dict['members'] = "".join(parseLabel(panel).split(','))
        if split == "Favorites":
            ret_dict['favorites'] = "".join(parseLabel(panel).split(','))
            
    return ret_dict
    
# createRow("https://myanimelist.net/anime/28977/Gintama%C2%B0")

#### Go through the amount of pages specified then scrape the information for each show, then store them to a CSV file

In [174]:
# Multithread this to make it faster

def genDataset(pages_to_scrape, csv_filename, start_page=0): # Where resume is the page of which it left off from
    global field_names
    
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        
        for i in range(start_page, pages_to_scrape):
            url_page = f"https://myanimelist.net/topanime.php?limit={i*50}"
            req_list = requests.get(url_page)
            soup_list = BeautifulSoup(req_list.content, 'html.parser')
            shows = soup_list.find_all('tr', class_='ranking-list')

            for j, show in enumerate(shows):
                link = show.find('a').get('href')
#                 This needs a try block so if it fails for one it will continue to work
#                 try:
                print(f"Page {i} show {j}")
                data_row = createRow(link)
                writer.writerow(data_row)
#                 except Exception as e:
#                     print(f"Encountered error '{e}' on page {i}!")

In [178]:
genDataset(10, 'mal-data-12-11-2020.csv')

Page 0 show 0
Fullmetal Alchemist Brotherhood
Page 0 show 1
Steins;Gate
Page 0 show 2
Gintama Season 4
Page 0 show 3
Hunter x Hunter
Page 0 show 4
Legend of the Galactic Heroes
Page 0 show 5
Gintama Season 2
Page 0 show 6
Attack on Titan Season 3 Part 2
Page 0 show 7
Gintama Enchousen
Page 0 show 8
March Comes In Like A Lion 2nd Season
Page 0 show 9
A Silent Voice
Page 0 show 10
Your Name.
Page 0 show 11
Gintama Season 5
Page 0 show 12
Gintama
Page 0 show 13
Page 0 show 14
Clannad ~After Story~
Page 0 show 15
Owarimonogatari Second Season
Page 0 show 16
Code Geass Lelouch of the Rebellion R2
Page 0 show 17
Haikyu!! 3rd Season
Page 0 show 18
Mob Psycho 100 II
Page 0 show 19
Gintama. Silver Soul Arc - Second Half War
Page 0 show 20
Spirited Away
Page 0 show 21
Page 0 show 22
Gintama. Silver Soul Arc
Page 0 show 23
Cowboy Bebop
Page 0 show 24
Kaguya-sama Love is War Season 2
Page 0 show 25
Descending Stories Showa Genroku Rakugo Shinju
Page 0 show 26
Page 0 show 27
Monogatari Series Secon

ValueError: time data '?' does not match format '%b %d %Y'