# Anime Analysis by Ben Osborn and OsbornAI

## This project consists of the scraping and creation of a dataset containing information about all anime listed on MyAnimeList. This data is analysed, and a model is created to predict an anime's rating based on its page's features

### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
from csv import DictWriter
import os
import pandas as pd
import matplotlib.pyplot as plt
import time

%matplotlib inline

### Scraping and dataset creation

#### Parses through the labels from the soup elements

In [2]:
# Change name to clean label
def parseLabel(element):
    """Return the value part of a 'Label: value' element as one clean string.

    The element's text is cut at every ':'; the piece before the first colon
    (the label) is thrown away. Each remaining piece has its newlines removed
    and its surrounding whitespace stripped, and the pieces are joined with a
    single space, so colons inside the value come out as spaces. Text that
    contains no colon yields ''.
    """
    _, *value_parts = element.text.split(':')
    cleaned_parts = (part.replace('\n', '').strip() for part in value_parts)
    return " ".join(cleaned_parts)

#### Parses through the list soup elements

In [3]:
def parseList(element):
    """Join the text of every <a> tag inside the element with ', '.

    Returns '' when the element holds no <a> tags.
    """
    # TODO: check that <a> tags exist for every single page and developer.
    # NOTE(review): every link text is kept, so placeholder links (the
    # 'add some' values visible in the cleaned data) end up in the result.
    return ", ".join(anchor.text for anchor in element.find_all('a'))

#### Define the field names globally

In [4]:
# Column names, in order, used for the scraped rows and the CSV header.
field_names = [
    'name_english',
    'name_japanese',
    'show_type',
    'episodes',
    'status',
    'aired',
    'broadcast_time',
    'producers',
    'licensors',
    'studios',
    'source',
    'genres',
    'episode_length',
    'rating',
    'score_and_scorers',
    'members',
    'favorites',
    'description',
]

#### Scrapes the page from the show and returns a row of data

In [5]:
def createRow(url, timeout=30):
    """Scrape one show page and return its data as a row dictionary.

    Args:
        url: Address of the show page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A dict with one key per entry of the global ``field_names``. Fields
        that are absent from the page, or that fail to parse, stay ''.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the information side panel.
    """
    # Side-panel label -> (row field, parser applied to that panel's div).
    panel_parsers = {
        "English": ('name_english', parseLabel),
        "Japanese": ('name_japanese', parseLabel),
        "Type": ('show_type', parseLabel),
        "Episodes": ('episodes', parseLabel),
        "Status": ('status', parseLabel),
        "Aired": ('aired', parseLabel),
        "Broadcast": ('broadcast_time', parseLabel),
        "Producers": ('producers', parseList),
        "Licensors": ('licensors', parseList),
        "Studios": ('studios', parseList),
        "Source": ('source', parseLabel),
        "Genres": ('genres', parseList),
        "Duration": ('episode_length', parseLabel),
        # Only the first word of the rating text is kept.
        "Rating": ('rating', lambda panel: parseLabel(panel).split(' ')[0]),
        # Text of every <span> after the first one, comma separated.
        "Score": ('score_and_scorers',
                  lambda panel: ", ".join([part.text for part in panel.find_all('span')][1:])),
        # Thousands separators are removed so the counts can be cast to int.
        "Members": ('members', lambda panel: "".join(parseLabel(panel).split(','))),
        "Favorites": ('favorites', lambda panel: "".join(parseLabel(panel).split(','))),
    }

    ret_dict = {field_name: '' for field_name in field_names}

    req = requests.get(url, timeout=timeout)
    # Fail with the HTTP status instead of trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')

    side_panel = soup.find('td', class_='borderClass')
    side_panel_subdiv = side_panel.find('div') if side_panel is not None else None
    if side_panel_subdiv is None:
        raise ValueError(f"No information side panel found at '{url}'.")
    side_panel_divs = side_panel_subdiv.find_all('div')

    try:
        ret_dict['description'] = soup.find('p', itemprop='description').text

    except AttributeError as e:
        # soup.find() returned None: the page has no description paragraph.
        print(f"Encountered an error '{e}' for description at '{url}'.")

    for panel in side_panel_divs:
        # Best effort: one unparsable panel must not discard the whole row.
        try:
            label = panel.text.split(':')[0].strip()

            if label in panel_parsers:
                field_name, parser = panel_parsers[label]
                ret_dict[field_name] = parser(panel)

        except Exception as e:
            print(f"Encountered an error '{e}' at '{url}'.")

    return ret_dict
    
# createRow("https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood")

#### Go through the specified number of pages, scrape the information for each show, and store it in CSV files

In [6]:
# Introduce sharding of CSV to this

def genDataset(end_page, csv_filename, start_page=0, timeout=30):
    """Scrape the top-anime ranking pages and write one CSV file per page.

    Page ``i`` is requested with ``limit=i*50``. Every show linked from it is
    scraped with ``createRow`` and written to ``csv/<csv_filename>-<i>.csv``
    inside the current working directory.

    Args:
        end_page: Page index to stop at (exclusive).
        csv_filename: Prefix of the generated file names.
        start_page: First page index to scrape; pass the page a previous run
            stopped at to resume it.
        timeout: Seconds to wait for each ranking-page request.
    """
    csv_dir = os.path.join(os.getcwd(), 'csv')
    # Create the output folder up front so the first open() cannot fail on it.
    os.makedirs(csv_dir, exist_ok=True)

    for i in range(start_page, end_page):

        print(f"Scraping page {i}...")

        csv_path = os.path.join(csv_dir, f'{csv_filename}-{i}.csv')

        # Explicit UTF-8: rows contain Japanese titles, which the platform
        # default encoding cannot always represent.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = DictWriter(csvfile, fieldnames=field_names)

            writer.writeheader()

            url_page = f"https://myanimelist.net/topanime.php?limit={i*50}"
            req_list = requests.get(url_page, timeout=timeout)
            soup_list = BeautifulSoup(req_list.content, 'html.parser')
            shows = soup_list.find_all('tr', class_='ranking-list')

            for show in shows:
                # Reset per show so an error never reports the previous link.
                link = 'Unknown'
                try:
                    link = show.find('a').get('href')
                    data_row = createRow(link)
                    writer.writerow(data_row)

                except Exception as e:
                    # Best effort: log the failing show and keep scraping.
                    print(f"Encountered error '{e}' at '{link}'.")

    print("Dataset creation complete!")
    
# Testing on a small dataset to begin with
# genDataset(5, 'mal-data-11-13-2020', start_page=3)

### Data handling and cleaning

#### Concatenate CSV files and put them into a data frame

In [7]:
data_path = os.path.join(os.getcwd(), 'csv')

# Read only the CSV files, in sorted order: the folder may hold other entries
# (hidden files, checkpoint folders) and os.listdir() gives no stable order.
csv_names = sorted(name for name in os.listdir(data_path) if name.endswith('.csv'))

# The first CSV column becomes the index of each frame.
dfs = [pd.read_csv(os.path.join(data_path, name), index_col=0) for name in csv_names]

df = pd.concat(dfs)

#### Type conversion and data cleanup

In [8]:
# Drop incomplete rows first: a missing value would make astype(int) raise.
df = df.dropna()

# Rows with an 'Unknown' episode count cannot be cast either; copy() so the
# column assignments below act on an independent frame, not on a slice.
df = df[df['episodes'] != 'Unknown'].copy()

for count_column in ('episodes', 'favorites', 'members'):
    df[count_column] = df[count_column].astype(int)

In [9]:
def parseScoreAndScorer(score_and_scorers_raw, position):
    """Return one number out of a ', '-separated string as a float.

    Args:
        score_and_scorers_raw: Text such as '9.16, 1234567'.
        position: Index of the ', '-separated piece to convert.

    Returns:
        The chosen piece as a float, or NaN when the input is not a string,
        the piece does not exist, or it is not numeric. NaN (rather than a
        datetime sentinel) keeps the resulting column numeric.
    """
    try:
        return float(score_and_scorers_raw.split(', ')[position].strip())
    except (AttributeError, IndexError, ValueError):
        return float('nan')
    
# Parse both values first and drop the rows where either one is missing;
# casting to int before that would raise on the missing entries.
df['score'] = df['score_and_scorers'].apply(lambda s: parseScoreAndScorer(s, 0))
df['scorer'] = df['score_and_scorers'].apply(lambda s: parseScoreAndScorer(s, 1))
df = df.dropna()

df['score'] = df['score'].astype(float)
df['scorer'] = df['scorer'].astype(int)
df = df.drop('score_and_scorers', axis=1)

#### Time data parsing

In [10]:
def broadcastParse(broadcast_raw):
    """Reduce a raw broadcast string to '<day> <time>' text, or pd.NaT.

    The text is split on ' at ' and each piece is stripped. pd.NaT is
    returned when there is no ' at ' separator or when any piece is exactly
    'Unknown'. Otherwise the first piece loses its last character
    (presumably the plural 's' of a weekday such as 'Sundays'), the second
    piece is cut to its first five characters, and all pieces are joined
    with single spaces.
    """
    pieces = list(map(str.strip, broadcast_raw.split(' at ')))

    if len(pieces) == 1 or 'Unknown' in pieces:
        return pd.NaT

    day_part, time_part, *remaining = pieces
    return " ".join([day_part[:-1], time_part[:5], *remaining])

# Replace the raw broadcast text with its parsed form, discard every row that
# still has a missing value, then convert the remaining strings to timestamps.
broadcast_strings = df['broadcast_time'].apply(broadcastParse)
df = df.assign(broadcast_time=broadcast_strings).dropna()
df['broadcast_time'] = pd.to_datetime(df['broadcast_time'], format='%A %H %M')

In [11]:
# This one needs to parse the minutes/hours for the episode length
def timeParse(time_raw):
    """Convert a duration such as '1 hr. 30 min.' to whole minutes.

    Args:
        time_raw: Duration text made of '<number> hr.' and/or
            '<number> min.' parts; any other words are ignored.

    Returns:
        The total number of minutes as an int, or None when the text
        contains neither an hour part nor a minute part.
    """
    tokens = time_raw.split()
    total_minutes = None

    # Look at each (number, unit) pair so the parts may appear alone, together,
    # or followed by extra words such as 'per ep.'.
    for value, unit in zip(tokens, tokens[1:]):
        if not value.isdigit():
            continue
        if unit == 'hr.':
            total_minutes = (total_minutes or 0) + int(value) * 60
        elif unit == 'min.':
            total_minutes = (total_minutes or 0) + int(value)

    return total_minutes
    
# Replace the raw duration text with the value returned by timeParse.
episode_minutes = df['episode_length'].apply(timeParse)
df['episode_length'] = episode_minutes

In [12]:
def parseAired(aired_raw, position):
    """Extract one date from an 'aired' range as a 'DD MM YYYY' string.

    Args:
        aired_raw: Range text whose parts are separated by ' to ', with
            dates written like 'Apr 5, 2009'.
        position: Index of the part to parse (0 is the text before the
            first ' to ', 1 the text after it).

    Returns:
        The chosen date formatted with '%d %m %Y', or ``pd.NaT`` when that
        part does not exist or is not a full month-day-year date.
    """
    aired_split = aired_raw.split(' to ')
    try:
        # 'Apr 5, 2009' -> 'Apr 5 2009' so it matches the '%b %d %Y' pattern.
        aired_parsed = " ".join(aired_split[position].strip().split(", "))
        parsed_time = time.strptime(aired_parsed, '%b %d %Y')
    except (IndexError, ValueError):
        # Only the expected failures (missing part, unparsable date) are
        # mapped to NaT; anything else is a real error and propagates.
        return pd.NaT

    return time.strftime('%d %m %Y', parsed_time)

# Parse both ends of the 'aired' range into 'DD MM YYYY' text, convert that
# text to timestamps, then drop incomplete rows and the raw column.
aired_start_text = df['aired'].apply(parseAired, args=(0,))
aired_end_text = df['aired'].apply(parseAired, args=(1,))

df['aired_start'] = pd.to_datetime(aired_start_text, format='%d %m %Y')
df['aired_end'] = pd.to_datetime(aired_end_text, format='%d %m %Y')

df = df.dropna().drop('aired', axis=1)

In [17]:
# Per-column maximum of the cleaned frame, shown as the cell output.
df.max()

name_japanese                                                黒子のバスケ
show_type                                                        TV
episodes                                                        201
status                                              Finished Airing
broadcast_time                                  1900-01-01 23:45:00
producers                                                  add some
licensors                                                  add some
studios                                                    ufotable
source                                                    Web manga
genres                                             Thriller, Sci-Fi
episode_length                                                   50
rating                                                           R+
members                                                     2329007
favorites                                                    167719
description       Young Thorfinn grew up listeni