# Anime Analysis by Ben Osborn and OsbornAI

## This project scrapes MyAnimeList to build a dataset containing information about every anime listed there. The data is then analysed, and a model is created to predict an anime's rating from the features of its page.

### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv

### Scraping and dataset creation

#### Parses through the labels from the soup elements

In [2]:
# Change name to clean label
def parseLabel(element):
    """Return the value part of a "Label: value" element.

    Everything after the first colon in ``element.text`` is kept, newlines
    are removed and surrounding whitespace is stripped. Colons that belong
    to the value itself (for example a time such as "17:00") are preserved.

    Args:
        element: Object exposing its text content as ``.text``.

    Returns:
        str: The cleaned value, or an empty string when the text contains
        no colon.
    """
    # Split on the first colon only, so colons inside the value survive.
    _, _, value = element.text.partition(':')

    return value.replace('\n', '').strip()

#### Parses through the list soup elements

In [3]:
def parseList(element):
    """Join the text of every ``a`` tag found under ``element`` with ", ".

    Returns an empty string when ``element.find_all('a')`` yields nothing.
    """
    # TODO: check that <a> tags exist for every single page and developer.
    link_texts = []
    for anchor in element.find_all('a'):
        link_texts.append(anchor.text)

    separator = ", "
    return separator.join(link_texts)

#### Define the field names globally

In [4]:
# Column names of the dataset, in output order. The same list provides the
# keys of each scraped row and the field names given to the CSV writer.
field_names = [
    'name',
    'show_type',
    'episodes',
    'status',
    'aired',
    'broadcast_time',
    'producers',
    'licensors',
    'studios',
    'source',
    'genres',
    'episode_length',
    'rating',
    'score_and_scorers',
    'members',
    'favorites',
    'description',
]

#### Scrapes the page from the show and returns a row of data

In [18]:
def createRow(url, page_number=None):
    """Scrape one show page and return its data as a row dictionary.

    Every name in ``field_names`` starts out as the placeholder ``'NaT'`` and
    is overwritten only when the matching part of the page is parsed
    successfully. Errors are printed rather than raised, so a row (possibly
    still holding placeholders) is always returned.

    Args:
        url: Address of the show's page.
        page_number: Ranking page the show was found on. It is only used in
            the printed error messages, so it is optional and the function
            can be called with just a URL.

    Returns:
        dict: Maps each name in ``field_names`` to the scraped string, or to
        ``'NaT'`` when that value could not be scraped.
    """
    ret_dict = {field_name: 'NaT' for field_name in field_names}

    def first_word(panel):
        # Keep only the first space-separated token of the label's value.
        return parseLabel(panel).split(' ')[0]

    def span_texts(panel):
        # Text of every <span> in the panel except the first one.
        return ", ".join([part.text for part in panel.find_all('span')][1:])

    def without_commas(panel):
        # Drop the thousands separators from a formatted number.
        return "".join(parseLabel(panel).split(','))

    # Side-panel label -> (dataset field, function extracting its value).
    extractors = {
        "English": ('name', parseLabel),
        "Type": ('show_type', parseLabel),
        "Episodes": ('episodes', parseLabel),
        "Status": ('status', parseLabel),
        "Aired": ('aired', parseLabel),
        "Broadcast": ('broadcast_time', parseLabel),
        "Producers": ('producers', parseList),
        "Licensors": ('licensors', parseList),
        "Studios": ('studios', parseList),
        "Source": ('source', parseLabel),
        "Genres": ('genres', parseList),
        "Duration": ('episode_length', parseLabel),
        "Rating": ('rating', first_word),
        "Score": ('score_and_scorers', span_texts),
        "Members": ('members', without_commas),
        "Favorites": ('favorites', without_commas),
    }

    try:
        # A timeout keeps one unresponsive request from stalling the caller
        # forever; the resulting exception is reported by the handler below.
        req = requests.get(url, timeout=30)
        # Report HTTP errors instead of parsing an error page as a show page.
        req.raise_for_status()
        soup = BeautifulSoup(req.content, 'html.parser')

        side_panel = soup.find('td', class_='borderClass')
        side_panel_subdiv = side_panel.find('div')
        side_panel_divs = side_panel_subdiv.find_all('div')

        try:
            ret_dict['description'] = soup.find('p', itemprop='description').text

        except Exception as e:
            print(f"Encountered an error '{e}' for description on an unknown anime on page number {page_number}!")

        for panel in side_panel_divs:
            # Each panel is handled on its own so that one malformed entry
            # does not discard the fields that were parsed correctly.
            try:
                label = panel.text.partition(':')[0].strip()
                if label in extractors:
                    field, extract = extractors[label]
                    ret_dict[field] = extract(panel)

            except Exception as e:
                print(f"Encountered an error '{e}' on anime {ret_dict['name']} on page number {page_number}!")

    except Exception as e:
        print(f"Encountered an error '{e}' on an unknown anime on page number {page_number}!")

    return ret_dict
    
# createRow("https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood")

#### Go through the specified number of pages, scrape the information for each show, and store it in a CSV file

In [19]:
# Multithread this to make it faster

def genDataset(pages_to_scrape, csv_filename, start_page=0):
    """Scrape the ranking pages and write one CSV row per listed show.

    Ranking pages ``start_page`` up to (but not including)
    ``pages_to_scrape`` are fetched. Every show linked from a page is scraped
    with ``createRow`` and written to the CSV file straight away, with a
    two-second pause after each show.

    Args:
        pages_to_scrape: Index one past the last ranking page to fetch.
        csv_filename: Path of the CSV file to write.
        start_page: Index of the first ranking page to fetch. A non-zero
            value is treated as resuming an earlier run, so rows are appended
            to the existing file instead of overwriting it.

    Raises:
        Whatever ``requests.get`` raises for a ranking page (including a
        timeout) is not caught here and propagates to the caller.
    """
    # Resuming must not wipe the rows collected by the earlier run.
    file_mode = 'a' if start_page else 'w'

    # An explicit encoding keeps non-ASCII text writable whatever the
    # platform's default encoding is.
    with open(csv_filename, file_mode, newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)

        for i in range(start_page, pages_to_scrape):
            url_page = f"https://myanimelist.net/topanime.php?limit={i*50}"
            # A timeout turns a stalled connection into an error rather
            # than an endless wait.
            req_list = requests.get(url_page, timeout=30)
            soup_list = BeautifulSoup(req_list.content, 'html.parser')
            shows = soup_list.find_all('tr', class_='ranking-list')

            for show in shows:
                anchor = show.find('a')
                if anchor is None:
                    # No link in this ranking row, so there is nothing to scrape.
                    continue

                link = anchor.get('href')
                data_row = createRow(link, i)
                writer.writerow(data_row)
                time.sleep(2)

    print("Dataset creation complete!")

In [17]:
# Run the scraper over ranking pages 0-299 and store the rows in the CSV
# file named below.
genDataset(300, 'mal-data-12-11-2020.csv')