# Anime Analysis by Ben Osborn and OsbornAI

## This project scrapes MyAnimeList and builds a dataset containing information about every anime listed there. The data is analysed, and a model is created to predict each anime's rating from its page's features

### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
from csv import DictWriter
import os
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Scraping and dataset creation

#### Parses through the labels from the soup elements

In [2]:
# Change name to clean label
def parseLabel(element):
    """Return the value part of a "Label: value" side-panel element.

    The element's text is cut at every colon, the first piece (the label)
    is discarded, and each remaining piece has its newlines removed and its
    surrounding whitespace stripped. The cleaned pieces are joined with a
    single space, so colons inside the value come back as spaces.
    """
    value_parts = element.text.split(':')[1:]
    return " ".join(part.replace('\n', '').strip() for part in value_parts)

#### Parses through the list soup elements

In [3]:
def parseList(element):
    """Return the text of every <a> tag inside `element`, joined by ", ".

    TODO: still need to check that <a> tags exist for every single page
    and developer.
    """
    names = []
    for anchor in element.find_all('a'):
        names.append(anchor.text)

    return ", ".join(names)

#### Define the field names globally

In [4]:
# Column names, in order, shared by the row dictionaries and the CSV header.
field_names = [
    'name_english',
    'name_japanese',
    'show_type',
    'episodes',
    'status',
    'aired',
    'broadcast_time',
    'producers',
    'licensors',
    'studios',
    'source',
    'genres',
    'episode_length',
    'rating',
    'score_and_scorers',
    'members',
    'favorites',
    'description',
]

#### Scrapes the page from the show and returns a row of data

In [5]:
def createRow(url, timeout=30):
    """Scrape a single show page and return one row of data.

    Args:
        url: Address of the show page to download.
        timeout: Seconds to wait for the server before `requests` gives up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        A dict with one key per entry in the module-level `field_names`.
        Fields that are missing from the page, or whose parsing fails, are
        left as empty strings.

    Raises:
        Any exception raised by the download itself, or by looking up the
        side panel when the page does not have the expected layout, is
        propagated to the caller. Failures while parsing the description or
        an individual side-panel entry are printed and skipped instead.
    """
    # Reading a module-level name does not require a `global` declaration.
    ret_dict = {field_name: '' for field_name in field_names}

    # Side-panel label -> (output column, function that extracts the value).
    panel_parsers = {
        "English": ('name_english', parseLabel),
        "Japanese": ('name_japanese', parseLabel),
        "Type": ('show_type', parseLabel),
        "Episodes": ('episodes', parseLabel),
        "Status": ('status', parseLabel),
        "Aired": ('aired', parseLabel),
        "Broadcast": ('broadcast_time', parseLabel),
        "Producers": ('producers', parseList),
        "Licensors": ('licensors', parseList),
        "Studios": ('studios', parseList),
        "Source": ('source', parseLabel),
        "Genres": ('genres', parseList),
        "Duration": ('episode_length', parseLabel),
        # Keep only the first space-separated token of the rating text.
        "Rating": ('rating', lambda panel: parseLabel(panel).split(' ')[0]),
        # Join the text of every <span> except the first one.
        "Score": ('score_and_scorers',
                  lambda panel: ", ".join([part.text for part in panel.find_all('span')][1:])),
        # Remove the thousands separators from the counts.
        "Members": ('members', lambda panel: "".join(parseLabel(panel).split(','))),
        "Favorites": ('favorites', lambda panel: "".join(parseLabel(panel).split(','))),
    }

    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')

    side_panel = soup.find('td', class_='borderClass')
    side_panel_subdiv = side_panel.find('div')
    side_panel_divs = side_panel_subdiv.find_all('div')

    try:
        ret_dict['description'] = soup.find('p', itemprop='description').text

    except Exception as e:
        print(f"Encountered an error '{e}' for description at '{url}'.")

    for panel in side_panel_divs:
        try:
            # The text before the first colon identifies the entry.
            label = panel.text.split(':')[0].strip()

            parser_entry = panel_parsers.get(label)
            if parser_entry is None:
                continue

            field_name, parser = parser_entry
            ret_dict[field_name] = parser(panel)

        except Exception as e:
            # Best effort: one unparseable entry must not discard the row.
            print(f"Encountered an error '{e}' at '{url}'.")

    return ret_dict
    
# createRow("https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood")

#### Go through the specified range of ranking pages, scrape the information for each show, and store each page's rows in its own CSV file

In [6]:
# Introduce sharding of CSV to this

def genDataset(end_page, csv_filename, start_page=0):
    """Scrape ranking pages and write one CSV file per page.

    For every page index `i` in `range(start_page, end_page)` the ranking
    list at offset `i * 50` is downloaded, each listed show is scraped with
    `createRow`, and the rows are written to `csv/<csv_filename>-<i>.csv`
    under the current working directory.

    Args:
        end_page: Page index to stop at (exclusive).
        csv_filename: Prefix of the generated CSV file names.
        start_page: First page index to scrape; pass the index a previous
            run stopped at to resume it.

    A failure on an individual show is printed and the show is skipped.
    A failure to download a ranking page propagates to the caller.
    """
    # Make sure the output directory exists; open() does not create it.
    csv_dir = os.path.join(os.getcwd(), 'csv')
    os.makedirs(csv_dir, exist_ok=True)

    for i in range(start_page, end_page):

        print(f"Scraping page {i}...")

        # Fetch the ranking page before touching the output file so that a
        # failed download does not leave a truncated CSV behind.
        url_page = f"https://myanimelist.net/topanime.php?limit={i*50}"
        req_list = requests.get(url_page, timeout=30)
        soup_list = BeautifulSoup(req_list.content, 'html.parser')
        shows = soup_list.find_all('tr', class_='ranking-list')

        csv_path = os.path.join(csv_dir, f'{csv_filename}-{i}.csv')

        # Explicit UTF-8: the rows contain Japanese titles, which the
        # platform default encoding cannot always represent.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = DictWriter(csvfile, fieldnames=field_names)

            writer.writeheader()

            for show in shows:
                # Reset per show so an error never reports a stale link.
                link = 'Unknown'
                try:
                    link = show.find('a').get('href')
                    data_row = createRow(link)
                    writer.writerow(data_row)

                except Exception as e:
                    print(f"Encountered error '{e}' at '{link}'.")

    print("Dataset creation complete!")
    
# Testing on a small dataset to begin with
# genDataset(5, 'mal-data-11-13-2020', start_page=3)

### Data handling and cleaning

#### Concatenate CSV files and put them into a data frame

In [7]:
data_path = os.path.join(os.getcwd(), 'csv')

# Only pick up CSV files: the directory listing can also contain other
# entries (for example notebook checkpoint folders) that read_csv cannot
# load. Sorting makes the row order of the combined frame reproducible,
# because os.listdir returns entries in arbitrary order.
csv_names = sorted(
    name for name in os.listdir(data_path) if name.lower().endswith('.csv')
)

# The first CSV column is used as the index of each frame.
dfs = [pd.read_csv(os.path.join(data_path, name), index_col=0) for name in csv_names]

df = pd.concat(dfs)

#### Converting episodes, favorites and members from string to numerical and dropping rows with missing values

In [8]:
# Discard shows whose episode count is 'Unknown', then every row that still
# has a missing value. This has to happen before the integer casts below,
# because astype(int) raises on missing values. The explicit copy makes the
# following column assignments operate on an independent frame.
df = df[df['episodes'] != 'Unknown'].dropna().copy()

df['episodes'] = df['episodes'].astype(int)

df['favorites'] = df['favorites'].astype(int)

df['members'] = df['members'].astype(int)

#### Time data parsing

In [9]:
def broadcastParse(broadcast_raw):
    """Reduce a raw broadcast string to "<day> <time>".

    The string is split on ' at '. When that yields a single piece, or any
    piece is exactly 'Unknown', `pd.NaT` is returned. Otherwise the last
    character of the first piece is dropped, the second piece is cut to its
    first five characters, and all pieces are joined with a space.
    """
    pieces = []
    for piece in broadcast_raw.split(' at '):
        pieces.append(piece.strip())

    if len(pieces) > 1 and 'Unknown' not in pieces:
        day, clock, *remainder = pieces
        return " ".join([day[:-1], clock[:5], *remainder])

    return pd.NaT

# Normalise every raw broadcast string with broadcastParse; entries it cannot
# parse come back as NaT.
df['broadcast_time'] = df['broadcast_time'].apply(broadcastParse)
# Remove every row that contains a missing value in any column, which
# includes the NaT entries produced above.
df = df.dropna()
# NOTE(review): the format reads a weekday name followed by hour and minute,
# yet the displayed values are all dated 1900-01-01, so the weekday does not
# appear to survive the conversion - confirm whether it is needed later.
df['broadcast_time'] = pd.to_datetime(df['broadcast_time'], format='%A %H %M')

In [10]:
# This one needs to parse the minutes/hours for the episode length
def timeParse(time_raw):
    """Convert a raw duration string to a whole number of minutes.

    Recognised shapes are "<n> min. ..." and "<n> hr. [<m> ...]"; the
    minutes after an hour count are optional, so "1 hr." yields 60.

    Args:
        time_raw: Duration text split on single spaces, e.g.
            "24 min. per ep." or "1 hr. 30 min.".

    Returns:
        The duration in minutes as an int, or None when the text does not
        start with a number followed by 'min.' or 'hr.' (for example
        "Unknown" or a seconds-only duration).
    """
    tokens = time_raw.split(' ')

    # Need at least "<number> <unit>"; anything else is not a duration.
    if len(tokens) < 2 or not tokens[0].isdecimal():
        return None

    amount = int(tokens[0])
    unit = tokens[1]

    if unit == 'min.':
        return amount

    if unit == 'hr.':
        # An hour count may stand alone, with no minutes after it.
        has_minutes = len(tokens) > 2 and tokens[2].isdecimal()
        extra_minutes = int(tokens[2]) if has_minutes else 0
        return amount * 60 + extra_minutes

    return None
    
# Replace each raw duration string with the value timeParse returns for it.
df['episode_length'] = df['episode_length'].apply(timeParse)

In [11]:
df

Unnamed: 0_level_0,name_japanese,show_type,episodes,status,aired,broadcast_time,producers,licensors,studios,source,genres,episode_length,rating,score_and_scorers,members,favorites,description
name_english,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Fullmetal Alchemist Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64,Finished Airing,"Apr 5, 2009 to Jul 4, 2010",1900-01-01 17:00:00,"Aniplex, Square Enix, Mainichi Broadcasting Sy...","Funimation, Aniplex of America",Bones,Manga,"Action, Military, Adventure, Comedy, Drama, Ma...",24,R,"9.22, 1232300",2023958,167719,"""In order for something to be obtained, someth..."
Steins;Gate,STEINS;GATE,TV,24,Finished Airing,"Apr 6, 2011 to Sep 14, 2011",1900-01-01 02:05:00,"Frontier Works, Media Factory, Movic, AT-X, Ka...",Funimation,White Fox,Visual novel,"Thriller, Sci-Fi",24,PG-13,"9.12, 884618",1635137,139832,The self-proclaimed mad scientist Rintarou Oka...
Gintama Season 4,銀魂°,TV,51,Finished Airing,"Apr 8, 2015 to Mar 30, 2016",1900-01-01 18:00:00,"TV Tokyo, Aniplex, Dentsu","Funimation, Crunchyroll",Bandai Namco Pictures,Manga,"Action, Comedy, Historical, Parody, Samurai, S...",24,PG-13,"9.11, 126846",363575,10347,"Gintoki, Shinpachi, and Kagura return as the f..."
Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,TV,148,Finished Airing,"Oct 2, 2011 to Sep 24, 2014",1900-01-01 10:55:00,"VAP, Nippon Television Network, Shueisha",Viz Media,Madhouse,Manga,"Action, Adventure, Fantasy, Shounen, Super Power",23,PG-13,"9.11, 827525",1457626,130325,Hunter x Hunter is set in a world where Hunter...
Gintama Season 2,銀魂',TV,51,Finished Airing,"Apr 4, 2011 to Mar 26, 2012",1900-01-01 18:00:00,"TV Tokyo, Aniplex, Dentsu, Trinity Sound, Mira...",add some,Sunrise,Manga,"Action, Sci-Fi, Comedy, Historical, Parody, Sa...",24,PG-13,"9.08, 140502",336989,6009,"After a one-year hiatus, Shinpachi Shimura ret..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
My Teen Romantic Comedy SNAFU TOO!,やはり俺の青春ラブコメはまちがっている。続,TV,13,Finished Airing,"Apr 3, 2015 to Jun 26, 2015",1900-01-01 01:28:00,"TBS, Marvelous AQL, NBCUniversal Entertainment...",Sentai Filmworks,feel.,Light novel,"Slice of Life, Comedy, Drama, Romance, School",24,PG-13,"8.28, 406147",647423,19348,Yahari Ore no Seishun Love Comedy wa Machigatt...
A Certain Scientific Railgun T,とある科学の超電磁砲[レールガン]T,TV,25,Finished Airing,"Jan 10, 2020 to Sep 25, 2020",1900-01-01 22:00:00,"Square Enix, Movic, AT-X, Warner Bros. Japan, ...",Funimation,J.C.Staff,Manga,"Action, Sci-Fi, Super Power",24,R,"8.27, 31127",119445,3494,"The Daihasei Festival has begun, and that of c..."
Toradora!,とらドラ！,TV,25,Finished Airing,"Oct 2, 2008 to Mar 26, 2009",1900-01-01 01:20:00,"Genco, Starchild Records, Magic Capsule, Yomiu...","NIS America, Inc.",J.C.Staff,Light novel,"Slice of Life, Comedy, Romance, School",24,PG-13,"8.27, 857555",1420197,47820,Ryuuji Takasu is a gentle high school student ...
Laid-Back Camp,ゆるキャン△,TV,12,Finished Airing,"Jan 4, 2018 to Mar 22, 2018",1900-01-01 23:00:00,"Sotsu, AT-X, Sony Music Communications, MAGES....",add some,C-Station,Manga,"Slice of Life, Comedy",23,PG-13,"8.27, 112057",266810,5434,While the perfect getaway for most girls her a...


In [15]:
def parseAired(aired_raw, position):
    """Extract one side of an "<start> to <end>" airing range.

    Args:
        aired_raw: Raw airing text, e.g. "Apr 5, 2009 to Jul 4, 2010".
        position: Index of the wanted part after splitting on ' to '
            (0 for the start, 1 for the end).

    Returns:
        The selected part with every ", " replaced by a single space, or
        `pd.NaT` when the text has no part at that position.
    """
    aired_split = aired_raw.split(' to ')

    # Only a missing part is an expected failure; a bare `except` would also
    # hide unrelated errors (and even KeyboardInterrupt).
    try:
        selected = aired_split[position]
    except IndexError:
        return pd.NaT

    return " ".join(selected.split(", "))

# Split the raw 'aired' range into its start (position 0) and end
# (position 1) parts, each stored in its own column.
df['aired_start'] = df['aired'].apply(lambda s: parseAired(s, 0))
df['aired_end'] = df['aired'].apply(lambda s: parseAired(s, 1))
# Remove every row that contains a missing value in any column, which
# includes rows where parseAired returned NaT for either part.
df = df.dropna()

Unnamed: 0_level_0,name_japanese,show_type,episodes,status,aired,broadcast_time,producers,licensors,studios,source,genres,episode_length,rating,score_and_scorers,members,favorites,description,aired_start
name_english,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
