# Data Exploration and Analysis
The following notebook will take a look at the data sets that will be used in the recommendation engine and prep the data for use in the engine itself.

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Read in Data

In [2]:
animes_df = pd.read_csv('data/animes.csv')
users_df = pd.read_csv('data/profiles.csv')
reviews_df = pd.read_csv('data/reviews.csv')

print("Animes DF: {}\nUsers DF: {}\nReviews DF: {}".format(animes_df.shape, users_df.shape, reviews_df.shape))

Animes DF: (19311, 12)
Users DF: (81727, 5)
Reviews DF: (192112, 7)


### Animes DataFrame

In [3]:
animes_df.head(5)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [53]:
animes_df.genre.isnull().sum()

0

In [4]:
genres = []
for genre_set in animes_df.genre:
    values = genre_set.strip("[]").split(",")
    values = [w.strip()[1:-1] for w in values]
    genres.extend(values)

genres = set(genres)
print("The number of genres is {}.".format(len(genres)))
print(genres)

The number of genres is 44.
{'', 'Yuri', 'Slice of Life', 'Police', 'Fantasy', 'Cars', 'Vampire', 'Romance', 'Thriller', 'Mecha', 'Parody', 'Horror', 'Military', 'Sci-Fi', 'Samurai', 'Shounen', 'Shounen Ai', 'Space', 'Martial Arts', 'Dementia', 'Historical', 'Mystery', 'Kids', 'Hentai', 'Drama', 'Comedy', 'Adventure', 'Shoujo', 'School', 'Demons', 'Supernatural', 'Game', 'Super Power', 'Shoujo Ai', 'Magic', 'Ecchi', 'Action', 'Josei', 'Harem', 'Music', 'Yaoi', 'Sports', 'Seinen', 'Psychological'}


Notice the first element is empty. This happend when calling set(genres). We can quickly delete that.

In [5]:
genres = list(genres)
genres.pop(0)
print("The number of genres is {}.".format(len(genres)))
print(genres)

The number of genres is 43.
['Yuri', 'Slice of Life', 'Police', 'Fantasy', 'Cars', 'Vampire', 'Romance', 'Thriller', 'Mecha', 'Parody', 'Horror', 'Military', 'Sci-Fi', 'Samurai', 'Shounen', 'Shounen Ai', 'Space', 'Martial Arts', 'Dementia', 'Historical', 'Mystery', 'Kids', 'Hentai', 'Drama', 'Comedy', 'Adventure', 'Shoujo', 'School', 'Demons', 'Supernatural', 'Game', 'Super Power', 'Shoujo Ai', 'Magic', 'Ecchi', 'Action', 'Josei', 'Harem', 'Music', 'Yaoi', 'Sports', 'Seinen', 'Psychological']


In [6]:
animes_df.genre[0].find("Sports")

12

In [7]:
def split_genres(movie):
    try:
        if movie.find(genre) > -1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0

for genre in genres:
    animes_df[genre] = animes_df['genre'].apply(split_genres)

In [8]:
animes_df.iloc[59]

uid                                                            269
title                                                       Bleach
synopsis         Ichigo Kurosaki is an ordinary high schooler—u...
genre            ['Action', 'Adventure', 'Comedy', 'Super Power...
aired                                  Oct 5, 2004 to Mar 27, 2012
episodes                                                     366.0
members                                                    1002578
popularity                                                      25
ranked                                                       757.0
score                                                         7.87
img_url          https://cdn.myanimelist.net/images/anime/3/404...
link                      https://myanimelist.net/anime/269/Bleach
Yuri                                                             0
Slice of Life                                                    0
Police                                                        

In [13]:
top_ranked = animes_df.sort_values(by='ranked', ascending=True).drop_duplicates()
top_ranked.head(20)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,...,Magic,Ecchi,Action,Josei,Harem,Music,Yaoi,Sports,Seinen,Psychological
3080,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,...,1,0,1,0,0,0,0,0,0,0
773,9253,Steins;Gate,The self-proclaimed mad scientist Rintarou Oka...,"['Thriller', 'Sci-Fi']","Apr 6, 2011 to Sep 14, 2011",24.0,1331710,7,2.0,9.11,...,0,0,0,0,0,0,0,0,0,0
772,11061,Hunter x Hunter (2011),Hunter x Hunter is set in a world where Hunte...,"['Action', 'Adventure', 'Fantasy', 'Shounen', ...","Oct 2, 2011 to Sep 24, 2014",148.0,1052761,20,3.0,9.11,...,0,0,1,0,0,0,0,0,0,0
18278,32281,Kimi no Na wa.,"Mitsuha Miyamizu, a high school girl, yearns t...","['Romance', 'Supernatural', 'School', 'Drama']","Aug 26, 2016",1.0,1139878,15,4.0,9.09,...,0,0,0,0,0,0,0,0,0,0
18276,38524,Shingeki no Kyojin Season 3 Part 2,Seeking to restore humanity’s diminishing hope...,"['Action', 'Drama', 'Fantasy', 'Military', 'My...","Apr 29, 2019 to Jul 1, 2019",10.0,446370,175,5.0,9.07,...,0,0,1,0,0,0,0,0,0,0
18067,28977,Gintama°,"Gintoki, Shinpachi, and Kagura return as the f...","['Action', 'Comedy', 'Historical', 'Parody', '...","Apr 8, 2015 to Mar 30, 2016",51.0,281594,351,6.0,9.05,...,0,0,1,0,0,0,0,0,0,0
18066,9969,Gintama',"After a one-year hiatus, Shinpachi Shimura ret...","['Action', 'Sci-Fi', 'Comedy', 'Historical', '...","Apr 4, 2011 to Mar 26, 2012",51.0,278110,353,7.0,9.04,...,0,0,1,0,0,0,0,0,0,0
18065,820,Ginga Eiyuu Densetsu,The 150-year-long stalemate between the two in...,"['Military', 'Sci-Fi', 'Space', 'Drama']","Jan 8, 1988 to Mar 17, 1997",110.0,175423,620,8.0,9.03,...,0,0,0,0,0,0,0,0,0,0
18064,35180,3-gatsu no Lion 2nd Season,"Now in his second year of high school, Rei Kir...","['Drama', 'Game', 'Seinen', 'Slice of Life']","Oct 14, 2017 to Mar 31, 2018",22.0,169544,657,9.0,9.02,...,0,0,0,0,0,0,0,0,1,0
18063,28851,Koe no Katachi,"As a wild youth, elementary school student Sho...","['Drama', 'School', 'Shounen']","Sep 17, 2016",1.0,842277,53,10.0,9.01,...,0,0,0,0,0,0,0,0,0,0


In [21]:
def get_top_rated(n, df=animes_df):
    '''
    INPUT:
    df - animes df from cells above
    n - number of recs to return
    
    OUTPUT:
    recs -  the name and img url of the all time top rated animes
    '''
    recs = []
    top_ranked = df.sort_values(by='ranked', ascending=True).drop_duplicates()
    
    for i in range(n):
        recs.append((top_ranked.iloc[i].title, top_ranked.iloc[i].img_url))
                    
    return recs

In [22]:
top_rated = get_top_rated(10)
print(top_rated[0])

[('Fullmetal Alchemist: Brotherhood', 'https://cdn.myanimelist.net/images/anime/1223/96541.jpg'), ('Steins;Gate', 'https://cdn.myanimelist.net/images/anime/5/73199.jpg'), ('Hunter x Hunter (2011)', 'https://cdn.myanimelist.net/images/anime/11/33657.jpg'), ('Kimi no Na wa.', 'https://cdn.myanimelist.net/images/anime/5/87048.jpg'), ('Shingeki no Kyojin Season 3 Part 2', 'https://cdn.myanimelist.net/images/anime/1517/100633.jpg'), ('Gintama°', 'https://cdn.myanimelist.net/images/anime/3/72078.jpg'), ("Gintama'", 'https://cdn.myanimelist.net/images/anime/4/50361.jpg'), ('Ginga Eiyuu Densetsu', 'https://cdn.myanimelist.net/images/anime/13/13225.jpg'), ('3-gatsu no Lion 2nd Season', 'https://cdn.myanimelist.net/images/anime/3/88469.jpg'), ('Koe no Katachi', 'https://cdn.myanimelist.net/images/anime/1122/96435.jpg')]


In [24]:
print(top_rated[5])

('Gintama°', 'https://cdn.myanimelist.net/images/anime/3/72078.jpg')
