## GROUP 4 Project 2 AnimeList Analysis

###  Step 1: Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
#paths are relative paths, please don't run cell if you don't have the data downloaded
pre_2020 = pd.read_csv('data/anime_pre2020.csv', index_col=False)
post_2020 = pd.read_csv('data/anime_main.csv', index_col=False)

In [7]:
pre_2020.shape

(19311, 12)

In [12]:
#drop duplicates
pre_2020 = pre_2020.drop_duplicates()
pre_2020.shape

(16368, 12)

In [13]:
pre_2020.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [14]:
post_2020.shape

(20741, 35)

In [17]:
post_2020 = post_2020.drop_duplicates()
post_2020.shape

(20741, 35)

In [34]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(post_2020.head())

   mal_id                               title   type  score  scored_by  \
0    5114    Fullmetal Alchemist: Brotherhood     TV   9.13    1865027   
1   11061              Hunter x Hunter (2011)     TV   9.05    1502709   
2   38524  Shingeki no Kyojin Season 3 Part 2     TV   9.07    1322101   
3    9253                         Steins;Gate     TV   9.08    1248451   
4   28851                      Koe no Katachi  Movie   8.95    1392476   

            status  episodes airing_from   airing_to        source  members  \
0  Finished Airing      64.0  2009-04-05  2010-07-04         Manga  2922030   
1  Finished Airing     148.0  2011-10-02  2014-09-24         Manga  2408403   
2  Finished Airing      10.0  2019-04-29  2019-07-01         Manga  1871324   
3  Finished Airing      24.0  2011-04-06  2011-09-14  Visual novel  2262118   
4  Finished Airing       1.0  2016-09-17  2016-09-17         Manga  1992945   

   favorites episode_duration                          rating   sfw  \
0     204

In [25]:
#joining pre-covid data to post covid data. validate id and name are unique to each other
pre_2020.groupby(['uid','title']).count().head(20)['synopsis']

uid  title                          
1    Cowboy Bebop                       1
5    Cowboy Bebop: Tengoku no Tobira    1
6    Trigun                             1
7    Witch Hunter Robin                 1
8    Bouken Ou Beet                     1
15   Eyeshield 21                       1
16   Hachimitsu to Clover               1
17   Hungry Heart: Wild Striker         1
18   Initial D Fourth Stage             1
19   Monster                            1
20   Naruto                             1
21   One Piece                          1
22   Tennis no Ouji-sama                1
23   Ring ni Kakero 1                   1
24   School Rumble                      1
25   Sunabouzu                          1
26   Texhnolyze                         2
27   Trinity Blood                      1
28   Yakitate!! Japan                   1
29   Zipang                             1
Name: synopsis, dtype: int64

In [26]:
post_2020.groupby(['mal_id','title']).count().head(20)['synopsis']

mal_id  title                          
1       Cowboy Bebop                       1
5       Cowboy Bebop: Tengoku no Tobira    1
6       Trigun                             1
7       Witch Hunter Robin                 1
8       Bouken Ou Beet                     1
15      Eyeshield 21                       1
16      Hachimitsu to Clover               1
17      Hungry Heart: Wild Striker         1
18      Initial D Fourth Stage             1
19      Monster                            1
20      Naruto                             1
21      One Piece                          1
22      Tennis no Ouji-sama                1
23      Ring ni Kakero 1                   1
24      School Rumble                      1
25      Sunabouzu                          1
26      Texhnolyze                         1
27      Trinity Blood                      1
28      Yakitate!! Japan                   1
29      Zipang                             1
Name: synopsis, dtype: int64

In [28]:
#with validation, id and nanme are unique to each other. Considering more anime aired in 2022, we will be joining the 2020 data to 2022data. 
anime_merge = pd.merge(post_2020, pre_2020, 
                      left_on=  ['mal_id','title'],
                       right_on= ['uid','title'], 
                       how = 'left')

In [31]:
#cleaning column names
anime_merge.columns

Index(['mal_id', 'title', 'type', 'score_x', 'scored_by', 'status',
       'episodes_x', 'airing_from', 'airing_to', 'source', 'members_x',
       'favorites', 'episode_duration', 'rating', 'sfw', 'start_year',
       'start_season', 'broadcast_day', 'broadcast_time', 'genres', 'themes',
       'demographics', 'studios', 'producers', 'licensors', 'synopsis_x',
       'background', 'mal_created_at', 'mal_updated_at', 'picture_url',
       'mal_url', 'trailer_url', 'title_english', 'title_japanese',
       'title_synonyms', 'uid', 'synopsis_y', 'genre', 'aired', 'episodes_y',
       'members_y', 'popularity', 'ranked', 'score_y', 'img_url', 'link'],
      dtype='object')

In [35]:
new_name_dict = {'mal_id':'id_2022', 
                 'title': 'title', 
                 'type': 'type', 
                 'score_x': 'score_2022', 
                 'scored_by': 'scored_by_2022', 
                 'status': 'status',
                 'episodes_x': 'episodes_2022', 
                 'airing_from': 'airing_from', 
                 'airing_to': 'airing_to', 
                 'source': 'source', 
                 'members_x': 'members_2022',
                 'favorites': 'favorites_2022', 
                 'episode_duration': 'episode_duration', 
                 'rating': 'pg_rating', 
                 'sfw': 'sfw', 
                 'start_year': 'start_year',
                 'start_season': 'start_season', 
                 'broadcast_day': 'broadcast_day', 
                 'broadcast_time': 'broadcast_time', 
                 'genres' : 'genres', 
                 'themes': 'themes',
                 'demographics': 'demographics', 
                 'studios': 'studios', 
                 'producers': 'producers', 
                 'licensors': 'licensors', 
                 'synopsis_x': 'synopsis_2022',
                 'background': 'background', 
                 'mal_created_at': 'mal_created_at', 
                 'mal_updated_at': 'mal_updated_at', 
                 'picture_url': 'picture_url',
                 'mal_url' : 'mal_url', 
                 'trailer_url': 'trailer_url', 
                 'title_english': 'title_english', 
                 'title_japanese': 'title_japanese',
                 'title_synonyms': 'title_synonyms', 
                 'uid' : 'id_2020', 
                 'synopsis_y' :'synopsis_2020', 
                 'genre': 'genre', 
                 'aired': 'aired', 
                 'episodes_y': 'episodes_2020',
                 'members_y' : 'members_2020', 
                 'popularity' : 'popularity_2020', 
                 'ranked': 'ranked_2020', 
                 'score_y' : 'score_2020', 
                 'img_url' : 'img_url', 
                 'link': 'link'}

In [39]:
anime_merge = anime_merge.rename(columns=new_name_dict)

In [40]:
anime_merge.head(10)

Unnamed: 0,id_2022,title,type,score_2022,scored_by_2022,status,episodes_2022,airing_from,airing_to,source,...,synopsis_2020,genre,aired,episodes_2020,members_2020,popularity_2020,ranked_2020,score_2020,img_url,link
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.13,1865027,Finished Airing,64.0,2009-04-05,2010-07-04,Manga,...,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084.0,4.0,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
1,11061,Hunter x Hunter (2011),TV,9.05,1502709,Finished Airing,148.0,2011-10-02,2014-09-24,Manga,...,Hunter x Hunter is set in a world where Hunte...,"['Action', 'Adventure', 'Fantasy', 'Shounen', ...","Oct 2, 2011 to Sep 24, 2014",148.0,1052761.0,20.0,3.0,9.11,https://cdn.myanimelist.net/images/anime/11/33...,https://myanimelist.net/anime/11061/Hunter_x_H...
2,38524,Shingeki no Kyojin Season 3 Part 2,TV,9.07,1322101,Finished Airing,10.0,2019-04-29,2019-07-01,Manga,...,Seeking to restore humanity’s diminishing hope...,"['Action', 'Drama', 'Fantasy', 'Military', 'My...","Apr 29, 2019 to Jul 1, 2019",10.0,446370.0,175.0,5.0,9.07,https://cdn.myanimelist.net/images/anime/1517/...,https://myanimelist.net/anime/38524/Shingeki_n...
3,9253,Steins;Gate,TV,9.08,1248451,Finished Airing,24.0,2011-04-06,2011-09-14,Visual novel,...,The self-proclaimed mad scientist Rintarou Oka...,"['Thriller', 'Sci-Fi']","Apr 6, 2011 to Sep 14, 2011",24.0,1331710.0,7.0,2.0,9.11,https://cdn.myanimelist.net/images/anime/5/731...,https://myanimelist.net/anime/9253/Steins_Gate
4,28851,Koe no Katachi,Movie,8.95,1392476,Finished Airing,1.0,2016-09-17,2016-09-17,Manga,...,"As a wild youth, elementary school student Sho...","['Drama', 'School', 'Shounen']","Sep 17, 2016",1.0,842277.0,53.0,10.0,9.01,https://cdn.myanimelist.net/images/anime/1122/...,https://myanimelist.net/anime/28851/Koe_no_Kat...
5,32281,Kimi no Na wa.,Movie,8.86,1669497,Finished Airing,1.0,2016-08-26,2016-08-26,Original,...,"Mitsuha Miyamizu, a high school girl, yearns t...","['Romance', 'Supernatural', 'School', 'Drama']","Aug 26, 2016",1.0,1139878.0,15.0,4.0,9.09,https://cdn.myanimelist.net/images/anime/5/870...,https://myanimelist.net/anime/32281/Kimi_no_Na_wa
6,2904,Code Geass: Hangyaku no Lelouch R2,TV,8.91,1076510,Finished Airing,25.0,2008-04-06,2008-09-28,Original,...,"One year has passed since the Black Rebellion,...","['Action', 'Military', 'Sci-Fi', 'Super Power'...","Apr 6, 2008 to Sep 28, 2008",25.0,992196.0,27.0,17.0,8.93,https://cdn.myanimelist.net/images/anime/4/939...,https://myanimelist.net/anime/2904/Code_Geass_...
7,40028,Shingeki no Kyojin: The Final Season,TV,8.84,1073027,Finished Airing,16.0,2020-12-07,2021-03-29,Manga,...,,,,,,,,,,
8,199,Sen to Chihiro no Kamikakushi,Movie,8.78,1145304,Finished Airing,1.0,2001-07-20,2001-07-20,Original,...,"Stubborn, spoiled, and naïve, 10-year-old Chih...","['Adventure', 'Supernatural', 'Drama']","Jul 20, 2001",1.0,913212.0,40.0,20.0,8.9,https://cdn.myanimelist.net/images/anime/6/795...,https://myanimelist.net/anime/199/Sen_to_Chihi...
9,1575,Code Geass: Hangyaku no Lelouch,TV,8.7,1262624,Finished Airing,25.0,2006-10-06,2007-07-29,Original,...,"In the year 2010, the Holy Empire of Britannia...","['Action', 'Military', 'Sci-Fi', 'Super Power'...","Oct 6, 2006 to Jul 29, 2007",25.0,1231546.0,11.0,31.0,8.76,https://cdn.myanimelist.net/images/anime/5/503...,https://myanimelist.net/anime/1575/Code_Geass_...


In [41]:
#creating master data file
anime_merge.to_csv('data/anime_master_data.csv', index = False)