In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import urllib.parse as urlparse


import os
import re
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import shutil

import sys
import warnings
import timeit


## 1. Import Data

There are two datasets to be imported:  
a. Animes Dataset: This contains the metadata of the animes, which we scraped from: https://myanimelist.net/  
b. Reviews Dataset: This contains the ratings given by users for animes and was obtained from Kaggle: https://www.kaggle.com/marlesson/myanimelist-dataset-animes-profiles-reviews  

In [2]:
# Load the anime metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell ends with a truncated warning;
# presumably it is pandas' DtypeWarning for mixed-type columns - confirm on re-run.
df_anime = pd.read_csv('./data/dataset_EDA_Topic_14022021.csv', low_memory=False)
print('There are {} rows and {} columns in the animes dataset'.format(df_anime.shape[0], df_anime.shape[1]))

There are 17489 rows and 91 columns in the animes dataset


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the user reviews and report the size of the table.
df_review = pd.read_csv('./data/reviews.csv')
print('There are {} rows and {} columns in reviews dataset'.format(*df_review.shape))

There are 192112 rows and 7 columns in reviews dataset


**Examine the reviews data**

In [4]:
# preview the first rows of the reviews table to see what each field looks like
df_review.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=291149


In [5]:
# list the column labels of the reviews table to see which fields are available
df_review.columns

Index(['uid', 'profile', 'anime_uid', 'text', 'score', 'scores', 'link'], dtype='object')

In [6]:
# summarise dtype and non-null count per column to spot missing values
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192112 entries, 0 to 192111
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   uid        192112 non-null  int64 
 1   profile    192112 non-null  object
 2   anime_uid  192112 non-null  int64 
 3   text       192112 non-null  object
 4   score      192112 non-null  int64 
 5   scores     192112 non-null  object
 6   link       192112 non-null  object
dtypes: int64(3), object(4)
memory usage: 10.3+ MB


In [7]:
# uid is the review id according to the data owner on kaggle, so it should be unique;
# count the rows whose uid has already appeared earlier in the table
df_review['uid'].duplicated().sum()

61593

In [8]:
# collect the uid of every row flagged as a repeat, then show all rows that
# share one of those uids, sorted so that the copies sit next to each other
dup_ids = df_review.loc[df_review.duplicated(['uid']), 'uid']
is_dup_uid = df_review['uid'].isin(dup_ids)
df_review.loc[is_dup_uid].sort_values('uid')

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
36254,1,Xinil,1,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '8', 'Animation': '...",https://myanimelist.net/reviews.php?id=1
163348,1,Xinil,1,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '8', 'Animation': '...",https://myanimelist.net/reviews.php?id=1
36318,10,Xinil,263,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=10
163484,10,Xinil,263,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=10
185200,12,running_lemon,210,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '6', 'Animation': '8...",https://myanimelist.net/reviews.php?id=12
...,...,...,...,...,...,...,...
53898,325734,AnimeBW,21339,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '0', 'Animation': '0...",https://myanimelist.net/reviews.php?id=325734
43185,325738,fenclair,34798,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '9', 'Animation': '...",https://myanimelist.net/reviews.php?id=325738
176806,325738,fenclair,34798,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '9', 'Animation': '...",https://myanimelist.net/reviews.php?id=325738
46087,325747,Doug_Dugle,849,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '10', 'Animation': '...",https://myanimelist.net/reviews.php?id=325747


In [9]:
# remove duplicated uids since they are the same: keep only the first row seen for each uid
first_occurrence = ~df_review.duplicated(['uid'])
df_review = df_review[first_occurrence]
df_review.shape

(130519, 7)

In [10]:
# check again if there are duplicated ratings.
# uid was de-duplicated in the previous step, so any key that contains uid is
# unique by construction and would always report 0. Identify a rating by the
# reviewer (profile), the anime and the score so the check can detect repeats.
df_review.duplicated(["profile", "anime_uid", "score"]).sum()

0

In [11]:
# one row per profile with the number of reviews it wrote, then summarise
# the distribution of those per-user counts
reviews_count = (
    df_review
    .groupby('profile')
    .size()
    .rename('counts')
    .reset_index()
)
reviews_count['counts'].describe()

count    47885.000000
mean         2.725676
std          8.217880
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        611.000000
Name: counts, dtype: float64

**Examine animes dataset**

In [12]:
# summarise dtype and non-null count per column of the animes table
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17489 entries, 0 to 17488
Data columns (total 91 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                17489 non-null  int64  
 1   Unnamed: 0           17489 non-null  int64  
 2   Unnamed: 0.1         17489 non-null  int64  
 3   Title                17489 non-null  object 
 4   URL                  17489 non-null  object 
 5   English              6982 non-null   object 
 6   Synonyms             10619 non-null  object 
 7   Japanese             17443 non-null  object 
 8   Type                 17489 non-null  object 
 9   Episodes             16998 non-null  float64
 10  Status               17489 non-null  object 
 11  Aired                17489 non-null  object 
 12  Premiered            4966 non-null   object 
 13  Broadcast            4966 non-null   object 
 14  Producers            17489 non-null  object 
 15  Licensors            17489 non-null 

In [13]:
# preview the first rows of the animes table
df_anime.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,Title,URL,English,Synonyms,Japanese,Type,Episodes,...,Studio_3,Studio_4,Studio_5,Studio_6,Anime_Number,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Topic_Category
0,0,0,0,Fullmetal Alchemist: Brotherhood,https://myanimelist.net/anime/5114/Fullmetal_A...,Fullmetal Alchemist: Brotherhood,"Hagane no Renkinjutsushi: Fullmetal Alchemist,...",鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64.0,...,,,,,0,0.0,0.2993,"world, human, power, earth, fight, save, war, ...",in order for something to be obtained somethi...,Anime about war or battles on earth
1,1,1,1,Shingeki no Kyojin: The Final Season,https://myanimelist.net/anime/40028/Shingeki_n...,Attack on Titan Final Season,"Shingeki no Kyojin Season 4, Attack on Titan S...",進撃の巨人 The Final Season,TV,16.0,...,,,,,1,0.0,0.282,"world, human, power, earth, fight, save, war, ...",gabi braun and falco grice have been training ...,Anime about war or battles on earth
2,2,2,2,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,Steins;Gate,,STEINS;GATE,TV,24.0,...,,,,,2,1.0,0.3302,"year, group, team, member, lead, club, face, j...",the self proclaimed mad scientist rintarou oka...,Anime about membership in a club or group
3,3,3,3,Shingeki no Kyojin Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,Attack on Titan Season 3 Part 2,,進撃の巨人 Season3 Part.2,TV,10.0,...,,,,,3,1.0,0.3661,"year, group, team, member, lead, club, face, j...",seeking to restore humanity s diminishing hope...,Anime about membership in a club or group
4,4,4,4,Gintama°,https://myanimelist.net/anime/28977/Gintama°,Gintama Season 4,Gintama' (2015),銀魂°,TV,51.0,...,,,,,4,1.0,0.2368,"year, group, team, member, lead, club, face, j...",gintoki shinpachi and kagura return as the fun...,Anime about membership in a club or group


In [14]:
# list the column labels of the animes table
df_anime.columns

Index(['index', 'Unnamed: 0', 'Unnamed: 0.1', 'Title', 'URL', 'English',
       'Synonyms', 'Japanese', 'Type', 'Episodes', 'Status', 'Aired',
       'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source',
       'Genres', 'Duration', 'Rating', 'Score', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Started', 'Ended', 'Voters', 'Adaptation',
       'Alternative version', 'Side story', 'Spin-off', 'Synopsis', 'Prequel',
       'Alternative setting', 'Sequel', 'Other', 'Summary', 'Character',
       'Parent story', 'Full story', 'Genre_0', 'Genre_1', 'Genre_2',
       'Genre_3', 'Genre_4', 'Genre_5', 'Genre_6', 'Genre_7', 'Genre_8',
       'Genre_9', 'Genre_10', 'Genre_11', 'Genre_12', 'Air_Start', 'Air_End',
       'Duration_Aired', 'Producer_0', 'Producer_1', 'Producer_2',
       'Producer_3', 'Producer_4', 'Producer_5', 'Producer_6', 'Producer_7',
       'Producer_8', 'Producer_9', 'Producer_10', 'Producer_11', 'Producer_12',
       'Producer_13', 'Producer_1

In [15]:
# URL is supposed to be the unique key of an anime;
# count the rows whose URL has already appeared earlier in the table
df_anime['URL'].duplicated().sum()

11

In [16]:
# collect the URL of every row flagged as a repeat, then show all rows that
# share one of those URLs, sorted so that the copies sit next to each other
dup_urls = df_anime.loc[df_anime.duplicated(['URL']), 'URL']
is_dup_url = df_anime['URL'].isin(dup_urls)
df_anime.loc[is_dup_url].sort_values('URL')

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,Title,URL,English,Synonyms,Japanese,Type,Episodes,...,Studio_3,Studio_4,Studio_5,Studio_6,Anime_Number,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Topic_Category
10299,10299,10299,49,Waza no Tabibito,https://myanimelist.net/anime/10904/Waza_no_Ta...,,,技の旅人,Movie,1.0,...,,,,,10299,0.0,0.3471,"world, human, power, earth, fight, save, war, ...",humans acquired a perfect source of energy cal...,Anime about war or battles on earth
10300,10300,10300,0,Waza no Tabibito,https://myanimelist.net/anime/10904/Waza_no_Ta...,,,技の旅人,Movie,1.0,...,,,,,10300,0.0,0.3481,"world, human, power, earth, fight, save, war, ...",humans acquired a perfect source of energy cal...,Anime about war or battles on earth
5999,5999,5999,49,"Dakara Boku wa, H ga Dekinai. Recap",https://myanimelist.net/anime/15325/Dakara_Bok...,"So, I Can't Play H!","Dakara Boku wa, H ga Dekinai Episode 7.5, Daka...",だから僕は、Hができない。,Special,1.0,...,,,,,5999,3.0,0.2308,"short, song, special, base, feature, film, inc...",recap episode aired between episodes and,"Anime featuring songs, films or videos"
6000,6000,6000,0,"Dakara Boku wa, H ga Dekinai. Recap",https://myanimelist.net/anime/15325/Dakara_Bok...,"So, I Can't Play H!","Dakara Boku wa, H ga Dekinai Episode 7.5, Daka...",だから僕は、Hができない。,Special,1.0,...,,,,,6000,3.0,0.2308,"short, song, special, base, feature, film, inc...",recap episode aired between episodes and,"Anime featuring songs, films or videos"
12050,12050,12050,0,Soba no Hana Saita Hi,https://myanimelist.net/anime/26321/Soba_no_Ha...,,,そばの花さいた日,OVA,1.0,...,,,,,12050,0.0,0.2,"world, human, power, earth, fight, save, war, ...",an educational film about environmentalism,Anime about war or battles on earth
12049,12049,12049,49,Soba no Hana Saita Hi,https://myanimelist.net/anime/26321/Soba_no_Ha...,,,そばの花さいた日,OVA,1.0,...,,,,,12049,0.0,0.2,"world, human, power, earth, fight, save, war, ...",an educational film about environmentalism,Anime about war or battles on earth
11550,11550,11550,0,Pororo Hanguk-e Wass-eoyo,https://myanimelist.net/anime/27465/Pororo_Han...,Pororo Goes to Korea,Pororo Hangug-e Wass-eoyo,뽀로로가 한국에 왔어요,ONA,1.0,...,,,,,11550,3.0,0.3432,"short, song, special, base, feature, film, inc...",korean culture and information service ministr...,"Anime featuring songs, films or videos"
11549,11549,11549,49,Pororo Hanguk-e Wass-eoyo,https://myanimelist.net/anime/27465/Pororo_Han...,Pororo Goes to Korea,Pororo Hangug-e Wass-eoyo,뽀로로가 한국에 왔어요,ONA,1.0,...,,,,,11549,3.0,0.346,"short, song, special, base, feature, film, inc...",korean culture and information service ministr...,"Anime featuring songs, films or videos"
10649,10649,10649,49,Aoki Seimei Hoken,https://myanimelist.net/anime/31513/Aoki_Seime...,,,青木生命保険CM,Special,1.0,...,,,,,10649,3.0,0.2321,"short, song, special, base, feature, film, inc...",tv commercial of aoki life insurance company,"Anime featuring songs, films or videos"
10650,10650,10650,0,Aoki Seimei Hoken,https://myanimelist.net/anime/31513/Aoki_Seime...,,,青木生命保険CM,Special,1.0,...,,,,,10650,3.0,0.2321,"short, song, special, base, feature, film, inc...",tv commercial of aoki life insurance company,"Anime featuring songs, films or videos"


In [17]:
# remove duplicated records by URLs since they are the same: keep only the first row seen for each URL
first_url_occurrence = ~df_anime.duplicated(['URL'])
df_anime = df_anime[first_url_occurrence]
df_anime.shape

(17478, 91)

In [18]:
# tabulate how many animes fall under each value of the Type column
df_anime['Type'].value_counts()

TV         4966
OVA        3889
Movie      3034
Special    2212
ONA        1884
Music      1462
Unknown      31
Name: Type, dtype: int64

In [19]:
# check number of animes with missing synopsis
df_anime['Synopsis'].isna().sum()

877

**Observations:**
1. In the reviews dataset, the animes are identified by anime_uid. In the animes dataset, the anime_uid is embedded within the URL: /anime/{anime_uid}/. As such, to combine the two datasets, we need to apply a regex to extract the anime_uid from the URL in the animes dataset.  
2. Observe that there are also many different types. We will restrict our scope to Type == TV and discard the rest.

## 2. Identify Animes within Scope for Analysis 

Scope of Analysis limited to:  
1. Animes where Type = 'TV'  
2. Animes with a valid synopsis. One of the collaborative filtering algorithms uses the synopsis to derive similarity scores, so to ensure a valid comparison across the different models, the same dataset must be used for all of them. 

In [20]:
# keep only the rows whose Type is TV
is_tv = df_anime['Type'].eq('TV')
df_anime = df_anime.loc[is_tv]
df_anime.shape

(4966, 91)

In [21]:
# subset animes with valid synopsis: drop missing values and any synopsis that
# is empty or made only of whitespace. Comparing against the single string ' '
# alone would let values such as '  ' or '\n' through as "valid".
synopsis = df_anime['Synopsis']
has_synopsis = synopsis.notna() & synopsis.astype(str).str.strip().ne('')
df_anime = df_anime[has_synopsis]
df_anime.shape

(4808, 91)

## 3. Extract anime_uid for animes dataset

In [22]:
# Retrieve the digits after "anime/" in the URL; these digits are the anime uid.
# - the pattern is a raw string so that \d reaches the regex engine untouched
#   (in a plain string literal it is an invalid escape sequence)
# - expand=False returns a Series for the single capture group
# - astype(int) raises if any URL does not match, so bad rows surface here
# - assign() builds a new frame, so the column is not written onto a filtered
#   slice of the original table (which triggers SettingWithCopyWarning)
df_anime = df_anime.assign(
    anime_uid=df_anime['URL'].str.extract(r'anime/(\d+)/', expand=False).astype(int)
)

## 4. Subset Rating Records for Animes within Scope

In [23]:
# keep only the reviews whose anime_uid belongs to an anime still in scope
in_scope = df_review['anime_uid'].isin(df_anime['anime_uid'])
df_review = df_review.loc[in_scope]
df_review.shape

(101481, 7)

In [24]:
# further scale down to users who have more than one review:
# any row after a profile's first occurrence marks that profile as a repeat reviewer
repeat_rows = df_review.duplicated(['profile'])
user_multiple_reviews = df_review.loc[repeat_rows, 'profile']
df_review = df_review[df_review['profile'].isin(user_multiple_reviews)]
df_review.shape

(75921, 7)

In [25]:
# number of distinct profiles left after the filtering above
df_review['profile'].nunique()

15363

## 5. Output datasets to CSV

In [26]:
# to_csv does not create missing folders, so make sure the output directory
# exists first; exist_ok keeps the cell safe to re-run
os.makedirs('./processed_data', exist_ok=True)

# index=False: the row labels are not written out as a column
df_review.to_csv('./processed_data/processed_reviews.csv', index=False)
df_anime.to_csv('./processed_data/processed_anime.csv', index=False)

## 6. Extract images

Now that we have scoped our reviews and anime datasets, we will discard the scraped images that belong to animes assessed to be out of scope.

In [27]:
# base data folder, and the images folder that sits directly beneath it
csv_dir = 'data/'
images_dir = csv_dir + 'images/'

In [28]:
# index labels of the animes that are still in scope, as a plain Python list
animes_idx_retained = df_anime.index.to_list()
len(animes_idx_retained)

4808

In [29]:
# sanity check: peek at the first ten retained index labels
animes_idx_retained[:10]

[0, 1, 2, 3, 4, 5, 6, 8, 9, 11]

In [30]:
# create a directory to store the images within scope
try:
    os.mkdir('./processed_data/images/')
except OSError:
    pass

In [31]:
# Get file names from original images directory.
# os.listdir returns the entries in arbitrary order, so the first element shown
# below is only a sample of the naming pattern, not necessarily the lowest index.
file_names = os.listdir(images_dir)
file_names[0]

'index_0.jpg'

In [34]:
# identify images to retain and copy them to the new folder

# set lookup: membership is O(1) per file instead of scanning the whole list
retained_idx_lookup = set(animes_idx_retained)
dst_dir = './processed_data/images/'

# index of every anime for which an image was found and copied
animes_retained_img = []

for file_name in file_names:
    # the first run of digits in the file name is the anime index
    idx_match = re.search(r'\d+', file_name)
    if idx_match is None:
        # entries without any digits cannot be mapped to an anime; skip them
        # rather than crashing with an IndexError
        continue

    anime_idx = int(idx_match.group())
    if anime_idx in retained_idx_lookup:
        shutil.copy(os.path.join(images_dir, file_name), dst_dir)
        animes_retained_img.append(anime_idx)

In [35]:
# check length of animes_retained_img, i.e. how many in-scope images were copied
print(len(animes_retained_img))

4801


In [36]:
# animes that are in scope but for which no image file was copied
retained_idx_set = set(animes_idx_retained)
copied_idx_set = set(animes_retained_img)
animes_no_img = list(retained_idx_set - copied_idx_set)
print(len(animes_no_img))
print(animes_no_img)

7
[14211, 12041, 16074, 15788, 12751, 15823, 11580]
