Data preprocessing

In [30]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [31]:
# Download the NLTK resources this notebook relies on: the Punkt tokenizer
# data, the English stop-word list and the WordNet corpus.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' models, so fetch it too; otherwise nltk.word_tokenize
# raises LookupError on a fresh, up-to-date environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Read the raw anime dataset from the local data folder and preview two rows.
raw_data = pd.read_csv(filepath_or_buffer="data/anime.csv")
raw_data.head(n=2)

Unnamed: 0,mal_id,title,type,score,scored_by,status,episodes,aired_from,aired_to,source,...,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.13,1867867.0,Finished Airing,64.0,2009-04-05,2010-07-04,Manga,...,"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,['Hagane no Renkinjutsushi: Fullmetal Alchemis...
1,11061,Hunter x Hunter (2011),TV,9.05,1505975.0,Finished Airing,148.0,2011-10-02,2014-09-24,Manga,...,"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media'],Hunters devote themselves to accomplishing haz...,,https://cdn.myanimelist.net/images/anime/1337/...,https://myanimelist.net/anime/11061/Hunter_x_H...,https://www.youtube.com/watch?v=D9iTQRB4XRk,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,['HxH (2011)']


In [33]:
# Discard the columns that the rest of the preprocessing does not use, then
# discard every row that lacks a score, a type or an English title.
raw_data = (
    raw_data
    .drop(columns=[
        'pending_approval', 'title_japanese', 'title_synonyms', 'trailer_url',
        'background', 'broadcast_time', 'broadcast_day', 'nsfw',
        'premiered_season', 'premiered_year', 'aired_from', 'aired_to',
    ])
    .dropna(subset=['score', 'type', 'title_english'])
)
raw_data.shape

(14262, 22)

In [34]:
# Count the missing values left in each column.
# NOTE(review): the saved output reports 7136 missing title_english values even
# though the previous cell drops rows with a missing title_english, so the
# stored outputs look stale - re-run the notebook from a fresh kernel.
raw_data.isna().sum()

mal_id              0
title               0
type                0
score               0
scored_by           0
status              0
episodes           91
source           1760
members             0
favorites           0
duration           20
rating             95
genres              0
themes              0
demographics        0
studios             0
producers           0
licensors           0
synopsis          389
main_picture        5
url                 0
title_english    7136
dtype: int64

In [35]:
# Columns holding a stringified list such as "['text', 'text', 'text']".
# They are flattened to the comma-separated form "text,text,text".
list_columns = [
    'genres',
    'themes',
    'producers',
    'licensors',
    'demographics',
    'studios',
]

def process_text_array(text):
    """Flatten a stringified list into a comma-separated string.

    Example: "['Action', 'Drama']" -> "Action,Drama".

    The input is split on commas, so an item that itself contains a comma is
    split into several items, and quote characters inside an item are removed.

    Parameters
    ----------
    text : str
        String representation of a list. Any non-string value (for example
        None or the float NaN of a missing cell) is treated as an empty list.

    Returns
    -------
    str
        The non-empty items joined by ",", with brackets, quotes and
        surrounding whitespace removed; "" when there is nothing to keep.
    """
    # Missing cells are not strings and have no .split(); treat them as empty.
    if not isinstance(text, str):
        return ""
    items = []
    for word in text.split(','):
        # Strip list brackets and quote characters, then trim whitespace.
        word = re.sub(r"[\[\],\'\"]", "", word).strip()
        if word:
            items.append(word)
    return ",".join(items)

# Flatten each list-like column, one column at a time.
for column in list_columns:
    raw_data[column] = raw_data[column].apply(process_text_array)

In [36]:
# Preview the first two rows to check the flattened list-like columns.
raw_data.head(2)

Unnamed: 0,mal_id,title,type,score,scored_by,status,episodes,source,members,favorites,...,genres,themes,demographics,studios,producers,licensors,synopsis,main_picture,url,title_english
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.13,1867867.0,Finished Airing,64.0,Manga,2926579,204314,...,"Action,Adventure,Drama,Fantasy",Military,Shounen,Bones,"Aniplex,Square Enix,Mainichi Broadcasting Syst...","Funimation,Aniplex of America",After a horrific alchemy experiment goes wrong...,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,Fullmetal Alchemist: Brotherhood
1,11061,Hunter x Hunter (2011),TV,9.05,1505975.0,Finished Airing,148.0,Manga,2413361,184766,...,"Action,Adventure,Fantasy",,Shounen,Madhouse,"VAP,Nippon Television Network,Shueisha",VIZ Media,Hunters devote themselves to accomplishing haz...,https://cdn.myanimelist.net/images/anime/1337/...,https://myanimelist.net/anime/11061/Hunter_x_H...,Hunter x Hunter


In [37]:
# Assumption: a missing episode count means no episode has been released yet,
# so it is recorded as 0.
raw_data = raw_data.fillna({'episodes': 0})

In [38]:
# List the distinct values of the 'type' column.
raw_data['type'].unique()

array(['TV', 'Movie', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

In [39]:
# Remove the rows whose type is 'Music'.
# .copy() detaches the filtered frame from the original so that the column
# assignments in the following cells do not raise SettingWithCopyWarning.
raw_data = raw_data[raw_data['type'] != 'Music'].copy()
raw_data.shape

(13158, 22)

In [40]:
# List the distinct values of the 'rating' column (including NaN, if any).
raw_data['rating'].unique()

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'PG - Children', 'G - All Ages', 'R+ - Mild Nudity', 'Rx - Hentai',
       nan], dtype=object)

In [41]:
# Label every missing rating as 'Unknown'.
raw_data = raw_data.fillna({'rating': "Unknown"})

In [42]:
# Report how many rows have no synopsis, then drop them: a row without a
# synopsis cannot contribute to the text description built later.
missing_synopsis_count = raw_data['synopsis'].isna().sum()
print('Empty:', missing_synopsis_count)
raw_data = raw_data.dropna(subset=['synopsis'])

Empty: 361


In [43]:
# Shape after dropping the rows without a synopsis.
raw_data.shape

(12797, 22)

In [45]:
# NOTE(review): this repeats the previous cell, and the execution counter
# jumps from In [43] to In [45] - a cell was presumably deleted in between.
# Confirm nothing depends on it, then remove this duplicate.
raw_data.shape

(12797, 22)

In [46]:
# Shared resources for the text-cleaning helpers below.
# The stop words are stored in a set so that the per-token membership test in
# lemmatize() is a constant-time lookup instead of a scan over a list.
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def cleaning_text(text):
    """Normalise a value to lowercase ASCII words separated by single spaces.

    Hyphens, every other punctuation character and every non-ASCII character
    are replaced by spaces; whitespace runs are then collapsed, the result is
    trimmed and lowercased. Underscores are kept because the regex class \\w
    includes them.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with str(), except that None and
        float NaN (a missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text, or "" when nothing is left or the value is missing.
    """
    # A missing value must not be stringified: str(nan) is the word "nan",
    # which would survive cleaning and slip past the empty-string filter.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Replace hyphens with spaces
    text = re.sub("-", ' ', text)
    # Replace the remaining punctuation (neither word character nor whitespace)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Replace non-ASCII characters, e.g. Japanese characters in titles
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Collapse consecutive whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim both ends and lowercase
    return text.strip().lower()

def lemmatize(text):
    """Clean `text`, remove stop words and lemmatize the remaining tokens.

    The text is first normalised with cleaning_text(), tokenized with
    nltk.word_tokenize, filtered against the module-level `stop` collection
    and lemmatized token by token.

    Parameters
    ----------
    text : object
        Value to process; passed unchanged to cleaning_text().

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ("" if none remain).
    """
    tokens = nltk.word_tokenize(cleaning_text(text))
    # Reuse the module-level `lemmatizer` instead of building a new
    # WordNetLemmatizer on every call (this function runs once per row).
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop)

# Derive the cleaned text columns: the synopsis is cleaned and lemmatized,
# the two title columns are only cleaned.
raw_data['cleaned_synopsis'] = raw_data['synopsis'].apply(lemmatize)
raw_data['cleaned_title'] = raw_data['title'].apply(cleaning_text)
raw_data['cleaned_title_english'] = raw_data['title_english'].apply(cleaning_text)

# Keep only the rows whose three cleaned fields are all non-empty.
# One combined mask replaces three successive filters, and .copy() detaches
# the result so that adding columns later does not raise
# SettingWithCopyWarning.
cleaned_columns = ['cleaned_synopsis', 'cleaned_title', 'cleaned_title_english']
has_all_cleaned_text = raw_data[cleaned_columns].ne("").all(axis=1)
raw_data = raw_data[has_all_cleaned_text].copy()

In [47]:
# Shape after adding the cleaned columns and removing rows with empty cleaned text.
raw_data.shape

(6720, 25)

In [48]:
# Turn the comma-separated genres and themes into space-separated words.
converted_genres = raw_data['genres'].str.replace(',', " ")
converted_themes = raw_data['themes'].str.replace(',', " ")

# description = "<genres> <themes> <cleaned synopsis>", trimmed after each
# concatenation (genres or themes may be empty) and lowercased at the end.
raw_data['description'] = (
    converted_genres
    .str.cat(converted_themes, sep=" ")
    .str.strip()
    .str.cat(raw_data['cleaned_synopsis'], sep=" ")
    .str.strip()
    .str.lower()
)

In [49]:
# Show the description built for the first row as a sanity check.
raw_data['description'].head(1).values[0]

'action adventure drama fantasy military horrific alchemy experiment go wrong elric household brother edward alphonse left catastrophic new reality ignoring alchemical principle banning human transmutation boy attempted bring recently deceased mother back life instead suffered brutal personal loss alphonse body disintegrated edward lost leg sacrificed arm keep alphonse soul physical realm binding hulking suit armor brother rescued neighbor pinako rockbell granddaughter winry known bio mechanical engineering prodigy winry creates prosthetic limb edward utilizing automail tough versatile metal used robot combat armor year training elric brother set quest restore body locating philosopher stone powerful gem allows alchemist defy traditional law equivalent exchange edward becomes infamous alchemist gain nickname fullmetal boy journey embroils growing conspiracy threatens fate world written mal rewrite'

In [50]:
# Count the missing values per column once more before exporting.
raw_data.isna().sum()

mal_id                     0
title                      0
type                       0
score                      0
scored_by                  0
status                     0
episodes                   0
source                   660
members                    0
favorites                  0
duration                   3
rating                     0
genres                     0
themes                     0
demographics               0
studios                    0
producers                  0
licensors                  0
synopsis                   0
main_picture               0
url                        0
title_english              0
cleaned_synopsis           0
cleaned_title              0
cleaned_title_english      0
description                0
dtype: int64

In [51]:
# Final shape of the frame that is written to disk.
raw_data.shape

(6720, 26)

In [52]:
# Write the cleaned dataset next to the raw file, as UTF-8 and without the index column.
raw_data.to_csv(
    './data/cleaned_anime.csv',
    index=False,
    encoding='utf-8',
)