# Preparing Database
--- 

In this notebook, I prepare the data and finalise what is needed to complete the other parts of the project. The themes are extracted here and added to the data. Finally, the important columns are kept and the new dataset is saved.

## Importing modules

In [2]:
# Importing modules
import pandas as pd
import os
import numpy as np
import json

from data_preprocessing import *
from clustering import *

# Read the project settings (including the data directory) from config.json
with open('config.json') as config_file:
    config = json.load(config_file)

## Importing data

In [3]:
# Read the raw anime table from the configured data directory
raw_csv_path = os.path.join(config['Data_path'], 'all_anime_data.csv')
anime_database = pd.read_csv(raw_csv_path)

# Replace missing synopses with empty strings
anime_database['synopsis'] = anime_database['synopsis'].fillna("")
anime_database.head(5)

Unnamed: 0,MAL_ID,Name,Anime rating,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...


## Deriving empath tags

In [4]:
# Derive the empath themes of every synopsis and store them as one
# space-separated string per anime
synopsis_topics = anime_database['synopsis'].apply(topics_to_empath)
theme_tags = synopsis_topics.apply(empath_to_tags)
anime_database['empath_themes'] = theme_tags.apply(' '.join)

In [5]:
# Display the frame to check the newly added `empath_themes` column
anime_database

Unnamed: 0,MAL_ID,Name,Anime rating,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,synopsis,empath_themes
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever...",superhero music fun musical stealing crime art...
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ...",business surprise attractive art appearance mo...
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0...",suffering superhero business stealing crime mo...
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...,business crime prison art order government con...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...,suffering fun strength order traveling governm...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16209,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,...,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,No synopsis information has been added to this...,computer communication internet reading meetin...
16210,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,ko is a typical high school student whose life...,hygiene suffering business surprise crime orde...
16211,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Sequel to Higurashi no Naku Koro ni Gou .,
16212,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,New Yama no Susume anime.,


In [6]:
# Persist the frame, now including the empath themes, as a pickle
empath_pickle_path = os.path.join(config['Data_path'],
                                  'anime_database_empath.pck')
anime_database.to_pickle(empath_pickle_path)

## Clustering the anime content

In [7]:
# Reload the saved frame and build the clustering input from it.
# `tfidf` is provided by one of the star imports at the top of the notebook.
empath_pickle = os.path.join(config['Data_path'], 'anime_database_empath.pck')
anime_frame = pd.read_pickle(empath_pickle)
tf_idf = tfidf(anime_frame)

In [8]:
# Needs `tf_idf` from the previous cell, so run the cells in order on a fresh
# kernel. `dim_reduc` and `cluster` come from the star imports at the top of
# the notebook.
standard_embedding, clusterable_embedding = dim_reduc(tf_idf)
labels = cluster(clusterable_embedding, standard_embedding)

NameError: name 'tf_idf' is not defined

In [None]:
# Attach the cluster label and the first two embedding coordinates to each row
anime_frame['cluster'] = labels
for axis_index, column_name in enumerate(('x_coord', 'y_coord')):
    anime_frame[column_name] = standard_embedding[:, axis_index]

# Save the clustered frame for the plotting cells below
clustered_path = os.path.join(config['Data_path'],
                              'anime_database_clustered.pck')
anime_frame.to_pickle(clustered_path)

In [None]:
import mpld3
from bokeh.models import HoverTool
from bokeh.palettes import brewer, Viridis256
from bokeh.plotting import (ColumnDataSource, figure, output_file,
                            output_notebook, reset_output, show)

# Render Bokeh figures inline in the notebook
output_notebook()

In [None]:
def plot(df, title='UMAP PLOT FOR ANIME CONTENT CLUSTERS'):
    """Build a Bokeh circle plot from a plottable frame.

    Parameters
    ----------
    df
        Data wrapped in a ``ColumnDataSource``. The glyphs and the hover
        tooltip refer to its ``x_val``, ``y_val``, ``color``, ``size``,
        ``label`` and ``text`` columns.
    title : str
        Title displayed above the figure.

    Returns
    -------
    The configured figure; it is not shown here, the caller passes it to
    ``show``.
    """
    source = ColumnDataSource(df)
    fig = figure(title=title,
                 plot_width=800,
                 plot_height=700,
                 tools=('pan, box_zoom, reset'))

    # Hover box showing the row's label followed by its text
    hover = HoverTool(
        tooltips="""<div style = "width: 400px;">(@label)@text</div>""")
    fig.add_tools(hover)

    # One faint circle per row; colour and radius are read from the source
    glyph_options = {
        'source': source,
        'color': 'color',
        'line_alpha': 0.2,
        'fill_alpha': 0.1,
        'radius': 'size',
        'hover_line_color': 'black',
    }
    fig.circle('x_val', 'y_val', **glyph_options)

    # Cosmetics: keep the axes, drop the grid lines and the outline
    fig.title.text_font_size = '12pt'
    fig.xaxis.visible = True
    fig.yaxis.visible = True
    fig.grid.grid_line_color = None
    fig.outline_line_color = None
    return fig

In [None]:
# Reload the clustered frame that was saved above
clustered_pickle = os.path.join(config['Data_path'],
                                'anime_database_clustered.pck')
anime_plot_frame = pd.read_pickle(clustered_pickle)

In [None]:
# Convert the rating column to floats; entries that are not numbers (the
# 'Unknown' placeholder) become NaN. `errors='coerce'` works whether the
# column still holds text or is already numeric, so the cell can be re-run.
anime_plot_frame['Anime rating'] = pd.to_numeric(
    anime_plot_frame['Anime rating'], errors='coerce')

In [None]:
# Preview the first rows only, like the loading cell, instead of dumping the
# whole frame
anime_plot_frame.head(5)

In [None]:
# Assemble the plotting frame for the synopsis-based clusters
log_rating = np.log(anime_plot_frame['Anime rating'] + 1)
hdb_plottable_bios = get_plottable_df(
    anime_plot_frame['Name'],
    log_rating,
    anime_plot_frame['synopsis'],
    anime_plot_frame['x_coord'],
    anime_plot_frame['y_coord'],
    anime_plot_frame['cluster'])

# Direct Bokeh output to an HTML file, then render the figure
output_file('anime_content_cluster.html')
content_figure = plot(hdb_plottable_bios)
show(content_figure)

In [None]:
# `tfidf_empath` comes from the star imports at the top of the notebook;
# presumably it vectorises the `empath_themes` column — TODO confirm.
tf_idf_empath = tfidf_empath(anime_frame)

  return array(a, dtype, copy=False, order=order)


In [None]:
# NOTE: this rebinds `standard_embedding`, `clusterable_embedding` and `labels`
# from the earlier synopsis clustering, so the cells must be run top to bottom.
standard_embedding, clusterable_embedding = dim_reduc(tf_idf_empath)
labels = cluster(clusterable_embedding, standard_embedding)

In [None]:
# Attach the empath-based cluster label and embedding coordinates to each row
anime_frame['empath_cluster'] = labels
for axis_index, column_name in enumerate(('empath x_coord', 'empath y_coord')):
    anime_frame[column_name] = standard_embedding[:, axis_index]

# Written to the same file as the synopsis-based clustering result
clustered_path = os.path.join(config['Data_path'],
                              'anime_database_clustered.pck')
anime_frame.to_pickle(clustered_path)

In [None]:
# Reload the clustered frame, which now also carries the empath columns
clustered_pickle = os.path.join(config['Data_path'],
                                'anime_database_clustered.pck')
anime_plot_frame_empath = pd.read_pickle(clustered_pickle)

In [None]:
# Convert the rating column to floats; entries that are not numbers (the
# 'Unknown' placeholder) become NaN. `errors='coerce'` works whether the
# column still holds text or is already numeric, so the cell can be re-run.
anime_plot_frame_empath['Anime rating'] = pd.to_numeric(
    anime_plot_frame_empath['Anime rating'], errors='coerce')

In [None]:
# Assemble the plotting frame for the empath-theme clusters
log_rating_empath = np.log(anime_plot_frame_empath['Anime rating'] + 1)
hdb_plottable_bios_empath = get_plottable_df(
    anime_plot_frame_empath['Name'],
    log_rating_empath,
    anime_plot_frame_empath['empath_themes'],
    anime_plot_frame_empath['empath x_coord'],
    anime_plot_frame_empath['empath y_coord'],
    anime_plot_frame_empath['empath_cluster'])

# Direct Bokeh output to an HTML file, then render the figure
output_file('anime_content_empath_cluster.html')
empath_figure = plot(hdb_plottable_bios_empath)
show(empath_figure)

In [None]:
# Save the final frame used by the recommendation part of the project
recommendation_path = os.path.join(config['Data_path'],
                                   'anime_recommendation_database.pck')
anime_plot_frame_empath.to_pickle(recommendation_path)