In [1]:
# Standard IPython notebook imports
%matplotlib inline

import os
import json

import csv
import networkx as nx
import pyarrow
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import ast
from itertools import chain
import matplotlib.pyplot as plt
import subprocess
import sklearn.metrics
from typing import Optional
import shutil
pd.options.display.max_colwidth = 300

In [2]:
two_mode_data="dataset/anime-dataset-2023.csv"
users_csv="dataset/user-filtered.csv"

In [3]:
%%time
# importing data from csv to pandas

class Data_factory:
    def from_csv(file: str, sep=",") -> pd.DataFrame:
        return pd.read_csv(file, sep=sep)

CPU times: user 35 µs, sys: 14 µs, total: 49 µs
Wall time: 54.8 µs


In [7]:
users_df = Data_factory.from_csv(users_csv)

In [8]:
users_df.head() # is user data loaded?
len(users_df)

109224747

In [4]:
class AnimeRecomendation:
    def __init__(self, dimensions = 32, iter = 16):
        self.users_df     = pd.DataFrame()
        self.users_count  = None
        self.dimensions   = dimensions
        self.iterations   = iter
        self.columns      = ["user", "anime"]
        self.tsv_filename = None
        
    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        grouped_df        = self.grouped_df
        self.tsv_filename = tsv_filename
        columns_to_keep   = ['user_id', 'id_rating']
        grouped_df.to_csv(tsv_filename, index=False, sep='\t', columns=columns_to_keep, mode='w', header=False)

    def fit(self, users_df, lines: Optional[int] = None, rating_threshold: int = 6):
        if lines is not None:
            self.users_df = users_df.head(int(lines))
            users_df = self.users_df
        else:
            self.users_df = users_df
            
        self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))
    
        def agg_fun(anime, rating):
            return " ".join([str(anime) for _ in range(rating - rating_threshold + 1)])

        self.grouped_df = users_df[users_df.rating >= rating_threshold]                          \
            .groupby("user_id")["id_rating"]                                                     \
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime,rating) in animes])) \
            .reset_index()

        return self.number_of_users()
    
    def choose(self, nousers: int):
        self.grouped_df = self.grouped_df.sample(n=nousers)
    
    def cleora_train(self, cleora_exe="cleora"):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError("cleora executable not found")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns={self.columns[0]} complex::reflexive::{self.columns[1]}",
                   "--dimension", str(self.dimensions),
                   "--number-of-iterations", str(self.iterations),
                   "--prepend-field-name", "1",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True, stderr=subprocess.DEVNULL)


In [5]:
model = AnimeRecomendation(iter=1)

In [None]:
%%time

LINES = 4000000
RATING_THRESHOLD = 6

model.fit(users_df, rating_threshold=RATING_THRESHOLD)

In [6]:
%%time

model.grouped_df =  pd.read_csv("data3.tsv", sep="\t", header=None, names=["user_id", "id_rating"])
model.grouped_df

CPU times: user 10.4 s, sys: 1.05 s, total: 11.5 s
Wall time: 11.5 s


Unnamed: 0,user_id,id_rating
0,0,67 67 67 67 6702 6702 242 242 242 242 242 21 21 21 21 21 24 24 24 24 4722 4722 4722 6098 3125 3125 3125 3125 481 481 481 481 481 68 1689 2913 1250 1250 356 356 356 356 121 121 121 121 430 430 430 430 1829 1829 1571 1571 1571 1571 1571 578 578 578 578 578 431 431 431 2762 2762 2762 2762 570 570 3...
1,1,37403 37403 37403 7674 7674 34566 34566 40852 40852 40852 40852 10087 10087 40052 40052 40052 40748 40748 40748 40748 21 21 21 21 26243 26243 42203 42203 42203 42203 40028 40028 40028 40028 40028 3972 3972 481 481 481 22199 22199 6547 9919 9919 9919 5081 5081 5081 31043 31043 31043 31043 31964 3...
2,2,235 235 235 235 235 5042 5042 5042 7593 7593 7593 21 21 21 21 22 22 22 22 5762 5762 31580 31580 35028 35028 35028 368 368 368 31964 31964 31964 33486 33486 33486 33486 31740 31740 1575 1575 1575 1575 2904 2904 2904 2904 2904 1535 1535 1535 1535 1535 28223 28223 28223 226 226 226 38671 38671 3867...
3,3,6114 6114 6114 199 199 199 199 849 849 849 33352 33352 33352 33352 33352 31646 31646 31646 31646 32998 32998 32998 25397 12291 12291 1292 1292 1292 34881 34881 34881 32323 32323 22199 22199 4744 4744 47 47 47 47 22729 22729 32828 32828 32828 22147 22147 22147 6547 6547 6547 9989 9989 9989 11433 ...
4,4,101 101 101 656 656 656 3549 3549 3549 3359 3359 104 104 104 530 530 68 68 150 150 150 150 819 819 819 1222 1222 1222 105 105 105 1542 1542 1542 1689 1689 1689 1689 1689 232 232 232 232 371 371 372 372 372 372 4975 831 831 59 59 1575 1575 1575 1575 1575 2904 2904 2904 2904 2904 61 61 1535 1535 1...
...,...,...
313197,353400,34566 34566 34566 34566 37982 37982 37982 37982 38691 38691 38691 38691 40852 40852 40852 40852 39026 39026 38671 38671 38671 38671 918 918 918 918 41694 41694 41694 37991 37991 37991 37991 40748 40748 40748 40748 37999 37999 37999 38000 38000 38000 205 205 205 205 2001 2001 2001 2001 41353 4135...
313198,353401,47 47 47 47 47 2356 11111 11111 11111 66 66 32379 32379 889 889 889 31043 31043 31043 31043 31043 14345 14345 14345 14741 14741 14741 14741 1575 1575 1575 1575 1575 2904 2904 2904 2904 2904 26349 26349 6880 6880 10418 10418 1535 1535 1535 1535 1535 35120 35120 35120 2354 2354 2355 226 226 226 22...
313199,353402,857 857 269 269 269 269 269 1482 1482 1482 1482 1482 476 476 953 2034 2034 1699 1699 1699 27 27 27 27 479 479 479 479 298 298 68 68 1686 1686 1686 762 834 834 232 232 371 371 372 372 317 317 317 317 317 120 120 121 121 430 430 430 430 1579 1579 1579 1579 1579 853 853 853 853 853 417 417 417 417 ...
313200,353403,11759 11759 11759 19429 19429 22729 22729 22729 6547 6547 6547 6547 6547 10067 10067 10067 28249 28249 28249 5081 5081 5081 5081 20787 20787 31043 31043 31043 31043 31964 31964 31964 31964 33486 33486 33486 33486 36456 36456 36456 36456 1689 1689 1689 1689 1689 15687 15687 16592 16592 16592 1211...


In [7]:
model.choose(10000)
model.grouped_df

Unnamed: 0,user_id,id_rating
103673,117058,11061 11061 11061 11061 11061 39534 39534 39534 38992 38992 38992 6547 6547 6547 9989 9989 38753 38753 38753 36873 36873 39195 39195 39195 31043 31043 31043 31964 31964 31964 33486 33486 33486 33486 36456 36456 38408 38408 36896 36896 36896 36896 36896 39565 39565 39565 33354 33354 37435 37435 3...
85440,96507,22199 22199 9989 9989 9989 9989 9989 11111 11111 11111 24833 24833 24833 9919 9919 9919 11737 28805 28805 28805 28805 28805 7674 7674 7674 7674 7674 10030 10030 10030 10030 10030 12365 12365 12365 12365 12365 32664 32664 31043 31043 31043 31043 31043 31964 31964 31964 31964 33486 33486 33486 334...
135117,152552,10800 10800 10800 10800 14397 14397 14397 16035 16035 16035 1735 1735 1735 1735 1735 16498 16498 16498 12711 12711 14813 14813 14813 16241 16241 3269 3269 873 873 48 48 48 5525 5525 5525 5525 6682 6682 49 49 50 50 304 304 92 92 719 5784 5784 5784 53 53 53 3266 3266 101 101 101 101 857 857 656 65...
70840,79959,38084 38084 38084 38084 14513 14513 40028 40028 40028 40028 40028 37521 37521 37521 5081 5081 5081 5081 5081 7674 7674 7674 31043 31043 31043 31043 31478 31478 32867 32867 32867 34944 34944 2167 2167 2167 4181 4181 4181 4181 4181 6351 6351 6351 6351 6351 4059 4059 4059 4059 4059 34437 34437 3443...
108367,122311,31646 31646 31646 31646 38735 33337 33337 33337 6547 6547 9989 9989 9989 21995 21995 21995 7193 7193 36649 36649 39195 39195 39195 1690 1690 1690 31478 31478 31478 32867 38003 38003 38003 1575 8142 8142 10029 10029 10029 14353 14353 1535 1535 28223 28223 28223 28223 40056 2759 2759 227 227 227 1...
...,...,...
246195,277727,199 199 199 199 199
150742,170096,1735 1735 1735 1735 1575 2904 2904 2904 1535 1535 1535 1535 1535 2994 2994 356 356 356 72 72 72 121 121 121 5114 5114 5114 5114 5114 9760 9760 9760 9760 9760 431 431 431 431 431 468 468 468 468 43 43 43 43 43 522 522 164 164 20 20 20 20 12355 12355 5530 5530 199 199 199 199 7311 7311 7311 523 52...
94259,106465,966 966 966 23499 23499 23499 23499 17727 17727 17727 17727 17875 9776 9776 9776 9776 11339 11339 11339 10737 10737 10737 102 47 47 16201 16201 16201 16201 19429 1177 1177 2946 6721 9989 9989 9989 9989 11111 11111 11111 9919 9919 1961 1961 1961 1961 1961 13029 13029 13029 9736 9736 4999 4999 499...
297244,335302,6166 66 66 66 66 659 1593 1593 2251 2251 2251 2251 1589 1589 1589 7059 357 1575 1575 1575 2904 2904 2904 2904 1 1 1 1 5 5 5 4037 4037 6746 6746 6746 6746 2759 2759 2759 3784 3784 3784 3784 3784 227 227 227 227 5060 5060 5630 5630 5630 731 731 731 731 9675 237 237 8792 6946 6946 6946 30 30 30 32 ...


In [8]:
%%time
# constructing the tsv data

tsv_file_name = "data2.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 753 ms, sys: 71.8 ms, total: 825 ms
Wall time: 864 ms


In [None]:
%%time

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train()

In [8]:
class RatingGenerator:
    def __init__(self):
        self.users_count  = None
        self.columns      = ["user", "anime"]
        self.rankings     = dict()

    def anime_label(idx: int) -> str:
        return f"anime__{idx}"

    def get_artifacts(self, name: str):
        d = dict()
        p = "results/emb__"
        files  = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]
    
        return { f:f"{p}{name}__{name}{suf[idx]}" 
                    for idx, f in enumerate(files) }

    def load_artifacts(self):
        artifacts = self.get_artifacts(self.columns[1])
        with open(artifacts['labels'], "r") as entities:
            self.labels     = json.load(entities)
        # Load results to numpy
        self.vects_iter     = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = self.labels.index(f"anime__{idx}")

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1), self.vects_iter, dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = ranking[:15]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[f"anime__{idx}"]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):
        
        self.load_artifacts()
        custom_ranking = dict()
        
        for idx in tqdm(already_watched):
            if f"anime__{idx}" not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(), reverse=True, key=lambda x:x[1]))

In [20]:
anime_df = Data_factory.from_csv(two_mode_data)

In [21]:
# Ranking


ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242])
print(ranking)
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

{15: 2, 3908: 2, 0: 1, 4922: 1, 3103: 1, 5176: 1, 1686: 1, 2937: 1, 659: 1, 9629: 1, 2899: 1, 960: 1, 652: 1, 2279: 1, 3037: 1, 1: 1, 106: 1, 4698: 1, 990: 1, 1201: 1, 4699: 1, 5600: 1, 53: 1, 4063: 1, 10364: 1, 44: 1, 1076: 1, 161: 1, 2849: 1, 3223: 1, 2: 1, 1787: 1, 1039: 1, 3200: 1, 3582: 1, 2847: 1, 1037: 1, 635: 1, 3105: 1, 3824: 1, 2480: 1, 3896: 1, 765: 1}
Eyeshield 21: 2
Injuu Kyoushi: 2
Bleach Movie 1: Memories of Nobody: 1
Bishoujo Senshi Sailor Moon R: Make Up! Sailor Senshi: 1
Azumanga Daiou: Gekijou Tanpen: 1
Cyborg 009: The Reopening: 1
Higurashi no Naku Koro ni Special: Nekogoroshi-hen: 1
Shin Chou Kyou Ryo: Condor Hero II: 1
Galaxy Angel Z: 1
Nagagutsu wo Haita Neko: 1
GS Mikami: Gokuraku Daisakusen!!: 1
Cowboy Bebop: 1
Hana yori Dango: 1
Jokei Kazoku: Inbou: 1
Kamisama Kazoku: 1
Eisai Kyoiku: 1
Little El Cid no Bouken: 1
Ai Yori Aoshi: 1
Sekirei: 1
Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen: 1
Wizardry: 1
Peace Maker Kurogane: 1
Kiddy Grade: Truth Dawn: 1
Vi

In [25]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_by_name(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

find_by_name("attack")

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
10,20,Naruto,Naruto,ナルト,7.99,"Action, Adventure, Fantasy","Moments prior to Naruto Uzumaki's birth, a huge demon known as the Kyuubi, the Nine-Tailed Fox, attacked Konohagakure, the Hidden Leaf Village, and wreaked havoc. In order to put an end to the Kyuubi's rampage, the leader of the village, the Fourth Hokage, sacrificed his life and sealed the mons...",TV,220.0,"Oct 3, 2002 to Feb 8, 2007",...,Pierrot,Manga,23 min per ep,PG-13 - Teens 13 or older,599.0,8,76343,1883772.0,2717330,https://cdn.myanimelist.net/images/anime/13/17405.jpg
20,30,Neon Genesis Evangelion,Neon Genesis Evangelion,新世紀エヴァンゲリオン,8.35,"Action, Avant Garde, Award Winning, Drama, Sci-Fi, Suspense","Fifteen years after a cataclysmic event known as the Second Impact, the world faces a new threat: monstrous celestial beings called ""Angels"" invade Tokyo-3 one by one. Mankind is unable to defend themselves against the Angels despite utilizing their most advanced munitions and military tactics. ...",TV,26.0,"Oct 4, 1995 to Mar 27, 1996",...,"Gainax, Tatsunoko Production",Original,24 min per ep,PG-13 - Teens 13 or older,204.0,45,100638,1024927.0,1718019,https://cdn.myanimelist.net/images/anime/1314/108941.jpg
37,56,Avenger,UNKNOWN,アヴェンジャー,5.86,"Adventure, Fantasy, Sci-Fi","Mars has been colonized and is a world where children have been replaced by robot servants known as ""dolls."" Layla is a skilled fighter with a tragic past who travels about the world. Her companions are Nei, a strange and unique doll with some unknown ties to Layla, and Speedy, who is a doll bre...",TV,13.0,"Oct 2, 2003 to Dec 25, 2003",...,Bee Train,Original,24 min per ep,R - 17+ (violence & profanity),9454.0,4856,20,6788.0,17396,https://cdn.myanimelist.net/images/anime/13/19921.jpg
57,77,Mahou Shoujo Lyrical Nanoha A's,Magical Girl Lyrical Nanoha A's,魔法少女リリカルなのは エース,7.97,"Action, Comedy, Drama","After solving the incident of the scattered Jewel Seeds, Nanoha Takamachi happily returns to her everyday life, though now with added magic practice in the morning. Exchanging video messages with Fate Testarossa and the crew of the Arthra, Nanoha eagerly awaits the chance to speak with them in p...",TV,13.0,"Oct 2, 2005 to Dec 25, 2005",...,Seven Arcs,Original,25 min per ep,PG-13 - Teens 13 or older,622.0,2555,993,33024.0,63640,https://cdn.myanimelist.net/images/anime/4/6767.jpg
59,80,Kidou Senshi Gundam,Mobile Suit Gundam,機動戦士ガンダム,7.76,"Action, Drama, Sci-Fi","It is year 0079 of the Universal Century. Mankind has moved to space, living in colony clusters known as ""Sides."" One of these Sides declares itself the ""Principality of Zeon"" and declares war on the Earth Federation, the governmental body currently ruling Earth. Using powerful humanoid robots k...",TV,43.0,"Apr 7, 1979 to Jan 26, 1980",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,1005.0,1632,2140,56938.0,126804,https://cdn.myanimelist.net/images/anime/5/88524.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23861,54133,"Ijiranaide, Nagatoro-san 2nd Attack Mini Anime","Don't Toy with Me, Miss Nagatoro 2nd Attack Mini Anime",イジらないで、長瀞さん 2nd Attack ミニアニメ,5.81,Comedy,"The mini-anime for Ijiranaide, Nagatoro-san 2nd Attack released on Twitter and YouTube.",ONA,12.0,"Jan 8, 2023 to Mar 26, 2023",...,AQUA ARIS,Manga,1 min per ep,PG-13 - Teens 13 or older,9655.0,10896,5,447.0,1861,https://cdn.myanimelist.net/images/anime/1383/132539.jpg
24112,54560,25-pun de Oitsukeru: Shingeki no Kyojin,Attack on Titan in 25 Minutes,25分で追いつける！アニメ『進撃の巨人』,UNKNOWN,"Action, Drama, Fantasy, Mystery",25 minutes to catch up with the contents of Season 1 to The Final Season Part 2.\n\n(Source: Official YouTube Channel),ONA,1.0,"Feb 18, 2023",...,UNKNOWN,Manga,25 min,R - 17+ (violence & profanity),0.0,0,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1773/133539.jpg
24290,54881,Ta Bu Dang Nuzhu Hen Duo Nian,Since I Wasn't the Heroine,她不当女主很多年,UNKNOWN,"Action, Adventure","Beiyi used a strategy to attack Baicheng, an important town in the northern border of Daqi Kingdom. Tang Yao, the guard of Baicheng, led his people to resist desperately, but they were outnumbered, and all sacrificed. Only the second daughter, Tang Ying, survived.\n\nWhat makes Tang Ying even mo...",ONA,16.0,"Apr 22, 2023 to ?",...,Liyu Culture,Novel,Unknown,UNKNOWN,12727.0,14693,29,UNKNOWN,532,https://cdn.myanimelist.net/images/anime/1007/134843.jpg
24677,55453,Naruto (2023),Naruto,ナルト,UNKNOWN,"Action, Adventure, Comedy, Fantasy","Moments prior to Naruto Uzumaki's birth, a huge demon known as the Kyuubi, the Nine-Tailed Fox, attacked Konohagakure, the Hidden Leaf Village, and wreaked havoc. In order to put an end to the Kyuubi's rampage, the leader of the village, the Fourth Hokage, sacrificed his life and sealed the mons...",TV,UNKNOWN,Sep 2023 to ?,...,UNKNOWN,Manga,Unknown,PG-13 - Teens 13 or older,0.0,0,0,UNKNOWN,0,https://cdn.myanimelist.net/images/anime/1587/136098.jpg


In [3]:
def df_from_array(array):
    return pd.DataFrame(data=array[1:,1:], index=array[1:,0], columns=array[0,1:])

def read_data(file):
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [ np.array([i] + list(row)) for i, row in enumerate(reader, start=-1)]

def get_genre_list(genres: str):
  return list(map(lambda x: x.strip().lower(), genres.split(",")))

def filter_out(word, col):
  return lambda row: not (word.lower() in get_genre_list(row[col]))

def filter_in(word, col):
  return lambda row: (word.lower() in get_genre_list(row[col]))

def create_two_mode_df(data):
    sub_df = data[["anime_id", "English name", "Genres"]]
    result = [['id', 'source', 'target']]
    i=0
    for _, row in sub_df.iterrows():
        for genre in get_genre_list(row['Genres']):
            to_append = [i, row['English name'], genre]
            result.append(to_append)
            i+=1
    return df_from_array(np.array(result))

def create_weighted_matrix(two_mode_df):
    assert(two_mode_df.columns.tolist() == ['source', 'target'])
    source_uniq = two_mode_df['source'].unique()
    source_len = source_uniq.shape[0]
    source_dict = {k: v for v, k in enumerate(source_uniq)}
    array = np.zeros((source_len, source_len), dtype=int)

    for _, source1 in tqdm(two_mode_df.iterrows(), total=len(two_mode_df.index)):
        for _, source2 in two_mode_df.iterrows():
            if source1['target'] == source2['target']:
                array[source_dict[source1['source']], source_dict[source2['source']]] += 1
    return pd.DataFrame(data=array, columns=source_uniq, index=source_uniq)

def create_weighted_matrix2(two_mode_df):
    adj_df = pd.crosstab(two_mode_df.source, two_mode_df.target)
    adj_np = adj_df.to_numpy()
    adj_np2 = adj_np@adj_np.T
    return (adj_np2, adj_df.index)


def create_weighted_df(matrix_df):
    result=[['', 'source', 'target', 'weight']]
    c = 0
    for rowIndex, row in tqdm(matrix_df.iterrows(), total=len(matrix_df.index)): #iterate over rows
        for colIndex, value in row.items():
            weight = matrix_df.at[rowIndex, colIndex]
            if weight > 0:
                to_append = [str(c), rowIndex, colIndex, weight]
                result.append(to_append)
                c += 1
    return df_from_array(np.array(result))


In [4]:
raw_data = read_data(two_mode_data)
# raw_data = raw_data[:1000]

In [5]:
all_data = df_from_array(np.array(raw_data))

In [6]:
all_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
all_data = all_data[(all_data['Score'] != "UNKNOWN") & (all_data['Score'] != '')].astype({'Score': 'float'})
# all_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])
# all_data = all_data[all_data['Score'] > 7.0]
important_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])

In [8]:
movies = important_data[important_data['Type'] == "Movie"]
TVs = important_data[important_data['Type'] == "TV"]
specials = important_data[important_data['Type'] == "Special"]

specified = important_data

not_hentai = specified[specified.apply(filter_out("Hentai", 'Genres'), axis=1)]
hentai = specified[specified.apply(filter_in("Hentai", 'Genres'), axis=1)]

mappa = not_hentai[not_hentai.apply(filter_in("Mappa", "Studios"), axis=1)]

In [9]:
# not_hentai.sort_values(['Score'], ascending=False)
print(len(hentai))
print(len(not_hentai))

1465
14227


In [10]:
data = all_data[["anime_id", "English name", "Genres", "Score"]]
data = data[(data['Score'] != "UNKNOWN") & (data['Score'] != '')].astype({'Score': 'float'})

In [11]:
data.dtypes

mapped = map(get_genre_list, data["Genres"].to_numpy().flatten().tolist())

uniq = np.unique(list(chain.from_iterable(mapped))).tolist()

print(len(uniq), uniq)

22 ['action', 'adventure', 'avant garde', 'award winning', 'boys love', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girls love', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'slice of life', 'sports', 'supernatural', 'suspense', 'unknown']


In [12]:
two_mode_df = create_two_mode_df(data)

two_mode_df

Unnamed: 0,source,target
0,Cowboy Bebop,action
1,Cowboy Bebop,award winning
2,Cowboy Bebop,sci-fi
3,Cowboy Bebop: The Movie,action
4,Cowboy Bebop: The Movie,sci-fi
...,...,...
30955,One Piece: Recapping Fierce Fights! The Counte...,action
30956,One Piece: Recapping Fierce Fights! The Counte...,adventure
30957,One Piece: Recapping Fierce Fights! The Counte...,comedy
30958,One Piece: Recapping Fierce Fights! The Counte...,fantasy


In [13]:
(weighted_matrix_np, weighted_matrix_index) = create_weighted_matrix2(two_mode_df)

In [14]:
weighted_matrix_df2 = pd.DataFrame(data=weighted_matrix_np, index=weighted_matrix_index, columns=weighted_matrix_index)


In [15]:
weighted_matrix_df2

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [16]:
weighted_matrix_df_pickle_file = "weighted_matrix_df.df"
if os.path.exists(weighted_matrix_df_pickle_file):
    weighted_matrix_df = pd.read_pickle(weighted_matrix_df_pickle_file)
else:
    weighted_matrix_df = weighted_matrix_df2
    weighted_matrix_df.to_pickle(weighted_matrix_df_pickle_file)

In [17]:
weighted_matrix_df

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [18]:
# weighted_df_pickle_file = "weighted_df.df"
# if os.path.exists(weighted_df_pickle_file):
#     weighted_df = pd.read_pickle(weighted_df_pickle_file)
# else:
#     weighted_df = create_weighted_df(weighted_matrix_df)
#     weighted_df.to_pickle(weighted_df_pickle_file)

In [19]:
# weighted_df

In [20]:
# weighted_df.columns

In [21]:
# for i in [3, 2, 1]:
# G = nx.from_pandas_edgelist(
#         weighted_df, source='source',
#         target='target', edge_attr='weight')

plt.figure(figsize=(35,35))

G = nx.from_numpy_array(weighted_matrix_np)
graph_pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, graph_pos, node_size=10, node_color='blue', alpha=0.3)
nx.draw_networkx_edges(G, graph_pos)
nx.draw_networkx_labels(G, graph_pos, font_size=8, font_family='sans-serif')

# plt.show()

# nx.draw(G)

KeyboardInterrupt: 

In [None]:
plt.savefig("plot.svg", dpi=1200)

<Figure size 640x480 with 0 Axes>

In [None]:
net = Network(notebook=True)
net.from_nx(G)
net.show("example.html")