In [1]:
# Standard IPython notebook imports
%matplotlib inline

import os
import csv
import math
import ast
import json
import shutil

import pyarrow
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn.metrics

import subprocess

from tqdm.notebook import tqdm
from itertools import chain
from typing import Optional

pd.options.display.max_colwidth = 300

In [3]:
users_csv="dataset/user-filtered.csv"
users_df = pd.read_csv(users_csv, sep=",")

In [4]:
users_df.head() # is user data loaded?
len(users_df)

109224747

In [5]:
class AnimeRecomendation:
    def __init__(self):
        self.users_df = pd.DataFrame()
        self.users_count = None
        self.columns = ["user", "anime"]
        self.grouped_columns = ["user_id", "id_rating"]
        self.tsv_filename = None

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        self.tsv_filename = tsv_filename
        self.grouped_df.to_csv(self.tsv_filename,
                               index=False,
                               sep='\t',
                               columns=self.grouped_columns,
                               mode='w',
                               header=False)

    def fit(self,
            users_df,
            lines: Optional[int] = None,
            rating_threshold: int = 6):

        self.users_df = users_df.head(int(lines)) \
                        if lines is not None else users_df

        def adj_mult(rating: int) -> int:
            match rating:
                case 6 | 7: return 1
                case 8 | 9: return 2
                case 10: return 3
                case _: return 0
    
        def agg_fun(anime: str, rating: int) -> str:
            return " ".join([str(anime)] * adj_mult(rating))

        self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))

        self.grouped_df = self.users_df[self.users_df.rating >= rating_threshold]                 \
            .groupby("user_id")["id_rating"]                                                      \
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime, rating) in animes])) \
            .reset_index()

        return self.number_of_users()

    def from_csv(self, file: str):
        self.grouped_df = pd.read_csv(file,
                                      sep="\t",
                                      header=None,
                                      names=self.grouped_columns)

    def choose(self, nousers: int):
        self.grouped_df = self.grouped_df.sample(n=nousers)

    def cleora_train(self, cleora_exe="cleora", dimensions = 32, iter = 16):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError("cleora executable not found")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(dimensions),
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)


In [74]:
model = AnimeRecomendation()

In [75]:
%%time

LINES = 40000000
RATING_THRESHOLD = 6

model.fit(users_df, lines=LINES, rating_threshold=RATING_THRESHOLD)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))


CPU times: user 50.6 s, sys: 1.91 s, total: 52.5 s
Wall time: 52.7 s


129988

In [76]:
model.choose(20000)
model.grouped_df

Unnamed: 0,user_id,id_rating
46121,52057,39792 39792 42203 42203 41762 41762 6547 9989 31964 31964 33486 33486 38186 28999 14741 18671 18671 16934 2167 4059 4059 4059 1575 1575 26349 26349 26349 35849 35849 37982 37982 38145 38544 3784 3784 3785 227 227 38680 38680 71 37998 39417 39417 17729 10793 31859 934 934 185 185 38472 14719 2089...
110191,124385,36456 36456 35760 35760 30123 31173 22199 19429 6547 24833 30654 21995 30205 31754 31964 31964 33486 33486 21405 33056 34173 28999 34382 32189 33028 16592 11843 35849 9982 74 28825 6008 7054 10298 9366 32281 34389 33489 585 16067 36038 34383 31414 20507 30503 20767 26243 28927 30015 36752 13759 ...
92443,104444,34572 34572 38101 38101 30123 30123 31173 31173 22199 22199 22147 6547 6547 9989 9989 15039 15039 11111 24833 30654 30654 28405 9919 33506 7817 7817 32827 5081 20787 20787 269 31043 31043 31964 33486 36456 38186 14345 28999 28999 34382 34382 38940 35849 35849 6880 6880 1535 1535 28223 28223 3512...
108375,122319,42897 42897 40748 40748 21 21 39617 37956 38101 22147 22147 9989 9989 9989 9919 36882 5081 5081 5081 10330 889 889 1519 1519 269 269 11633 40060 31043 31043 31964 31964 33486 36456 36456 38408 42603 10719 14967 31478 32867 38003 10163 28999 28999 30485 10800 10800 39523 14741 14741 15583 1535 15...
9655,10930,457 457 457 20767 20767 18179 12291 16273 101 101 102 19429 6547 6547 9919 9919 11266 8475 8475 8475 269 269 269 31964 31964 31964 1222 1222 1689 1689 1689 59 14741 14741 14741 18671 18671 61 6880 6880 6880 1535 1535 28223 28223 28223 5593 5593 30346 30346 30346 18507 18507 243 8768 15895 15895 ...
...,...,...
41226,46539,21 21 32998 30123 22199 26449 25013 41433 6547 9062 35015 37924 39959 42238 9989 9989 11111 11701 24833 30654 30654 19759 21995 21995 9919 11737 33506 28249 31821 39195 39195 13535 530 740 532 1239 996 20787 11633 40060 31043 31043 31964 1222 14345 14345 31478 31478 32867 32867 38003 38003 33071...
89685,101346,33975 33975 33975 30829 30829 20913 20913 36904 36904 37769 47 47 47 6547 9989 9989 9989 7475 11111 11111 21995 21995 21995 9919 9919 7647 7647 9074 9074 1028 1028 36214 36214 5168 37171 37171 37171 37996 38002 38002 38002 9736 10582 586 7817 28805 28805 5081 5081 5081 7674 7674 34636 36649 3664...
38597,43564,30654 31964 31964 30694 30694 31798 31798 28623 31741 31741 31772 31772 31240 31240 32595 31327 32105 31430 31098 31098 5525 6682 25397 22199 22199 24833 9919 9919 11737 11737 13677 11266 11266 10647 20787 31043 31043 31478 31478 28999 28999 31553 31553 31636 6880 1535 28223 29589 31710 223 223 ...
39187,44221,1470 1470 1470 1491 1491 1491 2998 2998 170 170 47 47 47 1348 1348 2755 2755 1661 1661 1661 1030 1030 1030 1358 1358 1358 431 431 731 731 731 3220 3220 3220 572 572 43 43 416 416 1430 1430 1430 522 522 522 875 875 323 323 30 30 30 437 437 437 210 210 2000 2000 2000 199 199 199 2154 2154 2154 513...


In [77]:
%%time
# constructing the tsv data

tsv_file_name = "data2.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 954 ms, sys: 59.8 ms, total: 1.01 s
Wall time: 1.07 s


In [78]:
%%time

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train()

[0m[38;5;8m[[0m2024-02-07T23:50:42Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 32,
    max_number_of_iteration: 16,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "data2.tsv",
    ],
    file_type: Tsv,
    output_dir: Some(
        "results",
    ),
    output_format: Numpy,
    relation_name: "emb",
    columns: [
        Column {
            name: "user",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "anime",
            transient: false,
            complex: true,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-02-07T23:50:42Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipeli

CPU times: user 12.6 ms, sys: 5.02 ms, total: 17.7 ms
Wall time: 3.44 s


[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 14. Dims: 32, entities: 34246, num data points: 7009582.
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 15. Dims: 32, entities: 34246, num data points: 7009582.
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-02-07T23:50:45Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 3 sec


In [2]:
class RatingGenerator:
    def __init__(self):
        self.users_count = None
        self.columns = ["user", "anime"]
        self.rankings = dict()

    def get_artifacts(self):
        p = "results/emb__"
        files = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]

        return {f: f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}"
                for idx, f in enumerate(files)}

    def load_artifacts(self):
        artifacts = self.get_artifacts()
        with open(artifacts['labels'], "r") as entities:
            self.labels = np.array([int(i) for i in json.load(entities)])
        # Load results to numpy
        self.vects_iter = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = np.where(self.labels == idx)[0][0]

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1),
                                                          self.vects_iter,
                                                          dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = self.labels[ranking[:15]]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[idx]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):

        self.load_artifacts()
        custom_ranking = dict()

        for idx in tqdm(already_watched):
            if idx not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(),
                           reverse=True,
                           key=lambda x: x[1]))

In [3]:
two_mode_data="dataset/anime-dataset-2023.csv"
anime_df = pd.read_csv(two_mode_data, sep=",")

In [4]:
# Ranking

ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242])
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

Upotte!! Miatte Waratte: 1
Re:Zero kara Hajimeru Isekai Seikatsu - Memory Snow - Manner Movie: 1
Mametarou Ganbare: 1
Kappa no Sanpei: 1
Tamayura: Hitotose - Attakai Kaze no Omoide, nanode: 1
Shima Shima Tora no Shimajirou: 1
Zouressha ga Yatte Kita: 1
IRoid: Koi no Yuukou Frontier: 1
100% Renewable Energy: 1
Hachi-nan tte, Sore wa Nai deshou!: 1
Ongaku Shoujo (TV): 1
Xingchen Bian: Yu Li Cang Hai: 1
Gakumon!: Ookami Shoujo wa Kujikenai: 1
Aura: Maryuuin Kouga Saigo no Tatakai: 1
Tarzan: 1
Ani ni Tsukeru Kusuri wa Nai!: 1
X-Men: 1
Spo-chan Taiketsu: Youkai Daikessen: 1
Spider Riders: Oracle no Yuusha-tachi: 1
Ooedo Forty Eight: 1
Grim: 1
Dies Irae: 1
Lilac (Bombs Jun Togawa): 1
Escha Chron: 1
Muku na Kisetsu: 1
Hellsing Ultimate: 1
Lord El-Melloi II Sei no Jikenbo: Rail Zeppelin Grace Note: 1
Granblue Fantasy The Animation: Kabocha no Lantern: 1
The iDOLM@STER Cinderella Girls: Special Program: 1
Starmyu OVA: 1
Blade & Soul: 1
Anata no Shiranai Kangofu: Seiteki Byoutou 24 Ji: 1
Kitsuts

In [7]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_by_name(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

find_by_name("pokemon").head(100)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
494,527,Pokemon,Pokémon,ポケットモンスター,7.37,"Action, Adventure, Comedy, Fantasy","Pokémon are peculiar creatures with a vast array of different abilities and appearances; many people, known as Pokémon trainers, capture and train them, often with the intent of battling others. Young Satoshi has not only dreamed of becoming a Pokémon trainer but also a ""Pokémon Master,"" and on ...",TV,276.0,"Apr 1, 1997 to Nov 14, 2002",...,OLM,Game,24 min per ep,PG - Children,2205.0,326,5125,386995.0,607481,https://cdn.myanimelist.net/images/anime/13/73834.jpg
495,528,Pokemon Movie 01: Mewtwo no Gyakushuu,Pokémon: The First Movie,ポケットモンスター ミュウツーの逆襲,7.64,"Action, Adventure, Award Winning, Comedy, Drama, Fantasy","Using the fossil of the Legendary Pokémon Mew, Team Rocket has created the first successful Pokémon clone. Named Mewtwo, the man-made Pokémon quickly becomes upset with the circumstances of its creation and goes on a rampage. It is soon approached by Sakaki, the leader of Team Rocket, and is man...",Movie,1.0,"Jul 18, 1998",...,OLM,Game,1 hr 25 min,PG - Children,1286.0,877,397,173195.0,255480,https://cdn.myanimelist.net/images/anime/13/65699.jpg
1016,1117,Pokemon Movie 02: Maboroshi no Pokemon Lugia Bakutan,Pokémon: The Movie 2000,ポケットモンスター 幻のポケモンルギア爆誕,7.34,"Adventure, Comedy, Drama, Fantasy","An ancient prophecy tells of a day when the titans of ice, lightning, and fire are disturbed. When this occurs, Lugia, the guardian of the sea, will rise up and restore harmony. Enchanted by the words of the prophecy, Gelardan, a Pokémon collector, sets out on his airship with a dreadful plan to...",Movie,1.0,"Jul 17, 1999",...,OLM,Game,1 hr 20 min,PG - Children,2344.0,1198,205,121050.0,184370,https://cdn.myanimelist.net/images/anime/11/41801.jpg
1017,1118,Pokemon Movie 03: Kesshoutou no Teiou Entei,Pokémon 3: The Movie,ポケットモンスター 結晶塔の帝王 ENTEI,7.08,"Action, Adventure, Comedy, Drama, Fantasy","Mii Snowdon is left on her own after her father disappears while investigating the mysterious letter-shaped Pokémon called the Unown. The only clue to her father's disappearance is a box containing several tiles. While playing with these tiles, Mii makes a wish to see her father again, and this ...",Movie,1.0,"Jul 8, 2000",...,OLM,Game,1 hr 31 min,PG - Children,3633.0,1275,144,112069.0,173426,https://cdn.myanimelist.net/images/anime/7/80288.jpg
1018,1119,Pokemon Movie 04: Celebi Toki wo Koeta Deai,Pokémon 4Ever,ポケットモンスター セレビィ 時を越えた遭遇（であい）,6.9,"Adventure, Comedy, Drama, Fantasy","Serebii, a Legendary Pokémon known for its ability to traverse time, is hunted by an unnamed Pokémon poacher seeking to capture it. Yukinari, a young Pokémon trainer who enjoys drawing portraits of Pokémon, tries to protect Serebii after it stumbles upon him; but in the middle of its escape, bot...",Movie,1.0,"Jul 7, 2001",...,OLM,Game,1 hr 19 min,PG - Children,4396.0,1428,65,95533.0,150163,https://cdn.myanimelist.net/images/anime/2/41799.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16315,40881,Pokemon Movie 23: Koko,Pokémon the Movie: Secrets of the Jungle,劇場版ポケットモンスター ココ,7.28,"Action, Adventure, Comedy, Fantasy","The new film's story is set in Okoya Forest, a Pokémon paradise protected by strict rules that forbid outsiders from setting foot inside. The film centers on Koko, a boy who was raised by Pokémon and also considers himself as one, treating the Mythical Pokémon Zarude as his father. Ash and Pikac...",Movie,1.0,"Dec 25, 2020",...,OLM,Game,1 hr 39 min,PG - Children,2641.0,4273,32,9896.0,23305,https://cdn.myanimelist.net/images/anime/1029/106237.jpg
16680,41612,Oshougatsuda yo! Pokemon Special!,UNKNOWN,お正月だよ！ポケットモンスタースペシャル！,5.84,"Action, Adventure, Comedy, Fantasy",A recap special of the first seventy-seven episodes of the Pokemon anime.,Special,1.0,"Jan 1, 1999",...,OLM,Game,1 hr 30 min,PG - Children,9554.0,14366,0,143.0,587,https://cdn.myanimelist.net/images/anime/1045/106871.jpg
16838,42135,Pokemon Advanced Generation Planetarium: Tenkuu Kara no Chousen,UNKNOWN,ポケットモンスターアドバンスジェネレーション　プラネタリウム　天空からの挑戦,6.24,"Action, Adventure, Comedy, Fantasy","This is a world where people and Pokemon live together in peace. Satoshi from Masara Town set out on a journey to train to become a Pokemon Master. One day, Satoshi and Pikachu visit a forest with lots and lots of Pokemon. Suddenly, Satoshi and Pikachu are swallowed up by a world of darkness!...",Movie,1.0,"Jul 17, 2004",...,UNKNOWN,Game,30 min,PG - Children,7693.0,14349,0,131.0,590,https://cdn.myanimelist.net/images/anime/1069/107847.jpg
16839,42136,Pokemon Diamond & Pearl Atsumare! Pokemon Planet Center,UNKNOWN,ポケットモンスターダイヤモンドパール・あつまれ！ポケモンプラネットセンター,6.53,"Action, Adventure, Comedy, Fantasy","Hey everyone! I'm Satoshi! Have you ever looked up at the night sky and gazed at the stars floating up there? Just like with Pokemon, the star-studded sky has lots and lots of mysteries! The Pokemon Planet Center is a place where you can solve some of the mysteries of space together with Pikach...",Movie,1.0,"Jul 2, 2006",...,UNKNOWN,Game,26 min,PG - Children,6214.0,10734,1,569.0,1963,https://cdn.myanimelist.net/images/anime/1492/108084.jpg


In [3]:
def df_from_array(array):
    return pd.DataFrame(data=array[1:,1:], index=array[1:,0], columns=array[0,1:])

def read_data(file):
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [ np.array([i] + list(row)) for i, row in enumerate(reader, start=-1)]

def get_genre_list(genres: str):
  return list(map(lambda x: x.strip().lower(), genres.split(",")))

def filter_out(word, col):
  return lambda row: not (word.lower() in get_genre_list(row[col]))

def filter_in(word, col):
  return lambda row: (word.lower() in get_genre_list(row[col]))

def create_two_mode_df(data):
    sub_df = data[["anime_id", "English name", "Genres"]]
    result = [['id', 'source', 'target']]
    i=0
    for _, row in sub_df.iterrows():
        for genre in get_genre_list(row['Genres']):
            to_append = [i, row['English name'], genre]
            result.append(to_append)
            i+=1
    return df_from_array(np.array(result))

def create_weighted_matrix(two_mode_df):
    assert(two_mode_df.columns.tolist() == ['source', 'target'])
    source_uniq = two_mode_df['source'].unique()
    source_len = source_uniq.shape[0]
    source_dict = {k: v for v, k in enumerate(source_uniq)}
    array = np.zeros((source_len, source_len), dtype=int)

    for _, source1 in tqdm(two_mode_df.iterrows(), total=len(two_mode_df.index)):
        for _, source2 in two_mode_df.iterrows():
            if source1['target'] == source2['target']:
                array[source_dict[source1['source']], source_dict[source2['source']]] += 1
    return pd.DataFrame(data=array, columns=source_uniq, index=source_uniq)

def create_weighted_matrix2(two_mode_df):
    adj_df = pd.crosstab(two_mode_df.source, two_mode_df.target)
    adj_np = adj_df.to_numpy()
    adj_np2 = adj_np@adj_np.T
    return (adj_np2, adj_df.index)


def create_weighted_df(matrix_df):
    result=[['', 'source', 'target', 'weight']]
    c = 0
    for rowIndex, row in tqdm(matrix_df.iterrows(), total=len(matrix_df.index)): #iterate over rows
        for colIndex, value in row.items():
            weight = matrix_df.at[rowIndex, colIndex]
            if weight > 0:
                to_append = [str(c), rowIndex, colIndex, weight]
                result.append(to_append)
                c += 1
    return df_from_array(np.array(result))


In [4]:
raw_data = read_data(two_mode_data)
# raw_data = raw_data[:1000]

In [5]:
all_data = df_from_array(np.array(raw_data))

In [6]:
all_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
all_data = all_data[(all_data['Score'] != "UNKNOWN") & (all_data['Score'] != '')].astype({'Score': 'float'})
# all_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])
# all_data = all_data[all_data['Score'] > 7.0]
important_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])

In [8]:
movies = important_data[important_data['Type'] == "Movie"]
TVs = important_data[important_data['Type'] == "TV"]
specials = important_data[important_data['Type'] == "Special"]

specified = important_data

not_hentai = specified[specified.apply(filter_out("Hentai", 'Genres'), axis=1)]
hentai = specified[specified.apply(filter_in("Hentai", 'Genres'), axis=1)]

mappa = not_hentai[not_hentai.apply(filter_in("Mappa", "Studios"), axis=1)]

In [9]:
# not_hentai.sort_values(['Score'], ascending=False)
print(len(hentai))
print(len(not_hentai))

1465
14227


In [10]:
data = all_data[["anime_id", "English name", "Genres", "Score"]]
data = data[(data['Score'] != "UNKNOWN") & (data['Score'] != '')].astype({'Score': 'float'})

In [11]:
data.dtypes

mapped = map(get_genre_list, data["Genres"].to_numpy().flatten().tolist())

uniq = np.unique(list(chain.from_iterable(mapped))).tolist()

print(len(uniq), uniq)

22 ['action', 'adventure', 'avant garde', 'award winning', 'boys love', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girls love', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'slice of life', 'sports', 'supernatural', 'suspense', 'unknown']


In [12]:
two_mode_df = create_two_mode_df(data)

two_mode_df

Unnamed: 0,source,target
0,Cowboy Bebop,action
1,Cowboy Bebop,award winning
2,Cowboy Bebop,sci-fi
3,Cowboy Bebop: The Movie,action
4,Cowboy Bebop: The Movie,sci-fi
...,...,...
30955,One Piece: Recapping Fierce Fights! The Counte...,action
30956,One Piece: Recapping Fierce Fights! The Counte...,adventure
30957,One Piece: Recapping Fierce Fights! The Counte...,comedy
30958,One Piece: Recapping Fierce Fights! The Counte...,fantasy


In [13]:
(weighted_matrix_np, weighted_matrix_index) = create_weighted_matrix2(two_mode_df)

In [14]:
weighted_matrix_df2 = pd.DataFrame(data=weighted_matrix_np, index=weighted_matrix_index, columns=weighted_matrix_index)


In [15]:
weighted_matrix_df2

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [16]:
weighted_matrix_df_pickle_file = "weighted_matrix_df.df"
if os.path.exists(weighted_matrix_df_pickle_file):
    weighted_matrix_df = pd.read_pickle(weighted_matrix_df_pickle_file)
else:
    weighted_matrix_df = weighted_matrix_df2
    weighted_matrix_df.to_pickle(weighted_matrix_df_pickle_file)

In [17]:
weighted_matrix_df

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [18]:
# weighted_df_pickle_file = "weighted_df.df"
# if os.path.exists(weighted_df_pickle_file):
#     weighted_df = pd.read_pickle(weighted_df_pickle_file)
# else:
#     weighted_df = create_weighted_df(weighted_matrix_df)
#     weighted_df.to_pickle(weighted_df_pickle_file)

In [19]:
# weighted_df

In [20]:
# weighted_df.columns

In [21]:
# for i in [3, 2, 1]:
# G = nx.from_pandas_edgelist(
#         weighted_df, source='source',
#         target='target', edge_attr='weight')

plt.figure(figsize=(35,35))

G = nx.from_numpy_array(weighted_matrix_np)
graph_pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, graph_pos, node_size=10, node_color='blue', alpha=0.3)
nx.draw_networkx_edges(G, graph_pos)
nx.draw_networkx_labels(G, graph_pos, font_size=8, font_family='sans-serif')

# plt.show()

# nx.draw(G)

KeyboardInterrupt: 

In [None]:
plt.savefig("plot.svg", dpi=1200)

<Figure size 640x480 with 0 Axes>

In [None]:
net = Network(notebook=True)
net.from_nx(G)
net.show("example.html")