In [3]:
# Standard IPython notebook imports
%matplotlib inline

import os
import json

import csv
import networkx as nx
import pyarrow
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import ast
from itertools import chain
import matplotlib.pyplot as plt
import subprocess
import sklearn.metrics
from typing import Optional
import shutil
pd.options.display.max_colwidth = 300

In [4]:
two_mode_data="dataset/anime-dataset-2023.csv"
users_csv="dataset/user-filtered.csv"

In [5]:
%%time
# importing data from csv to pandas

class Data_factory:
    def from_csv(file: str) -> pd.DataFrame:
        return pd.read_csv(file)

anime_df = Data_factory.from_csv(two_mode_data)
users_df = Data_factory.from_csv(users_csv)

CPU times: user 33.8 s, sys: 30.7 s, total: 1min 4s
Wall time: 2min 34s


In [6]:
users_df.head() # is user data loaded?
len(users_df)

109224747

In [26]:
class AnimeRecomendation:
    def __init__(self, dimensions = 32, iter = 16):
        self.users_df     = pd.DataFrame()
        self.users_count  = None
        self.dimensions   = dimensions
        self.iterations   = iter
        self.columns      = ["user", "anime"]
        self.tsv_filename = None
        self.rankings     = dict()

    def anime_label(idx: int) -> str:
        return f"anime__{idx}"

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        grouped_df        = self.grouped_df
        users_count       = self.number_of_users()
        self.tsv_filename = tsv_filename
        columns_to_keep   = ['user_id', 'id_rating']
        grouped_df.to_csv(tsv_filename, index=False, sep='\t', columns=columns_to_keep, mode='w', header=False)

    def fit(self, users_df, lines: Optional[int] = None, rating_threshold: int = 6):
        if lines is not None:
            self.users_df = users_df.head(int(lines))
            users_df = self.users_df
        else:
            self.users_df = users_df
            
        self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))
    
        def agg_fun(anime, rating):
            return " ".join([str(anime) for _ in range(rating - rating_threshold + 1)])

        self.grouped_df = users_df[users_df.rating >= rating_threshold]                          \
            .groupby("user_id")["id_rating"]                                                     \
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime,rating) in animes])) \
            .reset_index()

            
#         self.grouped_df = self.grouped_df \
#             .filter(items=['user_id', 'id_rating'])  \
#             .apply(list, axis=1)                                           \
#             .agg(lambda animes: " ".join([str(anime) for anime in animes])) \

#         self.number_of_users()
        return self.grouped_df

    def get_artifacts(self, name: str):
        d = dict()
        p = "results/emb__"
        files  = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]
    
        return { f:f"{p}{name}__{name}{suf[idx]}" 
                    for idx, f in enumerate(files) }

    def cleora_train(self, cleora_exe="cleora"):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError("cleora executable not found")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns={self.columns[0]} complex::reflexive::{self.columns[1]}",
                   "--dimension", str(self.dimensions),
                   "--number-of-iterations", str(self.iterations),
                   "--prepend-field-name", "1",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "0",
                   self.tsv_filename]
        subprocess.run(command, check=True, stderr=subprocess.DEVNULL)

    def load_artifacts(self):
        artifacts = self.get_artifacts(self.columns[1])
        with open(artifacts['labels'], "r") as entities:
            self.labels     = json.load(entities)
        # Load results to numpy
        self.vects_iter     = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = self.labels.index(f"anime__{idx}")

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1), self.vects_iter, dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = ranking[:15]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[f"anime__{idx}"]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):
        
        self.load_artifacts()
        custom_ranking = dict()
        
        for idx in tqdm(already_watched):
            if f"anime__{idx}" not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(), reverse=True, key=lambda x:x[1]))

In [27]:
%%time

LINES = 2000000
RATING_THRESHOLD = 6

model = AnimeRecomendation()
model.fit(users_df, lines=LINES, rating_threshold=RATING_THRESHOLD)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))


CPU times: user 3.37 s, sys: 305 ms, total: 3.68 s
Wall time: 4.26 s


Unnamed: 0,user_id,id_rating
0,0,67 67 67 67 6702 6702 242 242 242 242 242 21 21 21 21 21 24 24 24 24 4722 4722 4722 6098 3125 3125 3125 3125 481 481 481 481 481 68 1689 2913 1250 1250 356 356 356 356 121 121 121 121 430 430 430 430 1829 1829 1571 1571 1571 1571 1571 578 578 578 578 578 431 431 431 2762 2762 2762 2762 570 570 3...
1,1,37403 37403 37403 7674 7674 34566 34566 40852 40852 40852 40852 10087 10087 40052 40052 40052 40748 40748 40748 40748 21 21 21 21 26243 26243 42203 42203 42203 42203 40028 40028 40028 40028 40028 3972 3972 481 481 481 22199 22199 6547 9919 9919 9919 5081 5081 5081 31043 31043 31043 31043 31964 3...
2,2,235 235 235 235 235 5042 5042 5042 7593 7593 7593 21 21 21 21 22 22 22 22 5762 5762 31580 31580 35028 35028 35028 368 368 368 31964 31964 31964 33486 33486 33486 33486 31740 31740 1575 1575 1575 1575 2904 2904 2904 2904 2904 1535 1535 1535 1535 1535 28223 28223 28223 226 226 226 38671 38671 3867...
3,3,6114 6114 6114 199 199 199 199 849 849 849 33352 33352 33352 33352 33352 31646 31646 31646 31646 32998 32998 32998 25397 12291 12291 1292 1292 1292 34881 34881 34881 32323 32323 22199 22199 4744 4744 47 47 47 47 22729 22729 32828 32828 32828 22147 22147 22147 6547 6547 6547 9989 9989 9989 11433 ...
4,4,101 101 101 656 656 656 3549 3549 3549 3359 3359 104 104 104 530 530 68 68 150 150 150 150 819 819 819 1222 1222 1222 105 105 105 1542 1542 1542 1689 1689 1689 1689 1689 232 232 232 232 371 371 372 372 372 372 4975 831 831 59 59 1575 1575 1575 1575 1575 2904 2904 2904 2904 2904 61 61 1535 1535 1...
...,...,...
6003,6806,28977 28977 28977 28977 28977 21 21 21 21 21 205 205 205 31646 31646 31646 31646 31646 35180 35180 35180 35180 35180 50 50 12291 12291 12291 30123 30123 30123 31173 31173 31173 30370 30370 30370 9919 9919 9919 9919 11266 11266 11266 33506 33506 33506 7647 7647 7647 9074 9074 9074 12967 12967 282...
6004,6807,31646 31646 31646 35180 35180 35180 35180 32998 32998 22199 25013 30370 47 47 47 16201 16201 16201 16201 22147 22147 6547 6547 6547 6547 9989 9989 9989 9989 9989 11433 11111 14669 14669 14669 2251 2251 2251 2251 5081 5081 5081 5081 6948 7674 7674 7674 10030 10030 10030 12365 12365 12365 12365 22...
6005,6808,9989 9989 9989 9989 15039 15039 11111 11111 9919 9919 11737 11737 11737 11266 33506 33506 10647 10647 10647 7817 7817 31964 31964 31964 31964 31964 33486 33486 33486 33486 33486 35262 36456 36456 36456 36456 36456 36896 36896 36896 36896 1 1 32494 1535 1535 1535 1535 35120 35120 35120 18507 1850...
6006,6809,40852 40852 40852 40852 42897 42897 42897 40748 40748 40748 40748 40028 40028 40028 40028 40028 42923 42923 42923 31580 31580 16201 16201 16201 16201 16201 9989 9989 9989 9989 24833 24833 24833 24833 21995 21995 21995 9919 9919 9919 36649 36649 36649 36649 36649 39195 39195 39195 31043 31043 310...


In [115]:
model.grouped_df

Unnamed: 0,user_id,id_rating
0,0,"([67, 6702, 242, 21, 24, 4722, 6098, 3125, 481, 68, 1689, 2913, 1250, 356, 121, 430, 1829, 1571, 578, 431, 2762, 570, 3418, 3010, 433, 600, 2034, 164, 4086, 2248, 1047, 459, 466, 2543, 419, 199, 169, 2547, 2236, 415, 1894, 269, 235], [9, 7, 10, 10, 9, 8, 6, 9, 10, 6, 6, 6, 7, 9, 9, 9, 7, 10, 10,..."
1,1,"([37403, 7674, 34566, 40852, 10087, 40052, 40748, 21, 26243, 42203, 40028, 3972, 481, 22199, 6547, 9919, 5081, 31043, 31964, 33486, 36456, 38408, 28755, 1575, 35849, 1535, 18689, 38731, 30230, 38691, 38671, 40956, 5114, 37349, 20583, 28891, 32935, 38883, 40776, 37259, 11061, 37497, 37999, 40221,..."
2,2,"([235, 5042, 7593, 21, 22, 5762, 31580, 35028, 368, 31964, 33486, 31740, 1575, 2904, 1535, 28223, 226, 38671, 32872, 15, 71, 7661, 263, 270, 24703, 15451, 8074, 11061, 28961, 34542, 14719, 20899, 26055, 34933, 37086, 1604, 22535, 189, 34599, 29575, 10620, 32182, 23755, 20, 1735, 35581, 5040, 302..."
3,3,"([6114, 199, 849, 33352, 31646, 32998, 25397, 12291, 1292, 34881, 32323, 22199, 4744, 47, 22729, 32828, 22147, 6547, 9989, 11433, 11111, 9919, 33506, 7647, 6166, 4999, 66, 7817, 2251, 5081, 7674, 10030, 22789, 10396, 13535, 20787, 889, 1519, 32086, 34618, 11633, 31043, 10719, 14967, 38186, 14345..."
4,4,"([101, 656, 3549, 3359, 104, 530, 68, 150, 819, 1222, 105, 1542, 1689, 232, 371, 372, 4975, 831, 59, 1575, 2904, 61, 1535, 345, 2035, 356, 120, 71, 123, 380, 381, 124, 4725, 74, 127, 1571, 476, 16, 1142, 644, 4551, 5630, 578, 3731, 5680, 145, 1691, 6045, 1579, 3392, 146, 4898, 690, 189, 190, 191..."
...,...,...
6003,6806,"([28977, 21, 205, 31646, 35180, 50, 12291, 30123, 31173, 30370, 9919, 11266, 33506, 7647, 9074, 12967, 28249, 1946, 9736, 2251, 3901, 109, 36649, 22789, 14751, 31733, 889, 4901, 11633, 31964, 33486, 36456, 38408, 35459, 10719, 1222, 357, 31478, 32867, 1689, 4535, 37435, 10800, 14397, 37379, 59, ..."
6004,6807,"([31646, 35180, 32998, 22199, 25013, 30370, 47, 16201, 22147, 6547, 9989, 11433, 11111, 14669, 2251, 5081, 6948, 7674, 10030, 12365, 22789, 57, 889, 4901, 1519, 269, 31043, 31964, 33486, 36456, 21405, 28755, 1689, 5356, 10800, 14397, 14741, 1470, 2167, 4181, 6351, 4059, 1818, 1575, 2904, 8142, 1..."
6005,6808,"([9989, 15039, 11111, 9919, 11737, 11266, 33506, 10647, 7817, 31964, 33486, 35262, 36456, 36896, 1, 32494, 1535, 35120, 18507, 33845, 35191, 22265, 26213, 19671, 35198, 121, 5114, 7902, 6421, 908, 664, 430, 9135, 23289, 27633, 20583, 28891, 32935, 25303, 35806, 15809, 30415, 11061, 13271, 249, 4..."
6006,6809,"([40852, 42897, 40748, 40028, 42923, 31580, 16201, 9989, 24833, 21995, 9919, 36649, 39195, 31043, 31964, 33486, 36456, 38408, 37597, 16592, 6880, 1535, 28223, 35120, 37520, 38691, 38671, 2759, 3784, 3785, 6702, 18507, 41120, 121, 5114, 39533, 40421, 40052, 20583, 28891, 15809, 37972, 431, 11061,..."


In [28]:
%%time
# constructing the tsv data

tsv_file_name = "data3.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 927 ms, sys: 25.8 ms, total: 953 ms
Wall time: 1.03 s


In [35]:
# %%time
# # alternative

# lines_n = 1000000
# tsv_file_name_n = "data2.tsv"
# grouped_df_n =                             \
#     users_df[users_df.rating >= 6]         \
#     .head(lines_n)                         \
#     .filter(items=["user_id", "anime_id"]) \
#     .to_csv(tsv_file_name_n, index=False, sep="\t")

In [None]:
%%time

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train(cleora_exe="../cleora/target/release/cleora")

In [41]:
# Ranking

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = model.predict([67, 6702, 242])
print(ranking)
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, "Name")}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

{15: 2, 0: 1, 4922: 1, 5176: 1, 2723: 1, 1763: 1, 3077: 1, 2683: 1, 1482: 1, 5745: 1, 960: 1, 3856: 1, 6331: 1, 3582: 1, 1037: 1, 1: 1, 106: 1, 990: 1, 4698: 1, 4534: 1, 4532: 1, 1076: 1, 3171: 1, 10364: 1, 1201: 1, 1093: 1, 3221: 1, 44: 1, 2800: 1, 3071: 1, 2: 1, 1039: 1, 3200: 1, 1928: 1, 3105: 1, 2627: 1, 1787: 1, 3563: 1, 5635: 1, 5955: 1, 5707: 1, 4251: 1, 765: 1, 685: 1}
Eyeshield 21: 2
D4 Princess: 1
Midnight Panther: 1
Warau Salesman: 1
Tokyo Majin Gakuen Kenpucho: Tou Dai Ni Maku: 1
D.Gray-man: 1
Shin Chou Kyou Ryo: Condor Hero II: 1
Ten Little Gall Force: 1
Mashou no Kao: 1
Saint Beast: Seijuu Kourin-hen: 1
Cowboy Bebop: 1
Hana yori Dango: 1
Kamisama Kazoku: 1
Jokei Kazoku: Inbou: 1
Choujin Locke: Shinsekai Sentai: 1
Taiyou no Yuusha Fighbird: 1
Wizardry: 1
Ryoujoku Hitozuma Onsen: 1
Oishinbo: 1
Shakugan no Shana-tan Movie: 1
Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen: 1
Candy Candy: 1
Uchuu Senkan Yamato: Aratanaru Tabidachi: 1
Mizuiro (2003): 1
Early Reins: 1
Chi

In [65]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, "naruto")))

anime_df.loc[mask.any(axis=1)]

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
10,20,Naruto,Naruto,ナルト,7.99,"Action, Adventure, Fantasy","Moments prior to Naruto Uzumaki's birth, a huge demon known as the Kyuubi, the Nine-Tailed Fox, attacked Konohagakure, the Hidden Leaf Village, and wreaked havoc. In order to put an end to the Kyuubi's rampage, the leader of the village, the Fourth Hokage, sacrificed his life and sealed the mons...",TV,220.0,"Oct 3, 2002 to Feb 8, 2007",...,Pierrot,Manga,23 min per ep,PG-13 - Teens 13 or older,599.0,8,76343,1883772.0,2717330,https://cdn.myanimelist.net/images/anime/13/17405.jpg
414,442,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!,Naruto the Movie 1: Ninja Clash in the Land of Snow,劇場版　NARUTO　大活劇！雪姫忍法帖だってばよ!!,7.11,"Action, Adventure, Fantasy","Naruto Uzumaki and his squadmates, Sasuke Uchiha and Sakura Haruno, are sent on a mission to escort a movie crew on its way to film in the Land of Snow. They soon find out that they are accompanying a famous actress, Yukie Fujikaze, who persistently refuses to travel there, making the trip far m...",Movie,1.0,"Aug 21, 2004",...,Pierrot,Manga,1 hr 22 min,PG-13 - Teens 13 or older,3493.0,786,283,176117.0,286178,https://cdn.myanimelist.net/images/anime/1231/134484.jpg
557,594,Naruto: Takigakure no Shitou - Ore ga Eiyuu Dattebayo!,Naruto: The Lost Story - Mission: Protect the Waterfall Village,滝隠れの死闘　オレが英雄だってばよ!,6.76,"Action, Adventure, Fantasy","After safely escorting the cowardly Takigakure leader Shibuki to his homeland, Naruto Uzumaki, Sasuke Uchiha, and Sakura Haruno are taken aback by the village's sudden invasion of rogue ninjas. As their teacher Kakashi Hatake had departed for Konohagakure beforehand, the team is left to deal wit...",Special,1.0,"Dec 20, 2003",...,Pierrot,Manga,40 min,PG-13 - Teens 13 or older,5025.0,2102,138,49370.0,88258,https://cdn.myanimelist.net/images/anime/11/20921.jpg
696,761,Naruto: Akaki Yotsuba no Clover wo Sagase,Naruto: Find the Crimson Four-leaf Clover!,ナルト 紅き四つ葉のクローバーを探せ,6.56,"Adventure, Comedy","When Konohamaru Sarutobi asks Naruto Uzumaki for help, the latter readily accepts to join his young friend on a special mission—the retrieval of the legendary crimson four-leaf clover, rumored to have the ability to grant any wish. The miraculous plant is seemingly Konohamaru's only hope to prev...",Special,1.0,"May 24, 2003",...,Pierrot,Manga,17 min,PG-13 - Teens 13 or older,6042.0,2152,90,45728.0,84150,https://cdn.myanimelist.net/images/anime/12/11240.jpg
848,936,Naruto Movie 2: Dai Gekitotsu! Maboroshi no Chiteiiseki Dattebayo!,Naruto the Movie 2: Legend of the Stone of Gelel,劇場版　NARUTO　大激突！幻の地底遺跡だってばよ,6.87,"Action, Adventure, Fantasy","In a tumultuous effort, the Sunagakure ninjas attempt to repel an unforeseen invasion of mysterious armored warriors on the Land of Wind. Shortly afterwards, the same armored troops led by Temujin—a skilled knight of impressive power—ambush Naruto Uzumaki, Sakura Haruno, and Shikamaru Nara, who ...",Movie,1.0,"Aug 6, 2005",...,Pierrot,Manga,1 hr 37 min,PG-13 - Teens 13 or older,4535.0,981,166,133610.0,228051,https://cdn.myanimelist.net/images/anime/1114/134485.jpg
975,1074,Naruto Narutimate Hero 3: Tsuini Gekitotsu! Jounin vs. Genin!! Musabetsu Dairansen Taikai Kaisai!!,Naruto: Finally a Clash!! Jounin vs. Genin!,NARUTO ナルティメットヒーロー3 ついに激突! 上忍VS下忍!! 無差別大乱戦大会開催!!,6.78,Action,"Konohagakure hosts a special tournament for ninjas of all ranks, stirring up fervor among the rookies who are eager to prove themselves in a competition against their superiors. Additionally, the winner is granted the opportunity to create their own village decree! Despite his enthusiasm to clai...",OVA,1.0,"Dec 22, 2005",...,Pierrot,Game,26 min,PG-13 - Teens 13 or older,4945.0,2249,104,41313.0,78696,https://cdn.myanimelist.net/images/anime/10/11244.jpg
1574,1735,Naruto: Shippuuden,Naruto Shippuden,-ナルト- 疾風伝,8.26,"Action, Adventure, Fantasy","It has been two and a half years since Naruto Uzumaki left Konohagakure, the Hidden Leaf Village, for intense training following events which fueled his desire to be stronger. Now Akatsuki, the mysterious organization of elite rogue ninja, is closing in on their grand plan which may threaten the...",TV,500.0,"Feb 15, 2007 to Mar 23, 2017",...,Pierrot,Manga,23 min per ep,PG-13 - Teens 13 or older,278.0,15,107735,1569553.0,2349413,https://cdn.myanimelist.net/images/anime/1565/111305.jpg
1963,2144,Naruto Movie 3: Dai Koufun! Mikazuki Jima no Animaru Panikku Dattebayo!,Naruto the Movie 3: Guardians of the Crescent Moon Kingdom,劇場版 NARUTO -ナルト- 大興奮!みかづき島のアニマル騒動だってばよ,6.92,"Action, Adventure, Fantasy","Led by Kakashi Hatake, Naruto Uzumaki, Sakura Haruno, and Rock Lee are tasked to escort the extravagant Prince Michiru Tsuki and his spoiled son Hikaru to the prosperous Land of Moon when the two return from a long trip around the world. As if guarding two whimsical high-ranked individuals was n...",Movie,1.0,"Aug 5, 2006",...,Pierrot,Manga,1 hr 34 min,PG-13 - Teens 13 or older,4323.0,1108,162,118920.0,202638,https://cdn.myanimelist.net/images/anime/1918/134487.jpg
2058,2248,Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai,Naruto: Hidden Leaf Village Grand Sports Festival,木ノ葉の里の大うん動会,6.86,"Action, Comedy, Fantasy","The Konohagakure Grand Sports Festival has begun with all ninja squads vying for the ultimate prize—a whole week of paid leave! Despite his enthusiasm to achieve victory alongside Sasuke Uchiha and Sakura Haruno, Naruto Uzumaki finds himself in a pinch due to unexpected digestion problems. Caugh...",Special,1.0,"Aug 21, 2004",...,Pierrot,Original,11 min,PG-13 - Teens 13 or older,4592.0,2303,67,42739.0,75873,https://cdn.myanimelist.net/images/anime/1/2473.jpg
2267,2472,Naruto: Shippuuden Movie 1,Naruto Shippuden the Movie 1,劇場版NARUTO -ナルト- 疾風伝,7.29,"Action, Adventure, Fantasy","A group of ninja is planning to revive a powerful demon, and once its spirit is reunited with its body, the world will be destroyed. The only way to prevent this from happening is for Shion, a shrine maiden, to seal it away for good.\n\nNaruto Uzumaki is tasked to guard her, but one thing stops ...",Movie,1.0,"Aug 4, 2007",...,Pierrot,Manga,1 hr 34 min,PG-13 - Teens 13 or older,2572.0,783,302,167460.0,286579,https://cdn.myanimelist.net/images/anime/1703/134493.jpg


In [3]:
def df_from_array(array):
    return pd.DataFrame(data=array[1:,1:], index=array[1:,0], columns=array[0,1:])

def read_data(file):
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [ np.array([i] + list(row)) for i, row in enumerate(reader, start=-1)]

def get_genre_list(genres: str):
  return list(map(lambda x: x.strip().lower(), genres.split(",")))

def filter_out(word, col):
  return lambda row: not (word.lower() in get_genre_list(row[col]))

def filter_in(word, col):
  return lambda row: (word.lower() in get_genre_list(row[col]))

def create_two_mode_df(data):
    sub_df = data[["anime_id", "English name", "Genres"]]
    result = [['id', 'source', 'target']]
    i=0
    for _, row in sub_df.iterrows():
        for genre in get_genre_list(row['Genres']):
            to_append = [i, row['English name'], genre]
            result.append(to_append)
            i+=1
    return df_from_array(np.array(result))

def create_weighted_matrix(two_mode_df):
    assert(two_mode_df.columns.tolist() == ['source', 'target'])
    source_uniq = two_mode_df['source'].unique()
    source_len = source_uniq.shape[0]
    source_dict = {k: v for v, k in enumerate(source_uniq)}
    array = np.zeros((source_len, source_len), dtype=int)

    for _, source1 in tqdm(two_mode_df.iterrows(), total=len(two_mode_df.index)):
        for _, source2 in two_mode_df.iterrows():
            if source1['target'] == source2['target']:
                array[source_dict[source1['source']], source_dict[source2['source']]] += 1
    return pd.DataFrame(data=array, columns=source_uniq, index=source_uniq)

def create_weighted_matrix2(two_mode_df):
    adj_df = pd.crosstab(two_mode_df.source, two_mode_df.target)
    adj_np = adj_df.to_numpy()
    adj_np2 = adj_np@adj_np.T
    return (adj_np2, adj_df.index)


def create_weighted_df(matrix_df):
    result=[['', 'source', 'target', 'weight']]
    c = 0
    for rowIndex, row in tqdm(matrix_df.iterrows(), total=len(matrix_df.index)): #iterate over rows
        for colIndex, value in row.items():
            weight = matrix_df.at[rowIndex, colIndex]
            if weight > 0:
                to_append = [str(c), rowIndex, colIndex, weight]
                result.append(to_append)
                c += 1
    return df_from_array(np.array(result))


In [4]:
raw_data = read_data(two_mode_data)
# raw_data = raw_data[:1000]

In [5]:
all_data = df_from_array(np.array(raw_data))

In [6]:
all_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
all_data = all_data[(all_data['Score'] != "UNKNOWN") & (all_data['Score'] != '')].astype({'Score': 'float'})
# all_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])
# all_data = all_data[all_data['Score'] > 7.0]
important_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])

In [8]:
movies = important_data[important_data['Type'] == "Movie"]
TVs = important_data[important_data['Type'] == "TV"]
specials = important_data[important_data['Type'] == "Special"]

specified = important_data

not_hentai = specified[specified.apply(filter_out("Hentai", 'Genres'), axis=1)]
hentai = specified[specified.apply(filter_in("Hentai", 'Genres'), axis=1)]

mappa = not_hentai[not_hentai.apply(filter_in("Mappa", "Studios"), axis=1)]

In [9]:
# not_hentai.sort_values(['Score'], ascending=False)
print(len(hentai))
print(len(not_hentai))

1465
14227


In [10]:
data = all_data[["anime_id", "English name", "Genres", "Score"]]
data = data[(data['Score'] != "UNKNOWN") & (data['Score'] != '')].astype({'Score': 'float'})

In [11]:
data.dtypes

mapped = map(get_genre_list, data["Genres"].to_numpy().flatten().tolist())

uniq = np.unique(list(chain.from_iterable(mapped))).tolist()

print(len(uniq), uniq)

22 ['action', 'adventure', 'avant garde', 'award winning', 'boys love', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girls love', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'slice of life', 'sports', 'supernatural', 'suspense', 'unknown']


In [12]:
two_mode_df = create_two_mode_df(data)

two_mode_df

Unnamed: 0,source,target
0,Cowboy Bebop,action
1,Cowboy Bebop,award winning
2,Cowboy Bebop,sci-fi
3,Cowboy Bebop: The Movie,action
4,Cowboy Bebop: The Movie,sci-fi
...,...,...
30955,One Piece: Recapping Fierce Fights! The Counte...,action
30956,One Piece: Recapping Fierce Fights! The Counte...,adventure
30957,One Piece: Recapping Fierce Fights! The Counte...,comedy
30958,One Piece: Recapping Fierce Fights! The Counte...,fantasy


In [13]:
(weighted_matrix_np, weighted_matrix_index) = create_weighted_matrix2(two_mode_df)

In [14]:
weighted_matrix_df2 = pd.DataFrame(data=weighted_matrix_np, index=weighted_matrix_index, columns=weighted_matrix_index)


In [15]:
weighted_matrix_df2

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [16]:
weighted_matrix_df_pickle_file = "weighted_matrix_df.df"
if os.path.exists(weighted_matrix_df_pickle_file):
    weighted_matrix_df = pd.read_pickle(weighted_matrix_df_pickle_file)
else:
    weighted_matrix_df = weighted_matrix_df2
    weighted_matrix_df.to_pickle(weighted_matrix_df_pickle_file)

In [17]:
weighted_matrix_df

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [18]:
# weighted_df_pickle_file = "weighted_df.df"
# if os.path.exists(weighted_df_pickle_file):
#     weighted_df = pd.read_pickle(weighted_df_pickle_file)
# else:
#     weighted_df = create_weighted_df(weighted_matrix_df)
#     weighted_df.to_pickle(weighted_df_pickle_file)

In [19]:
# weighted_df

In [20]:
# weighted_df.columns

In [21]:
# for i in [3, 2, 1]:
# G = nx.from_pandas_edgelist(
#         weighted_df, source='source',
#         target='target', edge_attr='weight')

plt.figure(figsize=(35,35))

G = nx.from_numpy_array(weighted_matrix_np)
graph_pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, graph_pos, node_size=10, node_color='blue', alpha=0.3)
nx.draw_networkx_edges(G, graph_pos)
nx.draw_networkx_labels(G, graph_pos, font_size=8, font_family='sans-serif')

# plt.show()

# nx.draw(G)

KeyboardInterrupt: 

In [None]:
plt.savefig("plot.svg", dpi=1200)

<Figure size 640x480 with 0 Axes>

In [None]:
net = Network(notebook=True)
net.from_nx(G)
net.show("example.html")