In [12]:
# Standard IPython notebook imports
%matplotlib inline

import os
import json

import csv
import networkx as nx
import pyarrow
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import ast
from itertools import chain
import matplotlib.pyplot as plt
import subprocess
import sklearn.metrics
from typing import Optional
import shutil
pd.options.display.max_colwidth = 300

In [13]:
two_mode_data="dataset/anime-dataset-2023.csv"
users_csv="dataset/user-filtered.csv"

In [14]:
%%time
# importing data from csv to pandas

class Data_factory:
    def from_csv(file: str, sep=",") -> pd.DataFrame:
        return pd.read_csv(file, sep=sep)

CPU times: user 25 µs, sys: 4 µs, total: 29 µs
Wall time: 33.4 µs


In [15]:
users_df = Data_factory.from_csv(users_csv)

In [16]:
users_df.head() # is user data loaded?
len(users_df)

109224747

In [43]:
class AnimeRecomendation:
    def __init__(self, dimensions = 32, iter = 16):
        self.users_df     = pd.DataFrame()
        self.users_count  = None
        self.dimensions   = dimensions
        self.iterations   = iter
        self.columns      = ["user", "anime"]
        self.tsv_filename = None
        
    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        grouped_df        = self.grouped_df
        self.tsv_filename = tsv_filename
        columns_to_keep   = ['user_id', 'id_rating']
        grouped_df.to_csv(tsv_filename, index=False, sep='\t', columns=columns_to_keep, mode='w', header=False)

    def fit(self, users_df, lines: Optional[int] = None, rating_threshold: int = 6):
        if lines is not None:
            self.users_df = users_df.head(int(lines))
            users_df = self.users_df
        else:
            self.users_df = users_df
            
        self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))
    
        def agg_fun(anime, rating):
            return " ".join([str(anime) for _ in range(rating - rating_threshold + 1)])

        self.grouped_df = users_df[users_df.rating >= rating_threshold]                          \
            .groupby("user_id")["id_rating"]                                                     \
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime,rating) in animes])) \
            .reset_index()

        return self.number_of_users()
    
    def choose(self, nousers: int):
        self.grouped_df = self.grouped_df.sample(n=nousers)
    
    def cleora_train(self, cleora_exe="cleora"):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError("cleora executable not found")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(self.dimensions),
                   "--number-of-iterations", str(self.iterations),
                   "--prepend-field-name", "1",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)


In [108]:
model = AnimeRecomendation(dimensions=128, iter=64)

In [109]:
%%time

LINES = 40000000
RATING_THRESHOLD = 8

model.fit(users_df, lines=LINES, rating_threshold=RATING_THRESHOLD)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.users_df['id_rating'] = list(zip(self.users_df['anime_id'], self.users_df['rating']))


CPU times: user 1min 7s, sys: 4.32 s, total: 1min 11s
Wall time: 1min 12s


129988

In [110]:
%%time

#model.grouped_df =  pd.read_csv("data3.tsv", sep="\t", header=None, names=["user_id", "id_rating"])
model.grouped_df

CPU times: user 11 µs, sys: 1 µs, total: 12 µs
Wall time: 17.9 µs


Unnamed: 0,user_id,id_rating
0,0,67 67 242 242 242 21 21 21 24 24 4722 3125 3125 481 481 481 356 356 121 121 430 430 1571 1571 1571 578 578 578 431 2762 2762 3418 3418 2034 164 459 459 419 199 2236 2236 2236 415 415 415 269 269 235 235 235
1,1,37403 40852 40852 40052 40748 40748 21 21 42203 42203 40028 40028 40028 481 9919 5081 31043 31043 31964 31964 33486 36456 38408 28755 1575 1575 1575 1535 1535 18689 38731 30230 38691 38691 5114 5114 5114 20583 28891 32935 38883 40776 40776 37259 11061 11061 37497 37999 40221 37965 37965 38000 38...
2,2,235 235 235 5042 7593 21 21 22 22 35028 368 31964 33486 33486 1575 1575 2904 2904 2904 1535 1535 1535 28223 226 38671 15 15 71 7661 263 263 263 270 270 24703 15451 15451 11061 11061 11061 14719 14719 20899 20899 26055 26055 34933 37086 1604 1604 22535 189 29575 10620 32182 32182 23755 23755 20 2...
3,3,6114 199 199 849 33352 33352 33352 31646 31646 32998 1292 34881 47 47 32828 22147 6547 9989 11433 11433 11111 9919 33506 7647 66 66 2251 2251 5081 7674 10030 10030 22789 10396 13535 889 889 1519 1519 32086 14967 14345 1689 28999 14741 2167 1575 2904 1 1 34103 32189 33028 16592 11843 33988 24031 ...
4,4,101 656 3549 104 150 150 819 1222 105 1542 1689 1689 1689 232 232 372 372 1575 1575 1575 2904 2904 2904 1535 1535 1535 345 2035 2035 356 120 120 71 123 123 123 74 1571 1571 16 16 1142 1142 4551 4551 5630 3731 3731 5680 145 1691 6045 6045 6045 1579 1579 435 1563 1563 334 585 585 2926 3655 1698 44...
...,...,...
114804,129984,34572 34572 34572 34566 34566 34566 40530 40530 40750 40750 40750 42203 42203 42203 40028 40028 40028 31646 35180 35180 35180 38101 38101 5081 5081 10030 31043 33486 38408 35608 35608 35608 2167 1535 32901 5114 17895 17895 17895 10793 10793 28891 32935 32935 38883 40776 40776 36296 36296 5680 37...
114805,129985,35849 35849 35849 21 21 6408 31646 11759 13939 12291 12291 17082 30123 30123 25013 25013 30370 5112 5112 27655 8676 8676 8676 9117 9117 11235 11235 12669 12669 9925 9925 9925 6547 6547 6547 9062 9062 9989 9989 9989 15039 24833 24833 24833 30654 30654 30654 9919 11737 11737 11266 11266 10647 2824...
114806,129986,31173 25013 25013 30370 30370 6547 6547 9062 9062 10067 10067 35015 24833 30654 30654 21995 21995 21995 24151 24151 34636 34618 31964 31964 33486 33486 36456 36896 40465 2167 2167 4181 4181 4181 4059 4059 4059 1535 1535 28223 33988 18507 22265 26213 2890 27633 918 918 15335 15335 9969 9969 15417...
114807,129987,21 31646 47 9989 28805 28805 5081 5081 12115 4901 1222 1689 1818 1575 2904 1 1 1 5 17121 1535 6746 27833 27833 23199 23199 27831 2759 3784 3784 3785 10087 11741 227 10793 21855 270 777 777 578 578 10408 10408 468 16664 16664 32491 32491 2593 572 33 33 32281 486 486 2175 1379 1379 3466 22535 9260...


In [111]:
#model.choose(20000)
model.grouped_df

Unnamed: 0,user_id,id_rating
0,0,67 67 242 242 242 21 21 21 24 24 4722 3125 3125 481 481 481 356 356 121 121 430 430 1571 1571 1571 578 578 578 431 2762 2762 3418 3418 2034 164 459 459 419 199 2236 2236 2236 415 415 415 269 269 235 235 235
1,1,37403 40852 40852 40052 40748 40748 21 21 42203 42203 40028 40028 40028 481 9919 5081 31043 31043 31964 31964 33486 36456 38408 28755 1575 1575 1575 1535 1535 18689 38731 30230 38691 38691 5114 5114 5114 20583 28891 32935 38883 40776 40776 37259 11061 11061 37497 37999 40221 37965 37965 38000 38...
2,2,235 235 235 5042 7593 21 21 22 22 35028 368 31964 33486 33486 1575 1575 2904 2904 2904 1535 1535 1535 28223 226 38671 15 15 71 7661 263 263 263 270 270 24703 15451 15451 11061 11061 11061 14719 14719 20899 20899 26055 26055 34933 37086 1604 1604 22535 189 29575 10620 32182 32182 23755 23755 20 2...
3,3,6114 199 199 849 33352 33352 33352 31646 31646 32998 1292 34881 47 47 32828 22147 6547 9989 11433 11433 11111 9919 33506 7647 66 66 2251 2251 5081 7674 10030 10030 22789 10396 13535 889 889 1519 1519 32086 14967 14345 1689 28999 14741 2167 1575 2904 1 1 34103 32189 33028 16592 11843 33988 24031 ...
4,4,101 656 3549 104 150 150 819 1222 105 1542 1689 1689 1689 232 232 372 372 1575 1575 1575 2904 2904 2904 1535 1535 1535 345 2035 2035 356 120 120 71 123 123 123 74 1571 1571 16 16 1142 1142 4551 4551 5630 3731 3731 5680 145 1691 6045 6045 6045 1579 1579 435 1563 1563 334 585 585 2926 3655 1698 44...
...,...,...
114804,129984,34572 34572 34572 34566 34566 34566 40530 40530 40750 40750 40750 42203 42203 42203 40028 40028 40028 31646 35180 35180 35180 38101 38101 5081 5081 10030 31043 33486 38408 35608 35608 35608 2167 1535 32901 5114 17895 17895 17895 10793 10793 28891 32935 32935 38883 40776 40776 36296 36296 5680 37...
114805,129985,35849 35849 35849 21 21 6408 31646 11759 13939 12291 12291 17082 30123 30123 25013 25013 30370 5112 5112 27655 8676 8676 8676 9117 9117 11235 11235 12669 12669 9925 9925 9925 6547 6547 6547 9062 9062 9989 9989 9989 15039 24833 24833 24833 30654 30654 30654 9919 11737 11737 11266 11266 10647 2824...
114806,129986,31173 25013 25013 30370 30370 6547 6547 9062 9062 10067 10067 35015 24833 30654 30654 21995 21995 21995 24151 24151 34636 34618 31964 31964 33486 33486 36456 36896 40465 2167 2167 4181 4181 4181 4059 4059 4059 1535 1535 28223 33988 18507 22265 26213 2890 27633 918 918 15335 15335 9969 9969 15417...
114807,129987,21 31646 47 9989 28805 28805 5081 5081 12115 4901 1222 1689 1818 1575 2904 1 1 1 5 17121 1535 6746 27833 27833 23199 23199 27831 2759 3784 3784 3785 10087 11741 227 10793 21855 270 777 777 578 578 10408 10408 468 16664 16664 32491 32491 2593 572 33 33 32281 486 486 2175 1379 1379 3466 22535 9260...


In [112]:
%%time
# constructing the tsv data

tsv_file_name = "data2.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 3.13 s, sys: 243 ms, total: 3.37 s
Wall time: 3.61 s


In [113]:
%%time

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train()

[0m[38;5;8m[[0m2024-02-07T17:50:03Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 128,
    max_number_of_iteration: 64,
    seed: None,
    prepend_field: true,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "data2.tsv",
    ],
    file_type: Tsv,
    output_dir: Some(
        "results",
    ),
    output_format: Numpy,
    relation_name: "emb",
    columns: [
        Column {
            name: "user",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "anime",
            transient: false,
            complex: true,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-02-07T17:50:03Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipeli

CPU times: user 127 ms, sys: 23.2 ms, total: 150 ms
Wall time: 2min 28s


[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 63. Dims: 128, entities: 130150, num data points: 24062056.
[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.
[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-02-07T17:52:31Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 148 sec


In [105]:
class RatingGenerator:
    def __init__(self):
        self.users_count  = None
        self.columns      = ["user", "anime"]
        self.rankings     = dict()

    def anime_label(idx: int) -> str:
        return f"anime__{idx}"

    def get_artifacts(self):
        d = dict()
        p = "results/emb__"
        files  = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]
    
        return { f:f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}" 
                    for idx, f in enumerate(files) }

    def load_artifacts(self):
        artifacts = self.get_artifacts()
        with open(artifacts['labels'], "r") as entities:
            self.labels     = json.load(entities)
        # Load results to numpy
        self.vects_iter     = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = self.labels.index(f"anime__{idx}")

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1), self.vects_iter, dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = ranking[:15]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[f"anime__{idx}"]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):
        
        self.load_artifacts()
        custom_ranking = dict()
        
        for idx in tqdm(already_watched):
            if f"anime__{idx}" not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(), reverse=True, key=lambda x:x[1]))

In [106]:
anime_df = Data_factory.from_csv(two_mode_data)

In [107]:
# Ranking


ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242])
print(ranking)
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

{7151: 1, 13083: 1, 9226: 1, 9231: 1, 9235: 1, 9236: 1, 2140: 1, 9219: 1, 5450: 1, 9237: 1, 5442: 1, 5438: 1, 2154: 1, 5432: 1, 13021: 1, 0: 1, 8430: 1, 2401: 1, 2400: 1, 2380: 1, 8458: 1, 8460: 1, 2376: 1, 8462: 1, 12032: 1, 8428: 1, 12031: 1, 12025: 1, 8500: 1, 2339: 1, 3054: 1, 5210: 1, 7296: 1, 3380: 1, 3827: 1, 8774: 1, 12110: 1, 3352: 1, 11252: 1, 8070: 1, 1646: 1, 13416: 1, 2564: 1, 1813: 1, 8829: 1}
Fushigi na Kusuri: 1
Aniyome: 1
Tekkon Kinkreet: 1
GS Mikami: 1
Tokubetsu Jugyou 2: 1
Mirai Nikki: 1
Shusaku Liberty: 1
Kingdom: 1
Haguregumo: 1
Youjuu Kyoushitsu Gaiden: 1
La Seine no Hoshi: 1
Houkago 2 The Animation: 1
Michi (2004): 1
Hana Dorei: 1
Ore wa Teppei: 1
3-tsu no Hanashi: 1
Tengen Toppa Gurren Lagann: Mitee Mono wa Miteen da!!: 1
Code-E: 1
Genshiken OVA: 1
Shounen Ashibe (TV): 1


In [100]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_by_name(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

find_by_name("ghibli")

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
142,164,Mononoke Hime,Princess Mononoke,もののけ姫,8.67,"Action, Adventure, Award Winning, Fantasy","When an Emishi village is attacked by a fierce demon boar, the young prince Ashitaka puts his life at stake to defend his tribe. With its dying breath, the beast curses the prince's arm, granting him demonic powers while gradually siphoning his life away. Instructed by the village elders to trav...",Movie,1.0,"Jul 12, 1997",...,Studio Ghibli,Original,2 hr 13 min,PG-13 - Teens 13 or older,58.0,105,20632,769154.0,1190601,https://cdn.myanimelist.net/images/anime/7/75919.jpg
176,199,Sen to Chihiro no Kamikakushi,Spirited Away,千と千尋の神隠し,8.78,"Adventure, Award Winning, Supernatural","Stubborn, spoiled, and naïve, 10-year-old Chihiro Ogino is less than pleased when she and her parents discover an abandoned amusement park on the way to their new house. Cautiously venturing inside, she realizes that there is more to this place than meets the eye, as strange things begin to happ...",Movie,1.0,"Jul 20, 2001",...,Studio Ghibli,Original,2 hr 4 min,PG - Children,36.0,44,31139,1234780.0,1760676,https://cdn.myanimelist.net/images/anime/6/79597.jpg
388,415,Tonari no Yamada-kun,My Neighbors the Yamadas,ホーホケキョ となりの山田くん,7.12,"Award Winning, Comedy, Slice of Life","From squabbles about groceries against the ambient noise of the TV to their daily shenanigans, the Yamada family invites only a few unusual gazes—similar to any typical household. But despite their prickly exteriors, each of them genuinely cares for one another, making sure to smile and laugh th...",Movie,1.0,"Jul 17, 1999",...,Studio Ghibli,4-koma manga,1 hr 43 min,G - All Ages,3448.0,3050,103,24750.0,45565,https://cdn.myanimelist.net/images/anime/6/64545.jpg
389,416,Kurenai no Buta,Porco Rosso,紅の豚,7.95,"Action, Adventure, Award Winning, Comedy, Drama, Romance","After a curse turned him into a pig, World War I ace Marco Pagot becomes Porco Rosso, a mysterious bounty hunter who takes down sky pirates in the Adriatic Sea. He whiles away his days on a secluded island, rarely leaving other than to collect bounties or to visit the beautiful Gina, a songstres...",Movie,1.0,"Jul 18, 1992",...,Studio Ghibli,Manga,1 hr 33 min,G - All Ages,648.0,956,1938,147809.0,234087,https://cdn.myanimelist.net/images/anime/6/2878.jpg
404,431,Howl no Ugoku Shiro,Howl's Moving Castle,ハウルの動く城,8.66,"Adventure, Award Winning, Drama, Fantasy, Romance","That jumbled piece of architecture, that cacophony of hissing steam and creaking joints, with smoke billowing from it as it moves on its own... That castle is home to the magnificent wizard Howl, infamous for both his magical prowess and for being a womanizer—or so the rumor goes in Sophie Hatte...",Movie,1.0,"Nov 20, 2004",...,Studio Ghibli,Novel,1 hr 59 min,G - All Ages,61.0,97,26858,866696.0,1253703,https://cdn.myanimelist.net/images/anime/5/75810.jpg
479,512,Majo no Takkyuubin,Kiki's Delivery Service,魔女の宅急便,8.22,"Adventure, Award Winning, Comedy, Drama, Fantasy","Kiki, a 13-year-old witch-in-training, must spend a year living on her own in a distant town in order to become a full-fledged witch. Leaving her family and friends, Kiki undertakes this tradition when she flies out into the open world atop her broomstick with her black cat Jiji.\n\nAs she settl...",Movie,1.0,"Jul 29, 1989",...,Studio Ghibli,Novel,1 hr 43 min,G - All Ages,323.0,367,5357,356104.0,547722,https://cdn.myanimelist.net/images/anime/10/75916.jpg
480,513,Tenkuu no Shiro Laputa,Castle in the Sky,天空の城ラピュタ,8.26,"Adventure, Award Winning, Fantasy, Romance, Sci-Fi","In a world filled with planes and airships, Sheeta is a young girl who has been kidnapped by government agents who seek her mysterious crystal amulet. While trapped aboard an airship, she finds herself without hope—that is, until the ship is raided by pirates. Taking advantage of the ensuing con...",Movie,1.0,"Aug 2, 1986",...,Studio Ghibli,Original,2 hr 4 min,PG - Children,286.0,451,4017,289006.0,460201,https://cdn.myanimelist.net/images/anime/5/37799.jpg
490,523,Tonari no Totoro,My Neighbor Totoro,となりのトトロ,8.25,"Adventure, Award Winning, Supernatural","In 1950s Japan, Tatsuo Kusakabe relocates himself and his two daughters, Satsuki and Mei, to the countryside to be closer to their mother, who is hospitalized due to long-term illness. As the girls grow acquainted with rural life, Mei encounters a small, bunny-like creature in the yard one day. ...",Movie,1.0,"Apr 16, 1988",...,Studio Ghibli,Original,1 hr 26 min,G - All Ages,287.0,143,7978,658094.0,1009153,https://cdn.myanimelist.net/images/anime/4/75923.jpg
542,578,Hotaru no Haka,Grave of the Fireflies,火垂るの墓,8.51,Drama,"As World War II reaches its conclusion in 1945, Japan faces widespread destruction in the form of American bombings, devastating city after city. Hotaru no Haka, also known as Grave of the Fireflies, is the story of Seita and his sister Setsuko, two Japanese children whose lives are ravaged by t...",Movie,1.0,"Apr 16, 1988",...,Studio Ghibli,Novel,1 hr 28 min,PG-13 - Teens 13 or older,119.0,299,7606,379744.0,637561,https://cdn.myanimelist.net/images/anime/7/75808.jpg
548,585,Mimi wo Sumaseba,Whisper of the Heart,耳をすませば,8.22,"Drama, Romance","Shizuku Tsukishima is an energetic 14-year-old girl who enjoys reading and writing poetry in her free time. Glancing at the checkout cards of her books one evening, she notices that her library books are frequently checked out by a boy named Seiji Amasawa. Curiosity strikes Shizuku, and she deci...",Movie,1.0,"Jul 15, 1995",...,Studio Ghibli,Manga,1 hr 51 min,G - All Ages,325.0,647,4664,190624.0,337679,https://cdn.myanimelist.net/images/anime/1899/109259.jpg


In [3]:
def df_from_array(array):
    return pd.DataFrame(data=array[1:,1:], index=array[1:,0], columns=array[0,1:])

def read_data(file):
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [ np.array([i] + list(row)) for i, row in enumerate(reader, start=-1)]

def get_genre_list(genres: str):
  return list(map(lambda x: x.strip().lower(), genres.split(",")))

def filter_out(word, col):
  return lambda row: not (word.lower() in get_genre_list(row[col]))

def filter_in(word, col):
  return lambda row: (word.lower() in get_genre_list(row[col]))

def create_two_mode_df(data):
    sub_df = data[["anime_id", "English name", "Genres"]]
    result = [['id', 'source', 'target']]
    i=0
    for _, row in sub_df.iterrows():
        for genre in get_genre_list(row['Genres']):
            to_append = [i, row['English name'], genre]
            result.append(to_append)
            i+=1
    return df_from_array(np.array(result))

def create_weighted_matrix(two_mode_df):
    assert(two_mode_df.columns.tolist() == ['source', 'target'])
    source_uniq = two_mode_df['source'].unique()
    source_len = source_uniq.shape[0]
    source_dict = {k: v for v, k in enumerate(source_uniq)}
    array = np.zeros((source_len, source_len), dtype=int)

    for _, source1 in tqdm(two_mode_df.iterrows(), total=len(two_mode_df.index)):
        for _, source2 in two_mode_df.iterrows():
            if source1['target'] == source2['target']:
                array[source_dict[source1['source']], source_dict[source2['source']]] += 1
    return pd.DataFrame(data=array, columns=source_uniq, index=source_uniq)

def create_weighted_matrix2(two_mode_df):
    adj_df = pd.crosstab(two_mode_df.source, two_mode_df.target)
    adj_np = adj_df.to_numpy()
    adj_np2 = adj_np@adj_np.T
    return (adj_np2, adj_df.index)


def create_weighted_df(matrix_df):
    result=[['', 'source', 'target', 'weight']]
    c = 0
    for rowIndex, row in tqdm(matrix_df.iterrows(), total=len(matrix_df.index)): #iterate over rows
        for colIndex, value in row.items():
            weight = matrix_df.at[rowIndex, colIndex]
            if weight > 0:
                to_append = [str(c), rowIndex, colIndex, weight]
                result.append(to_append)
                c += 1
    return df_from_array(np.array(result))


In [4]:
raw_data = read_data(two_mode_data)
# raw_data = raw_data[:1000]

In [5]:
all_data = df_from_array(np.array(raw_data))

In [6]:
all_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
all_data = all_data[(all_data['Score'] != "UNKNOWN") & (all_data['Score'] != '')].astype({'Score': 'float'})
# all_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])
# all_data = all_data[all_data['Score'] > 7.0]
important_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])

In [8]:
movies = important_data[important_data['Type'] == "Movie"]
TVs = important_data[important_data['Type'] == "TV"]
specials = important_data[important_data['Type'] == "Special"]

specified = important_data

not_hentai = specified[specified.apply(filter_out("Hentai", 'Genres'), axis=1)]
hentai = specified[specified.apply(filter_in("Hentai", 'Genres'), axis=1)]

mappa = not_hentai[not_hentai.apply(filter_in("Mappa", "Studios"), axis=1)]

In [9]:
# not_hentai.sort_values(['Score'], ascending=False)
print(len(hentai))
print(len(not_hentai))

1465
14227


In [10]:
data = all_data[["anime_id", "English name", "Genres", "Score"]]
data = data[(data['Score'] != "UNKNOWN") & (data['Score'] != '')].astype({'Score': 'float'})

In [11]:
data.dtypes

mapped = map(get_genre_list, data["Genres"].to_numpy().flatten().tolist())

uniq = np.unique(list(chain.from_iterable(mapped))).tolist()

print(len(uniq), uniq)

22 ['action', 'adventure', 'avant garde', 'award winning', 'boys love', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girls love', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'slice of life', 'sports', 'supernatural', 'suspense', 'unknown']


In [12]:
two_mode_df = create_two_mode_df(data)

two_mode_df

Unnamed: 0,source,target
0,Cowboy Bebop,action
1,Cowboy Bebop,award winning
2,Cowboy Bebop,sci-fi
3,Cowboy Bebop: The Movie,action
4,Cowboy Bebop: The Movie,sci-fi
...,...,...
30955,One Piece: Recapping Fierce Fights! The Counte...,action
30956,One Piece: Recapping Fierce Fights! The Counte...,adventure
30957,One Piece: Recapping Fierce Fights! The Counte...,comedy
30958,One Piece: Recapping Fierce Fights! The Counte...,fantasy


In [13]:
(weighted_matrix_np, weighted_matrix_index) = create_weighted_matrix2(two_mode_df)

In [14]:
weighted_matrix_df2 = pd.DataFrame(data=weighted_matrix_np, index=weighted_matrix_index, columns=weighted_matrix_index)


In [15]:
weighted_matrix_df2

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [16]:
weighted_matrix_df_pickle_file = "weighted_matrix_df.df"
if os.path.exists(weighted_matrix_df_pickle_file):
    weighted_matrix_df = pd.read_pickle(weighted_matrix_df_pickle_file)
else:
    weighted_matrix_df = weighted_matrix_df2
    weighted_matrix_df.to_pickle(weighted_matrix_df_pickle_file)

In [17]:
weighted_matrix_df

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [18]:
# weighted_df_pickle_file = "weighted_df.df"
# if os.path.exists(weighted_df_pickle_file):
#     weighted_df = pd.read_pickle(weighted_df_pickle_file)
# else:
#     weighted_df = create_weighted_df(weighted_matrix_df)
#     weighted_df.to_pickle(weighted_df_pickle_file)

In [19]:
# weighted_df

In [20]:
# weighted_df.columns

In [21]:
# for i in [3, 2, 1]:
# G = nx.from_pandas_edgelist(
#         weighted_df, source='source',
#         target='target', edge_attr='weight')

plt.figure(figsize=(35,35))

G = nx.from_numpy_array(weighted_matrix_np)
graph_pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, graph_pos, node_size=10, node_color='blue', alpha=0.3)
nx.draw_networkx_edges(G, graph_pos)
nx.draw_networkx_labels(G, graph_pos, font_size=8, font_family='sans-serif')

# plt.show()

# nx.draw(G)

KeyboardInterrupt: 

In [None]:
plt.savefig("plot.svg", dpi=1200)

<Figure size 640x480 with 0 Axes>

In [None]:
net = Network(notebook=True)
net.from_nx(G)
net.show("example.html")