In [5]:
# Standard IPython notebook imports
%matplotlib inline

import os
import csv
import math
import ast
import json
import shutil

import pyarrow
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn.metrics

import subprocess

from tqdm.notebook import tqdm
from itertools import chain
from typing import Optional

pd.options.display.max_colwidth = 300

In [6]:
USERS_CSV="dataset/user-filtered.csv"
ANIME_CSV="dataset/anime-dataset-2023.csv"

In [7]:
def load_pd(filepath, line_count: int | None):
    return pd.read_csv(filepath, sep=',', nrows=line_count)

In [8]:
%%time
LINES = 2000000

users_df = load_pd(USERS_CSV, LINES)
animes_df = load_pd(ANIME_CSV, None)

# is user data loaded?
len(users_df)
len(animes_df)

CPU times: user 532 ms, sys: 60.3 ms, total: 592 ms
Wall time: 594 ms


24905

In [9]:
class AnimeRecomendation:
    def __init__(self, users_df=pd.DataFrame(), animes_df=pd.DataFrame()):
        self.users_df = users_df
        self.animes_df = animes_df
        self.users_count = None
        self.columns = ['user', 'anime']
        self.grouped_columns = ['user_id', 'anime_id']
        self.tsv_filename = None

    def change_data(self, users_df, animes_df):
        self.users_df = users_df
        self.animes_df = animes_df

        return self.number_of_users()

    def from_csv(self, file: str):
        self.tsv_filename = file
        self.grouped_df = pd.read_csv(file,
                                      sep="\t",
                                      header=None,
                                      names=self.grouped_columns)

    def choose(self, nousers: int):
        if nousers <= self.number_of_users():
            self.users_df = self.users_df.sample(n=nousers)

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def fit(self,
            rating_threshold=6):

        def adj_mult(rating: int) -> int:
            match rating:
                case 6 | 7: return 1
                case 8 | 9: return 2
                case 10: return 3
                case _: return 0

        def agg_fun(anime: str, rating: int) -> str:
            return " ".join([str(anime)] * adj_mult(rating))

        self.users_df = (self.users_df[self.users_df.rating >= rating_threshold]
            .assign(id_rating=lambda x: list(zip(x['anime_id'], x['rating'])))
            .groupby('user_id')['id_rating']
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime, rating) in animes]))
            .reset_index()
            .rename(columns={'id_rating': 'anime_id'}))

        self.animes_df = (self.animes_df
            .assign(genre=lambda x: x['Genres'].apply(lambda x: x.split(",")))
            .filter(items=['anime_id', 'genre'])
            .explode('genre')
            .groupby('genre')
            .agg(lambda genres: " ".join([str(i) for i in genres]))
            .reset_index()
            .query('genre != "UNKNOWN"')
            .rename(columns={'genre': 'user_id'}))

        self.grouped_df = pd.concat([self.users_df, self.animes_df], ignore_index=True, sort=False)

        return self.number_of_users()

    def save_to_tsv(self, tsv_filename: str):
        self.tsv_filename = tsv_filename
        self.grouped_df.to_csv(self.tsv_filename,
                               index=False,
                               sep='\t',
                               columns=self.grouped_columns,
                               mode='w',
                               header=False)

    def cleora_train(self, cleora_exe="./cleora-exe", dimensions=32, iter=16):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError(f"cleora executable not found: {cleora_exe}")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(dimensions),
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)

In [10]:
model = AnimeRecomendation(users_df=users_df, animes_df=animes_df)

In [11]:
CHOOSE = None

if CHOOSE is not None:
    model.choose(CHOOSE)

In [12]:
%%time
RATING_THRESHOLD = 6

model.fit(rating_threshold=RATING_THRESHOLD)

CPU times: user 1.69 s, sys: 52.6 ms, total: 1.75 s
Wall time: 1.75 s


6810

In [13]:
model.grouped_df

Unnamed: 0,user_id,anime_id
0,0,67 67 6702 242 242 242 21 21 21 24 24 4722 4722 6098 3125 3125 481 481 481 68 1689 2913 1250 356 356 121 121 430 430 1829 1571 1571 1571 578 578 578 431 431 2762 2762 570 3418 3418 3010 433 600 2034 2034 164 164 4086 2248 1047 459 459 466 2543 419 419 199 199 169 2547 2236 2236 2236 415 415 415 ...
1,1,37403 37403 7674 34566 40852 40852 10087 40052 40052 40748 40748 21 21 26243 42203 42203 40028 40028 40028 3972 481 481 22199 6547 9919 9919 5081 5081 31043 31043 31964 31964 33486 33486 36456 36456 38408 38408 28755 28755 1575 1575 1575 35849 1535 1535 18689 18689 38731 38731 30230 30230 38691 ...
2,2,235 235 235 5042 5042 7593 7593 21 21 22 22 5762 31580 35028 35028 368 368 31964 31964 33486 33486 31740 1575 1575 2904 2904 2904 1535 1535 1535 28223 28223 226 226 38671 38671 32872 15 15 71 71 7661 7661 263 263 263 270 270 24703 24703 15451 15451 8074 11061 11061 11061 28961 34542 14719 14719 ...
3,3,6114 6114 199 199 849 849 33352 33352 33352 31646 31646 32998 32998 25397 12291 1292 1292 34881 34881 32323 22199 4744 47 47 22729 32828 32828 22147 22147 6547 6547 9989 9989 11433 11433 11111 11111 9919 9919 33506 33506 7647 7647 6166 4999 66 66 7817 2251 2251 5081 5081 7674 7674 10030 10030 22...
4,4,101 101 656 656 3549 3549 3359 104 104 530 68 150 150 819 819 1222 1222 105 105 1542 1542 1689 1689 1689 232 232 371 372 372 4975 831 59 1575 1575 1575 2904 2904 2904 61 1535 1535 1535 345 345 2035 2035 356 356 120 120 71 71 123 123 123 380 381 124 4725 74 74 127 1571 1571 476 16 16 1142 1142 64...
...,...,...
6044,Sci-Fi,141 166 406 611 832 848 864 975 976 1126 1215 1290 1361 1409 1480 1631 1673 1712 1780 1812 1916 1917 1975 1985 1986 1993 1995 2028 2106 2157 2215 2310 2410 2455 2551 2604 2755 2758 3079 3167 3190 3252 3256 3268 3287 3305 3433 3463 3490 3635 3684 3772 3817 3818 3820 3821 3880 3882 3883 3890 4037 ...
6045,Slice of Life,273 1804 1926 1982 2950 2953 3691 4091 4991 5299 5671 6150 6195 6209 6491 6688 6935 7518 7547 7709 7955 8178 8518 8603 8626 8661 8669 8671 8719 8729 9119 9344 9423 9689 9777 9886 9938 9955 9957 10162 10407 10511 10804 10832 10859 10884 10905 11113 12119 12981 13177 13179 13283 13333 13373 13375 ...
6046,Sports,15 22 170 183 263 264 265 287 388 551 558 627 815 995 1190 1209 1316 1317 1512 1614 1674 1764 1842 1859 1861 2112 2116 2117 2118 2119 2120 2121 2122 2159 2316 2498 2499 2703 2752 3012 3053 3111 3131 3226 3438 3791 3870 3874 4053 4062 4298 5028 5040 5231 5258 5273 5589 5674 5751 5826 5996 6392 64...
6047,Supernatural,972 1535 1587 1765 2189 2340 2366 2994 3302 3584 3681 3747 3802 3930 4010 4379 5049 5653 6184 6730 6831 6832 7170 7495 7531 8068 8685 8728 9951 10067 10101 10286 10683 10770 11031 11447 12055 13163 14853 15891 16059 16762 17245 17813 18653 18881 20757 20801 22429 22641 23065 23405 23639 24109 24...


In [14]:
%%time
# constructing the tsv data

tsv_file_name = "data.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 198 ms, sys: 102 ms, total: 300 ms
Wall time: 310 ms


In [15]:
%%time

CLEORA_EXE = './cleora-exe'
DIMENSIONS = 64
ITER = 64

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train(cleora_exe=CLEORA_EXE, dimensions=DIMENSIONS, iter=ITER)

[0m[38;5;8m[[0m2024-02-10T16:55:01Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 64,
    max_number_of_iteration: 64,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "data.tsv",
    ],
    file_type: Tsv,
    output_dir: Some(
        "results",
    ),
    output_format: Numpy,
    relation_name: "emb",
    columns: [
        Column {
            name: "user",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "anime",
            transient: false,
            complex: true,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-02-10T16:55:01Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipelin

CPU times: user 25.1 ms, sys: 2.65 ms, total: 27.7 ms
Wall time: 3.41 s


[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 61. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 62. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 63. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-02-10T16:55:

In [16]:
class RatingGenerator:
    def __init__(self):
        self.users_count = None
        self.columns = ["user", "anime"]
        self.rankings = dict()

    def get_artifacts(self):
        p = "results/emb__"
        files = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]

        return {f: f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}"
                for idx, f in enumerate(files)}

    def load_artifacts(self):
        artifacts = self.get_artifacts()
        with open(artifacts['labels'], "r") as entities:
            self.labels = np.array([int(i) for i in json.load(entities)])
        # Load results to numpy
        self.vects_iter = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = np.where(self.labels == idx)[0][0]

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1),
                                                          self.vects_iter,
                                                          dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = self.labels[ranking[:15]]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[idx]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):

        self.load_artifacts()
        custom_ranking = dict()

        for idx in tqdm(already_watched):
            if idx not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(),
                           reverse=True,
                           key=lambda x: x[1]))

In [17]:
two_mode_data="dataset/anime-dataset-2023.csv"
anime_df = pd.read_csv(two_mode_data, sep=",")

In [18]:
# Ranking

ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242])
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

TWO-MIX: White Reflection: 2
Kentauros no Densetsu: 1
Freezing Vibration Specials: 1
Dragon Ball Specials: 1
Kaiba: 1
Rozen Maiden: Träumend: 1
Hibike! Euphonium Movie: Photo Session e Youkoso: 1
Promised Town: 1
Terra Formars: Bugs 2-hen: 1
Gangsta. Recap: 1
Hataraku Saibou CM: 1
Shinryaku! Ika Musume: Ika Ice Tabena-ika?: 1
Ookami Kodomo no Ame to Yuki: 1
Hakuouki: Hekketsuroku: 1
Trigun: Badlands Rumble: 1
Shishigari: 1
Trinity Seven: Nanatsu no Taizai to Nana Madoushi: 1
Horimiya: 1
Bikkuriman: Daiichiji Seima Taisen: 1
Time-Patrol Bon: Fujiko F. Fujio Anime Special - SF Adventure: 1
Shounen Ninja Kaze no Fujimaru: 1
Juushin Enbu: Hero Tales: 1
Taegeugsonyeon Huin Dogsuli: 1
Gongnyong Baengmannyeon Ttori: 1
Jing Ma Zhanshi: 1
Bikkuriman: Moen Zone no Himitsu: 1
Sengoku Bushou Retsuden Bakufuu Douji Hissatsuman: 1
Miraesonyeon Kunta Beomyuda 5000 Nyeon: 1
Hwanggeum Cheolin: 1
Kanon (2006): 1
Lupin III: Alcatraz Connection: 1
Black Clover: Jump Festa 2018 Special: 1
Deadman Wonderlan

In [315]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_in_rows(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

def find_by_name(name):
    return anime_df[anime_df['Name'].str.lower().str.contains(name.lower(), na=False)]
    
find_by_name("Shingeki no Kyojin").head(10).anime_id

7428     16498
7857     18397
8030     19285
8046     19391
8988     23775
8989     23777
9352     25777
9353     25781
13176    35760
13349    36106
Name: anime_id, dtype: int64