In [55]:
# Standard IPython notebook imports
%matplotlib inline

import os
import csv
import math
import ast
import json
import shutil

import pyarrow
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn.metrics

import subprocess

from tqdm.notebook import tqdm
from itertools import chain
from typing import Optional

pd.options.display.max_colwidth = 300

In [56]:
USERS_CSV="dataset/user-filtered.csv"
ANIME_CSV="dataset/anime-dataset-2023.csv"

In [57]:
def load_pd(filepath, line_count: int | None):
    return pd.read_csv(filepath, sep=',', nrows=line_count)

In [None]:
%%time
LINES = None

users_df = load_pd(USERS_CSV, LINES)
animes_df = load_pd(ANIME_CSV, None)

# is user data loaded?
len(users_df)
len(animes_df)

In [None]:
class AnimeRecomendation:
    def __init__(self, users_df=pd.DataFrame(), animes_df=pd.DataFrame()):
        self.users_df = users_df
        self.animes_df = animes_df
        self.users_count = None
        self.columns = ['user', 'anime']
        self.grouped_columns = ['user_id', 'anime_id']
        self.tsv_filename = None
        self.popularity_threshold = 6000

    def change_data(self, users_df, animes_df):
        self.users_df = users_df
        self.animes_df = animes_df

        return self.number_of_users()

    def from_csv(self, file: str):
        self.tsv_filename = file
        self.grouped_df = pd.read_csv(file,
                                      sep="\t",
                                      header=None,
                                      names=self.grouped_columns)

    def choose(self, nousers: int):
        if nousers <= self.number_of_users():
            self.users_df = self.users_df.sample(n=nousers)

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def fit(self,
            rating_threshold=6):

        def adj_mult(rating: int) -> int:
            match rating:
                case 6 | 7: return 1
                case 8 | 9: return 2
                case 10: return 3
                case _: return 0

        def agg_fun(anime: str, rating: int) -> str:
            return " ".join([str(anime)] * adj_mult(rating))

        self.animes_df = self.animes_df[(self.animes_df['Popularity'] <= self.popularity_threshold) &
                                        (self.animes_df['Popularity'] > 0)]

        self.users_df = self.users_df[~self.users_df['anime_id'].isin(self.animes_df['anime_id'])]
        
        self.users_df = (self.users_df[self.users_df.rating >= rating_threshold]
            .assign(id_rating=lambda x: list(zip(x['anime_id'], x['rating'])))
            .groupby('user_id')['id_rating']
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime, rating) in animes]))
            .reset_index()
            .rename(columns={'id_rating': 'anime_id'}))

        self.animes_df = (self.animes_df
            .assign(genre=lambda x: x['Genres'].apply(lambda x: x.split(",")))
            .filter(items=['anime_id', 'genre'])
            .explode('genre')
            .groupby('genre')
            .agg(lambda genres: " ".join([str(i) for i in genres]))
            .reset_index()
            .query('genre != "UNKNOWN"')
            .rename(columns={'genre': 'user_id'}))

        self.grouped_df = pd.concat([self.users_df, self.animes_df], ignore_index=True, sort=False)

        return self.number_of_users()

    def save_to_tsv(self, tsv_filename: str):
        self.tsv_filename = tsv_filename
        self.grouped_df.to_csv(self.tsv_filename,
                               index=False,
                               sep='\t',
                               columns=self.grouped_columns,
                               mode='w',
                               header=False)

    def cleora_train(self, cleora_exe="./cleora-exe", dimensions=32, iter=16):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError(f"cleora executable not found: {cleora_exe}")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(dimensions),
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)

In [None]:
model = AnimeRecomendation(users_df=users_df, animes_df=animes_df)

In [None]:
CHOOSE = None

if CHOOSE is not None:
    model.choose(CHOOSE)

In [69]:
%%time
RATING_THRESHOLD = 6

model.fit(rating_threshold=RATING_THRESHOLD)

CPU times: user 330 ms, sys: 3.3 ms, total: 333 ms
Wall time: 335 ms


6807

In [70]:
model.grouped_df

Unnamed: 0,user_id,anime_id
0,0,3125 3125 2913 3418 3418 4086 2543
1,2,31740
2,3,38938 38938
3,4,1542 1542 179 2704 3196 2032
4,5,3186 3186 3186
...,...,...
2695,Sci-Fi,141 166 406 864 975 976 1126 1215 1290 1361 1631 1780 1916 1917 1985 2106 2604 3167 3268 3287 4037 4232 4360 5675 5945 6802 7465 8197 10999 12149 13429 14373 14817 14941 17187 19191 19193 19195 21603 23279 25907 28211 28391 28625 29325 30913 32032 32188 32410 33069 33531 33797 34006 34240 35286 ...
2696,Slice of Life,273 2950 2953 5671 9938 10162 10884 11113 12119 13283 13333 15051 15061 15989 16123 16417 17549 17637 18411 19919 19953 20555 20745 20909 21267 21273 21405 21667 22789 22839 23135 23151 23623 24821 24855 24913 29511 29513 29787 30375 30705 31376 32093 32175 32491 32526 32547 32607 32673 32717 32...
2697,Sports,15 22 170 183 263 264 265 287 388 551 558 627 815 995 1190 1316 1614 1674 1764 1842 1859 1861 2112 2116 2117 2159 2498 2499 2752 3226 3791 4053 5028 5040 5231 5258 5751 6392 7655 7720 9032 9744 9890 10257 10507 10731 11371 11763 11771 11917 12069 12449 12875 13261 16894 16916 17249 17259 17819 1...
2698,Supernatural,972 1535 1587 2994 4010 8068 8728 10067 10683 12055 13163 16059 16762 17813 18881 20757 20801 22429 23405 33668 33865 34411 36654 38034 38971 39421 39495 39534 40280 41619 43814 48171 48405 48567


In [14]:
%%time
# constructing the tsv data

tsv_file_name = "data.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 198 ms, sys: 102 ms, total: 300 ms
Wall time: 310 ms


In [15]:
%%time

CLEORA_EXE = './cleora-exe'
DIMENSIONS = 64
ITER = 64

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train(cleora_exe=CLEORA_EXE, dimensions=DIMENSIONS, iter=ITER)

[0m[38;5;8m[[0m2024-02-10T16:55:01Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 64,
    max_number_of_iteration: 64,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "data.tsv",
    ],
    file_type: Tsv,
    output_dir: Some(
        "results",
    ),
    output_format: Numpy,
    relation_name: "emb",
    columns: [
        Column {
            name: "user",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "anime",
            transient: false,
            complex: true,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-02-10T16:55:01Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipelin

CPU times: user 25.1 ms, sys: 2.65 ms, total: 27.7 ms
Wall time: 3.41 s


[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 61. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 62. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 63. Dims: 64, entities: 26782, num data points: 2101610.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-02-10T16:55:04Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-02-10T16:55:

In [31]:
class RatingGenerator:
    def __init__(self):
        self.users_count = None
        self.columns = ["user", "anime"]
        self.rankings = dict()

    def get_artifacts(self):
        p = "results/emb__"
        files = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]

        return {f: f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}"
                for idx, f in enumerate(files)}

    def load_artifacts(self):
        artifacts = self.get_artifacts()
        with open(artifacts['labels'], "r") as entities:
            self.labels = np.array([int(i) for i in json.load(entities)])
        # Load results to numpy
        self.vects_iter = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = np.where(self.labels == idx)[0][0]

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1),
                                                          self.vects_iter,
                                                          dense_output=True)
        print((-dist[0]).argsort())
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = self.labels[ranking[:15]]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[idx]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):

        self.load_artifacts()
        custom_ranking = dict()

        for idx in tqdm(already_watched):
            if idx not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(),
                           reverse=True,
                           key=lambda x: x[1]))

In [32]:
two_mode_data="dataset/anime-dataset-2023.csv"
anime_df = pd.read_csv(two_mode_data, sep=",")

In [33]:
# Ranking

ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242])
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

[ 7547  6065  3151 ... 22756 22765 26781]
[11428  2516  2518 ... 22756 22765 26781]
[ 4441  5349  6592 ... 22755 22764 26781]
TWO-MIX: White Reflection: 2
Kentauros no Densetsu: 1
Freezing Vibration Specials: 1
Dragon Ball Specials: 1
Kaiba: 1
Rozen Maiden: Träumend: 1
Hibike! Euphonium Movie: Photo Session e Youkoso: 1
Promised Town: 1
Terra Formars: Bugs 2-hen: 1
Gangsta. Recap: 1
Hataraku Saibou CM: 1
Shinryaku! Ika Musume: Ika Ice Tabena-ika?: 1
Ookami Kodomo no Ame to Yuki: 1
Hakuouki: Hekketsuroku: 1
Trigun: Badlands Rumble: 1
Shishigari: 1
Trinity Seven: Nanatsu no Taizai to Nana Madoushi: 1
Horimiya: 1
Bikkuriman: Daiichiji Seima Taisen: 1
Time-Patrol Bon: Fujiko F. Fujio Anime Special - SF Adventure: 1
Shounen Ninja Kaze no Fujimaru: 1
Juushin Enbu: Hero Tales: 1
Taegeugsonyeon Huin Dogsuli: 1
Gongnyong Baengmannyeon Ttori: 1
Jing Ma Zhanshi: 1
Bikkuriman: Moen Zone no Himitsu: 1
Sengoku Bushou Retsuden Bakufuu Douji Hissatsuman: 1
Miraesonyeon Kunta Beomyuda 5000 Nyeon: 1
Hwa

In [315]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_in_rows(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

def find_by_name(name):
    return anime_df[anime_df['Name'].str.lower().str.contains(name.lower(), na=False)]
    
find_by_name("Shingeki no Kyojin").head(10).anime_id

7428     16498
7857     18397
8030     19285
8046     19391
8988     23775
8989     23777
9352     25777
9353     25781
13176    35760
13349    36106
Name: anime_id, dtype: int64