In [1]:
# Standard IPython notebook imports
%matplotlib inline

import os
import csv
import math
import ast
import json
import shutil

import pyarrow
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn.metrics

import subprocess

from tqdm.notebook import tqdm
from itertools import chain
from typing import Optional

pd.options.display.max_colwidth = 300

In [2]:
USERS_CSV="dataset/user-filtered.csv"
ANIME_CSV="dataset/anime-dataset-2023.csv"

In [3]:
def load_pd(filepath, line_count: int | None):
    return pd.read_csv(filepath, sep=',', nrows=line_count)

In [4]:
%%time
LINES = 10000000

users_df = load_pd(USERS_CSV, LINES)
animes_df = load_pd(ANIME_CSV, None)

# is user data loaded?
len(users_df)
len(animes_df)

CPU times: user 1.36 s, sys: 290 ms, total: 1.65 s
Wall time: 1.73 s


24905

In [207]:
class AnimeRecomendation:
    def __init__(self, users_df=pd.DataFrame(), animes_df=pd.DataFrame()):
        self.users_df = users_df
        self.animes_df = animes_df
        self.users_count = None
        self.columns = ['user', 'anime']
        self.grouped_columns = ['user_id', 'anime_id']
        self.tsv_filename = None
        self.popularity_threshold = 6000

    def change_data(self, users_df, animes_df):
        self.users_df = users_df
        self.animes_df = animes_df

        return self.number_of_users()

    def from_csv(self, file: str):
        self.tsv_filename = file
        self.grouped_df = pd.read_csv(file,
                                      sep="\t",
                                      header=None,
                                      names=self.grouped_columns)

    def choose(self, nousers: int):
        if nousers <= self.number_of_users():
            self.users_df = self.users_df.sample(n=nousers)

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def fit(self, rating_threshold=6):

        def adj_mult(rating: int) -> int:
            match rating:
                case 6 | 7: return 1
                case 8 | 9: return 2
                case 10: return 3
                case _: return 0

        def agg_fun(anime: str, rating: int) -> str:
            return " ".join([str(anime)] * adj_mult(rating))


        self.animes_df = self.animes_df[(self.animes_df['Popularity'] <= self.popularity_threshold) &
                                        (self.animes_df['Popularity'] > 0)]

        self.users_df = self.users_df[self.users_df['rating'] >= rating_threshold]
        self.users_df = self.users_df[self.users_df['anime_id'].isin(self.animes_df['anime_id'])]
        self.animes_df = self.animes_df[self.animes_df['anime_id'].isin(self.users_df['anime_id'])]

        self.users_df = (self.users_df
            .assign(id_rating=lambda x: list(zip(x['anime_id'], x['rating'])))
            .groupby('user_id')['id_rating']
            .agg(lambda animes: " ".join([agg_fun(anime, rating) for (anime, rating) in animes]))
            .reset_index()
            .rename(columns={'id_rating': 'anime_id'}))

        self.animes_df = (self.animes_df
            .assign(genre=lambda x: x['Genres'].apply(lambda x: list(map(str.strip, x.split(",")))))
            .filter(items=['anime_id', 'genre'])
            .explode('genre')
            .groupby('genre')
            .agg(lambda genres: " ".join([str(i) for i in genres]))
            .reset_index()
            .rename(columns={'genre': 'user_id'}))

        self.grouped_df = self.users_df

        return self.number_of_users()

    def save_to_tsv(self, tsv_filename: str):
        self.tsv_filename = tsv_filename
        self.grouped_df.to_csv(self.tsv_filename,
                               index=False,
                               sep='\t',
                               columns=self.grouped_columns,
                               mode='w',
                               header=False)
        self.animes_df.to_csv("animes.tsv",
                               index=False,
                               sep='\t',
                               columns=self.grouped_columns,
                               mode='w',
                               header=False)

    def cleora_train(self, cleora_exe="./cleora-exe", dimensions=32, iter=16):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        if not os.access(cleora_exe, os.X_OK) and shutil.which(cleora_exe) is None:
            raise RuntimeError(f"cleora executable not found: {cleora_exe}")

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(dimensions),
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)

        command = [cleora_exe,
                   "--type", "tsv",
                   f"--columns=transient::genre complex::anime",
                   "--dimension", "8",
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   "animes.tsv"]
        subprocess.run(command, check=True)

In [208]:
model = AnimeRecomendation(users_df=users_df, animes_df=animes_df)

In [209]:
%%time
RATING_THRESHOLD = 6

model.fit(rating_threshold=RATING_THRESHOLD)

CPU times: user 6.96 s, sys: 1.26 s, total: 8.22 s
Wall time: 8.32 s


32600

In [210]:
x = model.users_df['anime_id'].apply(lambda x: x.split(' ')).explode().unique()
x.sort()
y = model.animes_df['anime_id'].apply(lambda x: x.split(' ')).explode().unique()
y.sort()
np.all(x == y)


True

In [211]:
%%time
# constructing the tsv data

tsv_file_name = "data.tsv"
model.save_to_tsv(tsv_file_name)

CPU times: user 753 ms, sys: 67.2 ms, total: 820 ms
Wall time: 848 ms


In [212]:
%%time

CLEORA_EXE = './cleora-exe'
DIMENSIONS = 64
ITER = 64

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
model.cleora_train(cleora_exe=CLEORA_EXE, dimensions=DIMENSIONS, iter=ITER)

[0m[38;5;8m[[0m2024-02-11T17:11:00Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 64,
    max_number_of_iteration: 64,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "data.tsv",
    ],
    file_type: Tsv,
    output_dir: Some(
        "results",
    ),
    output_format: Numpy,
    relation_name: "emb",
    columns: [
        Column {
            name: "user",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "anime",
            transient: false,
            complex: true,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-02-11T17:11:00Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipelin

CPU times: user 179 ms, sys: 19.5 ms, total: 198 ms
Wall time: 30.7 s


[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 63. Dims: 64, entities: 34120, num data points: 9707338.
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 30 sec
[0m[38;5;8m[[0m2024-02-11T17:11:30Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 8,
    max_number_of_iteration: 64,
    seed:

In [248]:
class RatingGenerator:
    def __init__(self):
        self.users_count = None
        self.columns = ["user", "anime"]
        self.rankings = dict()
        self.anime_by_genre = self.get_anime_by_genre()

    def get_anime_by_genre(self):
        anime_df = pd.read_csv(ANIME_CSV)
        return (anime_df
            .assign(genre=lambda x: x['Genres'].apply(lambda x: list(map(str.strip, x.split(",")))))
            .filter(items=['anime_id', 'genre'])
            .explode('genre')
            .groupby('genre')
            .agg(list))

    def get_artifacts(self):
        p = "results/emb__"
        files = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]

        return {'users': {f: f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}"
                            for idx, f in enumerate(files)},
                'genres': {f: f"{p}genre__anime{suf[idx]}"
                            for idx, f in enumerate(files)}}

    def get_dist_to_genre(self, genre: str):
        dists = []
        for anime_id in self.labels:
            dists.append(genre in self.anime_df.loc[anime_id]['Genres'])

        return np.array(dists).astype(float)

    def load_artifacts(self):
        artifacts = self.get_artifacts()
        with open(artifacts['users']['labels'], "r") as entities:
            self.labels = np.array([int(i) for i in json.load(entities)])

        with open(artifacts['genres']['labels'], "r") as entities:
            self.glabels = glabels = np.array([int(i) for i in json.load(entities)])

        self.vects_iter = np.load(artifacts['users']['vects_iter'])
        self.gvects_iter = np.load(artifacts['genres']['vects_iter'])

        self.vects_iter = self.vects_iter[self.labels.argsort()]
        self.labels.sort()

        self.gvects_iter = self.gvects_iter[glabels.argsort()]
        glabels.sort()

        # assert np.all(self.labels == glabels)


    def load_rankings(self, idx: int, preffered_genres: list[str]):
        real_id = np.where(self.labels == idx)[0][0]

        uv = self.vects_iter[real_id]
        udist = sklearn.metrics.pairwise.cosine_similarity(uv.reshape(1, -1),
                                                          self.vects_iter,
                                                          dense_output=True)

        gv = self.gvects_iter[real_id]
        gdist = sklearn.metrics.pairwise.cosine_similarity(gv.reshape(1, -1),
                                                           self.gvects_iter,
                                                           dense_output=True)

        if len(preffered_genres) == 0:
            dists_to_preffered_genres = udist[0]
        else:
            dists_to_preffered_genres = np.mean(
                [self.get_dist_to_genre(genre) for genre in preffered_genres],
                axis=0)

        dist = 0.7 * udist[0] + 0.2 * gdist[0] + 0.1 * dists_to_preffered_genres

        ranking = (-dist).argsort()

        self.rankings[idx] = self.labels[ranking[:15]]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[idx]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched, preferred_genres: list[str]):

        self.load_artifacts()
        custom_ranking = dict()

        for idx in tqdm(already_watched):
            if idx not in self.rankings:
                self.load_rankings(idx, preferred_genres)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(),
                           reverse=True,
                           key=lambda x: x[1]))

In [249]:
rg = RatingGenerator()
rg.load_artifacts()
rg.glabels, rg.labels

(array([    1,     5,     6, ..., 47591, 47616, 48456]),
 array([    1,     5,     6, ..., 47591, 47616, 48456]))

In [263]:
rg.anime_by_genre.loc['Action']['anime_id']

[1,
 5,
 6,
 7,
 18,
 20,
 21,
 23,
 25,
 26,
 27,
 29,
 30,
 33,
 43,
 44,
 45,
 47,
 51,
 54,
 55,
 60,
 61,
 64,
 65,
 67,
 68,
 69,
 71,
 72,
 73,
 75,
 76,
 77,
 80,
 90,
 91,
 93,
 94,
 95,
 97,
 98,
 110,
 112,
 113,
 121,
 127,
 128,
 129,
 130,
 131,
 132,
 134,
 136,
 137,
 138,
 139,
 142,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 161,
 164,
 165,
 168,
 169,
 174,
 175,
 177,
 180,
 181,
 184,
 185,
 186,
 187,
 194,
 198,
 202,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 212,
 218,
 219,
 221,
 222,
 223,
 225,
 226,
 227,
 230,
 231,
 238,
 249,
 266,
 267,
 269,
 270,
 271,
 272,
 274,
 278,
 282,
 284,
 285,
 288,
 290,
 296,
 297,
 300,
 301,
 305,
 315,
 317,
 320,
 321,
 340,
 347,
 355,
 356,
 359,
 366,
 370,
 374,
 375,
 377,
 378,
 384,
 389,
 392,
 395,
 396,
 397,
 398,
 399,
 400,
 404,
 408,
 411,
 416,
 417,
 426,
 432,
 434,
 438,
 442,
 446,
 447,
 449,
 450,
 451,
 452,
 459,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 473,
 478,
 479,
 481,
 4

In [243]:
# Ranking

ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]

ranking = ratingGenerator.predict([67, 6702, 242], [])
for label, times in ranking.items():
    filter = animes_df[animes_df.anime_id == label]
    if len(filter) == 0:
        continue

    print(f"{pandas_extract_content(filter, 'Name')}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

Okusama wa Joshikousei (TV): 1
Koisuru Tenshi Angelique: Kagayaki no Ashita: 1
Tong Ling Fei: 1
Mahoutsukai no Yome: 1
Kodomo no Omocha (TV): 1
Akagami no Shirayuki-hime 2nd Season: 1
Romeo x Juliet: 1
Shinkyoku Soukai Polyphonica: 1
Sister Princess: 1
Bakuman. 2nd Season: 1
Basilisk: Kouga Ninpou Chou: 1
Nagi no Asu kara: 1
I My Me! Strawberry Eggs: 1
RDG: Red Data Girl: 1
Otome wa Boku ni Koishiteru: 1
Naruto: 1
King's Raid: Ishi wo Tsugumono-tachi: 1
Nanatsu no Taizai: Kamigami no Gekirin: 1
Boruto: Jump Festa 2016 Special: 1
Pokemon Generations: 1
Ta ga Tame no Alchemist: 1
Divine Gate: 1
Gate: Jieitai Kanochi nite, Kaku Tatakaeri Part 2: 1
Pokemon Best Wishes!: 1
Toaru Kagaku no Railgun: Misaka-san wa Ima Chuumoku no Mato desu kara: 1
Gunjou no Magmell: 1
Isekai Cheat Magician: 1
Macross Dynamite 7: 1
Street Fighter II Movie: 1
Toaru Majutsu no Index II: 1
Nodame Cantabile OVA: 1
Yuuki Yuuna wa Yuusha de Aru: Washio Sumi no Shou 1 - Tomodachi: 1
Acchi Kocchi: 1
Yuuki Yuuna wa Yuus

In [None]:
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

def search_str(s, search):
    return search in str(s).lower()

def find_in_rows(name):
    name = name.lower()
    mask = anime_df.apply(lambda x: x.map(lambda s: search_str(s, name)))
    return anime_df.loc[mask.any(axis=1)]

def find_by_name(name):
    return anime_df[anime_df['Name'].str.lower().str.contains(name.lower(), na=False)]
    
find_by_name("Shingeki no Kyojin").head(10).anime_id