In [None]:
# Standard IPython notebook imports
%matplotlib inline

import os
import csv
import math
import ast
import json
import shutil

import pyarrow
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn.metrics

import subprocess

from tqdm.notebook import tqdm
from itertools import chain
from typing import Optional

# Let pandas show up to 300 characters per cell before truncating the display.
pd.options.display.max_colwidth = 300

In [None]:
# Ratings export; AnimeRecomendation.fit reads its user_id, anime_id and
# rating columns. The file is comma separated, which is read_csv's default.
users_csv = "dataset/user-filtered.csv"
users_df = pd.read_csv(users_csv)

In [None]:
# Sanity check that the user data loaded. A notebook cell only renders its
# last expression, so the row count is printed explicitly and the preview is
# kept as the final expression to get the rich table display.
print(len(users_df))
users_df.head()

In [None]:
class AnimeRecomendation:
    """Turn per-user anime ratings into a Cleora input file and run Cleora on it.

    Typical flow: ``fit`` (or ``from_csv``) fills ``grouped_df`` with one row
    per user, ``choose`` optionally down-samples those rows, ``save_to_tsv``
    writes them to disk and ``cleora_train`` runs the external ``cleora``
    executable on the written file.
    """

    # How many times an anime id is repeated in a user's row for a given
    # rating. Ratings that are not listed contribute no copies at all.
    _RATING_WEIGHTS = {6: 1, 7: 1, 8: 2, 9: 2, 10: 3}

    def __init__(self):
        self.users_df = pd.DataFrame()
        # One row per user; set by fit() or from_csv().
        self.grouped_df = None
        self.users_count = None
        self.columns = ["user", "anime"]
        self.grouped_columns = ["user_id", "id_rating"]
        self.tsv_filename = None

    def _require_grouped(self):
        """Return ``grouped_df`` or raise RuntimeError if it was never built."""
        if self.grouped_df is None:
            raise RuntimeError("no grouped data yet: call fit() or from_csv() first")
        return self.grouped_df

    def number_of_users(self):
        """Return the largest ``user_id`` of the fitted ratings (cached).

        NOTE(review): despite the name this is the maximum id, not a count of
        distinct users - confirm ids are dense before using it as a count.

        Raises:
            KeyError: if no ratings with a ``user_id`` column were fitted.
        """
        if self.users_count is None:
            # Only the id column is needed; avoid reducing every column.
            self.users_count = self.users_df["user_id"].max()
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        """Write ``grouped_df`` to ``tsv_filename`` as a headerless TSV.

        The file name is remembered so that ``cleora_train`` can use it.

        Raises:
            RuntimeError: if neither ``fit`` nor ``from_csv`` was called yet.
        """
        grouped = self._require_grouped()
        self.tsv_filename = tsv_filename
        grouped.to_csv(self.tsv_filename,
                       index=False,
                       sep='\t',
                       columns=self.grouped_columns,
                       mode='w',
                       header=False)

    def fit(self,
            users_df,
            lines: Optional[int] = None,
            rating_threshold: int = 6):
        """Build ``grouped_df`` from a ratings frame.

        Args:
            users_df: DataFrame with ``user_id``, ``anime_id`` and ``rating``
                columns. It is not modified.
            lines: if given, only the first ``lines`` rows are used.
            rating_threshold: rows with a lower rating are dropped.

        Each remaining row contributes its anime id repeated according to
        ``_RATING_WEIGHTS``; rows whose weight is 0 contribute nothing, so no
        empty tokens or blank user rows are produced.

        Returns:
            The value of ``number_of_users()`` for the fitted data.
        """
        self.users_df = users_df.head(int(lines)) \
                        if lines is not None else users_df
        # The cached value belongs to the previously fitted data.
        self.users_count = None

        kept = self.users_df[self.users_df["rating"] >= rating_threshold]
        weights = [self._RATING_WEIGHTS.get(rating, 0) for rating in kept["rating"]]
        tokens = [" ".join([str(anime)] * weight)
                  for anime, weight in zip(kept["anime_id"], weights)]

        # Work on a fresh frame instead of adding a column to the caller's data.
        per_rating = pd.DataFrame({"user_id": kept["user_id"].to_numpy(),
                                   "id_rating": tokens})
        per_rating = per_rating[per_rating["id_rating"] != ""]

        self.grouped_df = per_rating                      \
            .groupby("user_id")["id_rating"]              \
            .agg(lambda parts: " ".join(parts))           \
            .reset_index()

        return self.number_of_users()

    def from_csv(self, file: str):
        """Load a previously saved headerless TSV into ``grouped_df``."""
        self.grouped_df = pd.read_csv(file,
                                      sep="\t",
                                      header=None,
                                      names=self.grouped_columns)

    def choose(self, nousers: int, random_state=None):
        """Replace ``grouped_df`` with a random sample of ``nousers`` rows.

        Args:
            nousers: number of rows to keep.
            random_state: optional seed forwarded to ``DataFrame.sample`` to
                make the sample reproducible.

        Raises:
            RuntimeError: if neither ``fit`` nor ``from_csv`` was called yet.
            ValueError: raised by pandas if ``nousers`` exceeds the row count.
        """
        self.grouped_df = self._require_grouped().sample(n=nousers,
                                                         random_state=random_state)

    def cleora_train(self, cleora_exe="cleora", dimensions = 32, iter = 16):
        """Run the Cleora executable on the file written by ``save_to_tsv``.

        Args:
            cleora_exe: executable name (looked up on PATH) or path to it.
            dimensions: value passed as ``--dimension``.
            iter: value passed as ``--number-of-iterations``.

        Raises:
            RuntimeError: if no TSV was saved yet or the executable is missing.
            subprocess.CalledProcessError: if Cleora exits with an error.
        """
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")

        # Resolve to a concrete path: subprocess only searches PATH for a bare
        # name, so a local executable would pass an access check but not start.
        executable = shutil.which(cleora_exe)
        if executable is None and os.access(cleora_exe, os.X_OK):
            executable = os.path.abspath(cleora_exe)
        if executable is None:
            raise RuntimeError("cleora executable not found")

        command = [executable,
                   "--type", "tsv",
                   f"--columns=transient::{self.columns[0]} complex::{self.columns[1]}",
                   "--dimension", str(dimensions),
                   "--number-of-iterations", str(iter),
                   "--prepend-field-name", "0",
                   "-f", "numpy",
                   "-o", "results",
                   "-e", "1",
                   self.tsv_filename]
        subprocess.run(command, check=True)


In [None]:
# Pipeline object that will hold the ratings and the grouped Cleora input.
model = AnimeRecomendation()

In [None]:
%%time

# Fit settings: the cap on how many rating rows are used, and the lowest
# rating that is kept when building the per-user rows.
LINES = 40_000_000
RATING_THRESHOLD = 6

model.fit(users_df, rating_threshold=RATING_THRESHOLD, lines=LINES)

In [None]:
# DataFrame.sample draws from NumPy's global random state when it is given no
# random_state, so seed it here to make the 20,000-user subsample reproducible.
np.random.seed(42)
model.choose(20000)
# Show a preview rather than the whole sampled frame.
model.grouped_df.head()

In [None]:
%%time
# constructing the tsv data

# Write the grouped rows to disk; the model remembers this file name and
# cleora_train() later passes it to Cleora as the input file.
tsv_file_name = "data2.tsv"
model.save_to_tsv(tsv_file_name)

In [None]:
%%time

# Training is expensive and should not run on every execution of the notebook.
# It is skipped when the embedding file that RatingGenerator reads already
# exists. Set RETRAIN = True (or delete the file) to force a fresh run, e.g.
# after the sampled users or the TSV file have changed.
RETRAIN = False
EMBEDDINGS_FILE = "results/emb__user__anime.out.npy"

if RETRAIN or not os.path.exists(EMBEDDINGS_FILE):
    model.cleora_train()

In [None]:
class RatingGenerator:
    """Recommend anime by voting over embedding neighbours of watched titles.

    For every watched id the ``top_k`` labels with the highest cosine
    similarity to its embedding are looked up (and cached in ``rankings``);
    ``predict`` counts how often each label appears across those lists.
    """

    def __init__(self, top_k: int = 15):
        self.users_count = None
        self.columns = ["user", "anime"]
        # Cache: label -> array of its top_k most similar labels.
        self.rankings = dict()
        # Number of neighbours kept per watched id.
        self.top_k = top_k
        # Filled by load_artifacts().
        self.labels = None
        self.vects_iter = None

    def get_artifacts(self):
        """Return the paths of the label file and the embedding matrix.

        Returns:
            dict with keys ``"labels"`` and ``"vects_iter"``.
        """
        p = "results/emb__"
        files = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]

        return {f: f"{p}{self.columns[0]}__{self.columns[1]}{suf[idx]}"
                for idx, f in enumerate(files)}

    def load_artifacts(self):
        """Read the labels (JSON list converted to ints) and the embeddings."""
        artifacts = self.get_artifacts()
        with open(artifacts['labels'], "r") as entities:
            self.labels = np.array([int(i) for i in json.load(entities)])
        # Load results to numpy
        self.vects_iter = np.load(artifacts['vects_iter'])

    def _has_embedding(self, idx: int) -> bool:
        """Tell whether ``idx`` occurs in the loaded labels."""
        return bool(np.any(self.labels == idx))

    def load_rankings(self, idx: int):
        """Compute and cache the ``top_k`` labels most similar to ``idx``.

        The result includes ``idx`` itself whenever its own embedding is among
        the closest ones.

        Raises:
            IndexError: if ``idx`` is not present in the loaded labels.
        """
        positions = np.where(self.labels == idx)[0]
        if positions.size == 0:
            raise IndexError(f"id {idx} is not present in the embedding labels")
        real_id = positions[0]

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1),
                                                          self.vects_iter,
                                                          dense_output=True)
        # Negate so that argsort yields the most similar rows first.
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = self.labels[ranking[:self.top_k]]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        """Add one vote to ``custom_ranking`` for every neighbour of ``idx``.

        ``idx`` must already be cached in ``rankings``.
        """
        for anime in self.rankings[idx]:
            custom_ranking[anime] = custom_ranking.get(anime, 0) + 1

    def predict(self, already_watched, exclude_watched: bool = False):
        """Rank labels by how many watched titles have them as a neighbour.

        Args:
            already_watched: iterable of ids to use as seeds.
            exclude_watched: if True, the seed ids themselves are removed from
                the result.

        Ids that have no embedding are skipped with a warning instead of
        aborting the whole prediction.

        Returns:
            dict mapping label -> vote count, ordered by descending count.
        """
        import warnings

        self.load_artifacts()
        watched = list(already_watched)
        custom_ranking = dict()

        for idx in tqdm(watched):
            if idx not in self.rankings:
                if not self._has_embedding(idx):
                    warnings.warn(f"skipping id {idx}: no embedding available")
                    continue
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        if exclude_watched:
            for idx in watched:
                custom_ranking.pop(idx, None)

        return dict(sorted(custom_ranking.items(),
                           reverse=True,
                           key=lambda x: x[1]))

In [None]:
# Anime metadata table; later cells use its anime_id and Name columns.
# The file is comma separated, which is read_csv's default.
two_mode_data = "dataset/anime-dataset-2023.csv"
anime_df = pd.read_csv(two_mode_data)

In [None]:
# Ranking

# Fresh generator, so no neighbour lists are cached from an earlier run.
ratingGenerator = RatingGenerator()

def pandas_extract_content(row, label):
    """Return the value of column ``label`` in the first row of ``row`` as text.

    The value is read directly instead of being parsed out of the printed
    Series, so values containing runs of spaces are returned intact and extra
    rows do not leak into the result.

    Args:
        row: DataFrame with at least one row.
        label: name of the column to read.

    Raises:
        IndexError: if ``row`` has no rows.
    """
    return str(row[label].iloc[0])

# Ids of the titles used as the "already watched" seeds for the prediction.
WATCHED_IDS = [67, 6702, 242]

ranking = ratingGenerator.predict(WATCHED_IDS)
for label, times in ranking.items():
    # Named `matches` rather than `filter` so the builtin is not shadowed
    # for the rest of the kernel session.
    matches = anime_df[anime_df.anime_id == label]
    if matches.empty:
        # The label has no row in the metadata table; nothing to print.
        continue

    print(f"{pandas_extract_content(matches, 'Name')}: {times}")

In [None]:
def extract_year(aired):
    """Return the first four-digit year found in ``aired`` as a string.

    For values shaped like ``"Apr 3, 1998 to Apr 24, 1999"`` this is the year
    that follows the first comma. Values with another shape (for example a
    bare ``"2005"``) are handled as well.

    Returns:
        The year as a string, or None when ``aired`` is not a string or
        contains no four-digit number.
    """
    import re

    if not isinstance(aired, str):
        return None
    found = re.search(r"\b(\d{4})\b", aired)
    return found.group(1) if found else None

def search_str(s, search):
    """Tell whether ``search`` occurs in the lower-cased text form of ``s``.

    Only ``s`` is lower-cased; ``search`` is used as given.
    """
    lowered = str(s).lower()
    return search in lowered

def find_in_rows(name):
    """Return the rows of ``anime_df`` in which any cell contains ``name``.

    The comparison ignores case: ``name`` is lower-cased here and every cell
    is converted to lower-cased text by ``search_str``.
    """
    needle = name.lower()

    def column_hits(column):
        # Boolean Series: which cells of this column contain the needle.
        return column.map(lambda cell: search_str(cell, needle))

    hits = anime_df.apply(column_hits)
    row_selected = hits.any(axis=1)
    return anime_df.loc[row_selected]

def find_by_name(name, regex: bool = False):
    """Return the rows of ``anime_df`` whose ``Name`` contains ``name``.

    The match ignores case (both sides are lower-cased) and rows with a
    missing name never match.

    Args:
        name: text to look for.
        regex: if False (default) ``name`` is matched literally, so titles
            containing characters such as ``(``, ``+`` or ``?`` work as
            expected. If True the lower-cased ``name`` is used as a regular
            expression.
    """
    lowered_names = anime_df['Name'].str.lower()
    mask = lowered_names.str.contains(name.lower(), regex=regex, na=False)
    return anime_df[mask]
    


# anime_id of the first ten titles whose name contains "Shingeki no Kyojin".
find_by_name("Shingeki no Kyojin")["anime_id"].head(10)