In [33]:
# Standard IPython notebook imports
%matplotlib inline

import os
import json

import csv
import networkx as nx
import pyarrow
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import ast
from itertools import chain
import matplotlib.pyplot as plt
import subprocess
import sklearn.metrics

pd.options.display.max_colwidth = 300

In [34]:
two_mode_data="dataset/anime-dataset-2023.csv"
users_csv="dataset/user-filtered.csv"

In [35]:
# importing data from csv to pandas

class Data_factory:
    def from_csv(file: str) -> pd.DataFrame:
        return pd.read_csv(file)

anime_df = Data_factory.from_csv(two_mode_data)
users_df = Data_factory.from_csv(users_csv)

In [36]:
users_df.head() # is user data loaded?

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [37]:
class AnimeRecomendation:
    def __init__(self, dimensions = 1024, iter = 4):
        self.users_df     = pd.DataFrame()
        self.users_count  = None
        self.dimensions   = dimensions
        self.iterations   = iter
        self.columns      = ["user", "anime"]
        self.tsv_filename = None
        self.rankings     = dict()

    def anime_label(idx: int) -> str:
        return f"anime__{idx}"

    def number_of_users(self):
        if self.users_count is None:
            self.users_count = self.users_df.max()['user_id']
        return self.users_count

    def save_to_tsv(self, tsv_filename: str):
        grouped_df        = self.grouped_df
        users_count       = self.number_of_users()
        self.tsv_filename = tsv_filename

        with open(tsv_filename, "w") as tsv_file:
            for user_id in tqdm(range(users_count)):
                animes = grouped_df[grouped_df.user_id == user_id] \
                    .filter(items=["anime_id"]).to_numpy()

                if len(animes) == 0:
                    continue

                animes = animes[0][0]
                tsv_line = f"{user_id}\t" + " ".join([str(i) for i in animes])
                tsv_file.write(tsv_line + "\n")

    def fit(self, users_df, lines: int, rating_threshold: int):
        self.users_df = users_df.head(lines)
        users_df = self.users_df

        self.grouped_df = users_df[users_df.rating >= rating_threshold] \
            .groupby("user_id")["anime_id"].agg(list).reset_index()

        self.number_of_users()

    def get_artifacts(self, name: str):
        d = dict()
        p = "results/emb__"
        files  = ["labels", "vects_iter"]
        suf = [".out.entities", ".out.npy"]
    
        return { f:f"{p}{name}__{name}{suf[idx]}" 
                    for idx, f in enumerate(files) }

    def cleora_train(self):
        if self.tsv_filename is None:
            raise RuntimeError("TSV filename not yet created")
        
        command = ["cleora",
                   "--type", "tsv",
                   f"--columns={self.columns[0]} complex::reflexive::{self.columns[1]}",
                   "--dimension", str(self.dimensions),
                   "--number-of-iterations", str(self.iterations),
                   "--prepend-field-name", "1",
                   "-f", "numpy",
                   "-o", "results",
                   self.tsv_filename]
        subprocess.run(command, check=True, stderr=subprocess.DEVNULL)

    def load_artifacts(self):
        artifacts = self.get_artifacts(self.columns[1])
        with open(artifacts['labels'], "r") as entities:
            self.labels     = json.load(entities)
        # Load results to numpy
        self.vects_iter     = np.load(artifacts['vects_iter'])

    def load_rankings(self, idx: int):
        real_id = self.labels.index(f"anime__{idx}")

        v = self.vects_iter[real_id]
        dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1), self.vects_iter, dense_output=True)
        ranking = (-dist).argsort()[0]

        self.rankings[self.labels[real_id]] = ranking[:15]

    def add_to_custom_ranking(self, custom_ranking, idx: int):
        anime_ranking = self.rankings[f"anime__{idx}"]

        for anime in anime_ranking:
            if anime in custom_ranking:
                custom_ranking[anime] += 1
            else:
                custom_ranking[anime] = 1

    def predict(self, already_watched):
        
        self.load_artifacts()
        custom_ranking = dict()
        
        for idx in tqdm(already_watched):
            if f"anime__{idx}" not in self.rankings:
                self.load_rankings(idx)

            self.add_to_custom_ranking(custom_ranking, idx)

        return dict(sorted(custom_ranking.items(), reverse=True, key=lambda x:x[1]))

In [38]:
%%time

LINES = 2000000
RATING_THRESHOLD = 6

model = AnimeRecomendation()
model.fit(users_df, LINES, RATING_THRESHOLD)

model.number_of_users()

CPU times: user 420 ms, sys: 55.9 ms, total: 476 ms
Wall time: 486 ms


6810

In [39]:
# constructing the tsv data

tsv_file_name = "data1.tsv"
model.save_to_tsv(tsv_file_name)

  0%|          | 0/6810 [00:00<?, ?it/s]

In [40]:
# %%time
# # alternative

# lines_n = 1000000
# tsv_file_name_n = "data2.tsv"
# grouped_df_n =                             \
#     users_df[users_df.rating >= 6]         \
#     .head(lines_n)                         \
#     .filter(items=["user_id", "anime_id"]) \
#     .to_csv(tsv_file_name_n, index=False, sep="\t")

In [41]:
%%time

# You shouldnt be running it every time!

# Commented it out for safety reasons xd
# model.cleora_train()

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [42]:
# Ranking

def anime_id_to_name(id):
    name = str(anime_df[anime_df.anime_id == label]["Name"])
    return "".join(name.split("\n")[0].split(" ")[4:])

ranking = model.predict([67, 6702, 242])
for label, times in ranking.items():
    filter = anime_df[anime_df.anime_id == label]["Name"]
    if len(filter) == 0:
        continue

    print(f"{anime_id_to_name(label)}: {times}")

  0%|          | 0/3 [00:00<?, ?it/s]

Eyeshield21: 2
D4Princess: 1
MidnightPanther: 1
TokyoMajinGakuenKenpucho:TouDaiNiMaku: 1
WarauSalesman: 1
D.Gray-man: 1
ShinChouKyouRyo:CondorHeroII: 1
TenLittleGallForce: 1
MashounoKao: 1
SaintBeast:SeijuuKourin-hen: 1
CowboyBebop: 1
HanayoriDango: 1
KamisamaKazoku: 1
JokeiKazoku:Inbou: 1
ChoujinLocke:ShinsekaiSentai: 1
TaiyounoYuushaFighbird: 1
Wizardry: 1
RyoujokuHitozumaOnsen: 1
Oishinbo: 1
ShakugannoShana-tanMovie: 1
RurouniKenshin:MeijiKenkakuRomantan-Tsuioku-hen: 1
CandyCandy: 1
UchuuSenkanYamato:AratanaruTabidachi: 1
Mizuiro(2003): 1
ChikanshaThomas: 1
EarlyReins: 1
ViperGTS: 1
Momone: 1
ToaruMajutsunoIndex-tan: 1
ChouHenshinCosprayersSpecials: 1
ToHeart:RememberMyMemories: 1


In [45]:
def pandas_extract_content(row, label):
    name = row[label].to_string()
    return name.split("    ")[1]
    
def extract_year(aired):
    return aired.split(",")[1].split(" ")[1]

row = anime_df[anime_df.anime_id == 1]
pandas_extract_content(row, "Image URL")

'https://cdn.myanimelist.net/images/anime/4/19644.jpg'

In [3]:
def df_from_array(array):
    return pd.DataFrame(data=array[1:,1:], index=array[1:,0], columns=array[0,1:])

def read_data(file):
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [ np.array([i] + list(row)) for i, row in enumerate(reader, start=-1)]

def get_genre_list(genres: str):
  return list(map(lambda x: x.strip().lower(), genres.split(",")))

def filter_out(word, col):
  return lambda row: not (word.lower() in get_genre_list(row[col]))

def filter_in(word, col):
  return lambda row: (word.lower() in get_genre_list(row[col]))

def create_two_mode_df(data):
    sub_df = data[["anime_id", "English name", "Genres"]]
    result = [['id', 'source', 'target']]
    i=0
    for _, row in sub_df.iterrows():
        for genre in get_genre_list(row['Genres']):
            to_append = [i, row['English name'], genre]
            result.append(to_append)
            i+=1
    return df_from_array(np.array(result))

def create_weighted_matrix(two_mode_df):
    assert(two_mode_df.columns.tolist() == ['source', 'target'])
    source_uniq = two_mode_df['source'].unique()
    source_len = source_uniq.shape[0]
    source_dict = {k: v for v, k in enumerate(source_uniq)}
    array = np.zeros((source_len, source_len), dtype=int)

    for _, source1 in tqdm(two_mode_df.iterrows(), total=len(two_mode_df.index)):
        for _, source2 in two_mode_df.iterrows():
            if source1['target'] == source2['target']:
                array[source_dict[source1['source']], source_dict[source2['source']]] += 1
    return pd.DataFrame(data=array, columns=source_uniq, index=source_uniq)

def create_weighted_matrix2(two_mode_df):
    adj_df = pd.crosstab(two_mode_df.source, two_mode_df.target)
    adj_np = adj_df.to_numpy()
    adj_np2 = adj_np@adj_np.T
    return (adj_np2, adj_df.index)


def create_weighted_df(matrix_df):
    result=[['', 'source', 'target', 'weight']]
    c = 0
    for rowIndex, row in tqdm(matrix_df.iterrows(), total=len(matrix_df.index)): #iterate over rows
        for colIndex, value in row.items():
            weight = matrix_df.at[rowIndex, colIndex]
            if weight > 0:
                to_append = [str(c), rowIndex, colIndex, weight]
                result.append(to_append)
                c += 1
    return df_from_array(np.array(result))


In [4]:
raw_data = read_data(two_mode_data)
# raw_data = raw_data[:1000]

In [5]:
all_data = df_from_array(np.array(raw_data))

In [6]:
all_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [7]:
all_data = all_data[(all_data['Score'] != "UNKNOWN") & (all_data['Score'] != '')].astype({'Score': 'float'})
# all_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])
# all_data = all_data[all_data['Score'] > 7.0]
important_data = all_data.drop(columns=['Synopsis', 'Aired', 'Image URL', 'Scored By', 'Members', 'Licensors'])

In [8]:
movies = important_data[important_data['Type'] == "Movie"]
TVs = important_data[important_data['Type'] == "TV"]
specials = important_data[important_data['Type'] == "Special"]

specified = important_data

not_hentai = specified[specified.apply(filter_out("Hentai", 'Genres'), axis=1)]
hentai = specified[specified.apply(filter_in("Hentai", 'Genres'), axis=1)]

mappa = not_hentai[not_hentai.apply(filter_in("Mappa", "Studios"), axis=1)]

In [9]:
# not_hentai.sort_values(['Score'], ascending=False)
print(len(hentai))
print(len(not_hentai))

1465
14227


In [10]:
data = all_data[["anime_id", "English name", "Genres", "Score"]]
data = data[(data['Score'] != "UNKNOWN") & (data['Score'] != '')].astype({'Score': 'float'})

In [11]:
data.dtypes

mapped = map(get_genre_list, data["Genres"].to_numpy().flatten().tolist())

uniq = np.unique(list(chain.from_iterable(mapped))).tolist()

print(len(uniq), uniq)

22 ['action', 'adventure', 'avant garde', 'award winning', 'boys love', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girls love', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'slice of life', 'sports', 'supernatural', 'suspense', 'unknown']


In [12]:
two_mode_df = create_two_mode_df(data)

two_mode_df

Unnamed: 0,source,target
0,Cowboy Bebop,action
1,Cowboy Bebop,award winning
2,Cowboy Bebop,sci-fi
3,Cowboy Bebop: The Movie,action
4,Cowboy Bebop: The Movie,sci-fi
...,...,...
30955,One Piece: Recapping Fierce Fights! The Counte...,action
30956,One Piece: Recapping Fierce Fights! The Counte...,adventure
30957,One Piece: Recapping Fierce Fights! The Counte...,comedy
30958,One Piece: Recapping Fierce Fights! The Counte...,fantasy


In [13]:
(weighted_matrix_np, weighted_matrix_index) = create_weighted_matrix2(two_mode_df)

In [14]:
weighted_matrix_df2 = pd.DataFrame(data=weighted_matrix_np, index=weighted_matrix_index, columns=weighted_matrix_index)


In [15]:
weighted_matrix_df2

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [16]:
weighted_matrix_df_pickle_file = "weighted_matrix_df.df"
if os.path.exists(weighted_matrix_df_pickle_file):
    weighted_matrix_df = pd.read_pickle(weighted_matrix_df_pickle_file)
else:
    weighted_matrix_df = weighted_matrix_df2
    weighted_matrix_df.to_pickle(weighted_matrix_df_pickle_file)

In [17]:
weighted_matrix_df

source,"""Deji"" Meets Girl","""Ippon"" again!","""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Legend of The Twilight,...,selector spread WIXOSS,sound / phantasma / mirror,teeter,the FLY BanD!,tsuritama,xxxHOLiC,xxxHOLiC The Movie: A Midsummer Night's Dream,∀ Gundam,∀ Gundam I: Earth Light,∀ Gundam II: Moonlight Butterfly
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Deji"" Meets Girl",1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
"""Ippon"" again!",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Parade"" de Satie",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Star""t",0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
-OutsideR:RequieM-,0,0,0,1,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
xxxHOLiC The Movie: A Midsummer Night's Dream,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,1,4,4,1,1,1
∀ Gundam,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,1,1,1,6,3,3
∀ Gundam I: Earth Light,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,1,3,3,3


In [18]:
# weighted_df_pickle_file = "weighted_df.df"
# if os.path.exists(weighted_df_pickle_file):
#     weighted_df = pd.read_pickle(weighted_df_pickle_file)
# else:
#     weighted_df = create_weighted_df(weighted_matrix_df)
#     weighted_df.to_pickle(weighted_df_pickle_file)

In [19]:
# weighted_df

In [20]:
# weighted_df.columns

In [21]:
# for i in [3, 2, 1]:
# G = nx.from_pandas_edgelist(
#         weighted_df, source='source',
#         target='target', edge_attr='weight')

plt.figure(figsize=(35,35))

G = nx.from_numpy_array(weighted_matrix_np)
graph_pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, graph_pos, node_size=10, node_color='blue', alpha=0.3)
nx.draw_networkx_edges(G, graph_pos)
nx.draw_networkx_labels(G, graph_pos, font_size=8, font_family='sans-serif')

# plt.show()

# nx.draw(G)

KeyboardInterrupt: 

In [None]:
plt.savefig("plot.svg", dpi=1200)

<Figure size 640x480 with 0 Axes>

In [None]:
net = Network(notebook=True)
net.from_nx(G)
net.show("example.html")