### Imports 

In [86]:
import os
from typing import List
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

### Creating files needed to run the recommender

In [4]:
# Relative path to the anime CSV that is loaded with pd.read_csv further down;
# being relative, it is resolved against the kernel's working directory.
anime_data_path = "../../data/anime_data.csv"

In [50]:
# Only these four columns are used by the rest of the notebook, so ask the
# parser to skip everything else instead of loading the whole file and
# discarding columns afterwards.
anime_columns = ["name", "code", "premiered", "genres"]
animes_df = pd.read_csv(
    anime_data_path,
    encoding="utf-8",
    usecols=anime_columns,
)[anime_columns]  # usecols keeps file order; re-select to pin the column order

In [53]:
# Collect every distinct genre label that occurs in the ";"-separated
# "genres" column. The order of the resulting list is arbitrary because it
# comes from iterating a set.
unique_genres = set()
for genres_list in animes_df["genres"].tolist():
    unique_genres.update(genres_list.split(";"))
genres = list(unique_genres)

In [54]:
# The encoder is fitted on a single categorical feature, so every genre is
# wrapped in its own one-element row. handle_unknown="ignore" makes transform
# tolerate labels that were not present at fit time instead of raising.
genre_samples = [[genre_name] for genre_name in genres]
one_hot = OneHotEncoder(handle_unknown="ignore")
one_hot.fit(genre_samples)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [55]:
def get_genres_vector(genres_str: str) -> np.ndarray:
    """Return the summed one-hot encoding of a ";"-separated genre string.

    Args:
        genres_str: Genre labels joined with ";" (the format of the "genres"
            column in the source CSV).

    Returns:
        A 1-D float array with one slot per category known to the
        module-level ``one_hot`` encoder; each slot holds how many times that
        genre appears in ``genres_str``. Because the encoder is created with
        ``handle_unknown="ignore"``, labels it has not seen are not expected
        to raise and should contribute nothing to the vector.

    Raises:
        AttributeError: If ``genres_str`` is not a string (e.g. a NaN cell).
    """
    genre_rows = [[genre] for genre in genres_str.split(";")]
    # One batched transform instead of one encoder call per genre: the result
    # has one one-hot row per genre, and summing over rows gives the same
    # vector as adding the individual encodings.
    encoded = one_hot.transform(genre_rows).toarray()
    return encoded.sum(axis=0)

In [56]:
def _clean_name(raw_name: str) -> str:
    """Turn "_", "+" and "-" into spaces, then collapse runs of 2+ whitespace characters into one space."""
    with_spaces = re.sub(r"[\_+-]", " ", raw_name)
    return re.sub(r"\s\s+", " ", with_spaces)


# NOTE: this mutates animes_df in place. Re-running the cell without reloading
# the CSV first fails, because "genres" then holds vectors instead of strings.
animes_df["genres"] = animes_df["genres"].apply(get_genres_vector)
animes_df["name"] = animes_df["name"].apply(_clean_name)

# Persist the transformed frame; the testing section reads this file back.
animes_df.to_pickle("./anime_genres_df.pkl")

### Testing recommender

In [57]:
# Read back the DataFrame that this notebook pickled to ./anime_genres_df.pkl.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
anime_genres_df = pd.read_pickle("./anime_genres_df.pkl")

In [93]:
# Parameters of this smoke test. num_recommendations was previously never
# defined in the notebook (NameError on a fresh kernel); the recorded output
# lists five codes, hence 5.
query_name = "naruto"
num_recommendations = 5

df = anime_genres_df.copy()
anime_data = df.loc[df["name"] == query_name]
if anime_data.empty:
    # Fail with a clear message instead of an IndexError from .values[0].
    raise ValueError(f"anime {query_name!r} not found in anime_genres_df")

# Look the query vector up once rather than once per row inside the lambda.
query_genres = anime_data["genres"].values[0]

# Dot product of two genre vectors: counts the genres both titles share
# (assuming no genre is repeated within a single title).
df["similarity"] = df.genres.map(lambda x: np.dot(x, query_genres))

# NOTE(review): "premiered" is the primary sort key, so similarity only breaks
# ties between titles with the same premiere date — confirm that favouring the
# newest titles over the most similar ones is intended.
df = df.sort_values(by=["premiered", "similarity"], ascending=False)
print(df.head())
recommendations = df.iloc[:num_recommendations].code.tolist()

                                                   name   code   premiered  \
377                              the god of high school  41353  2020-01-07   
1768  muhyo to rouji no mahouritsu soudan jimusho 2n...  39948  2020-01-07   
265                       enen no shouboutai ni no shou  40956  2020-01-07   
458   sword art online alicization war of underworld...  40540  2020-01-07   
1102                                         deca dence  40056  2020-01-07   

                                                 genres  similarity  
377   [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...         4.0  
1768  [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...         3.0  
265   [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...         2.0  
458   [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...         2.0  
1102  [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...         2.0  


In [92]:
# Display the list of recommended anime codes.
recommendations

[41353, 39948, 40956, 40540, 40056]