# _Imports and Opening Datasets_


In [1]:
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


In [2]:
# Input files, given relative to the notebook's working directory.
reviews_path = "../data/UserAnimeList.parquet"  # username / anime_id / my_score rows
animes_path = "../data/AnimeList.csv"  # anime metadata (anime_id, title, type, ...)
users_path = "../data/UserList.csv"  # not read by any cell shown in this notebook


In [3]:
# Only the three columns used below are read from the reviews parquet file;
# the anime metadata CSV is loaded in full.
review_columns = ["username", "anime_id", "my_score"]
animes_df = pd.read_csv(animes_path)
reviews_df = pd.read_parquet(reviews_path, columns=review_columns)


In [4]:
# Attach title/type metadata to every review. The left join keeps reviews
# whose anime_id has no match in animes_df (their metadata columns become NaN).
anime_metadata = animes_df[["anime_id", "title", "title_english", "type"]]
reviews_df = reviews_df.merge(anime_metadata, how="left", on="anime_id")


In [5]:
# Preview the merged reviews (pandas truncates the display to the first and last rows).
reviews_df


Unnamed: 0,username,anime_id,my_score,title,title_english,type
0,karthiga,21,9,One Piece,One Piece,TV
1,karthiga,59,7,Chobits,Chobits,TV
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV
...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV
80076108,mini_kaila,6030,0,Needless,Needless,TV
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV


# Reviews_df Data Exploration and Preparation


In [6]:
# Give every username an integer id: ngroup() numbers the groups produced by
# groupby("username").
# NOTE(review): the pivot output further down shows a user_id column of -1,
# presumably ngroup()'s label for rows whose username is missing — confirm,
# and consider dropping those rows so they are not merged into one pseudo-user.
username_groups = reviews_df.groupby("username")
reviews_df["user_id"] = username_groups.ngroup()


In [7]:
# Check that the new user_id column was added.
reviews_df.head()


Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757


## Analyze the data


In [8]:
# Keep TV entries only; every other type, and rows whose type is missing, are dropped.
reviews_df = reviews_df.loc[reviews_df["type"].eq("TV")]


In [9]:
# How many distinct titles are there, and how many reviews does each one have?
n_unique_titles = reviews_df["title"].nunique()
print("Number of Unique Animes:", n_unique_titles)
reviews_df["title"].value_counts()  # descending count is the default order


Number of Unique Animes: 4271


Death Note                             197400
Code Geass: Hangyaku no Lelouch        165235
Shingeki no Kyojin                     157033
Sword Art Online                       156430
Toradora!                              156059
                                        ...  
Xiao Hua Xian                               8
Chara to Otamajakushi Shima                 7
Xiao Li Yu Li Xian Ji                       4
Oshi ga Budoukan Ittekuretara Shinu         4
Xiongmao He Xiao Yan Shu                    3
Name: title, Length: 4271, dtype: int64

In [10]:
# Number of reviews per title, most reviewed first (computed once and reused).
title_counts = reviews_df["title"].value_counts()

# Show how the per-title review counts are distributed.
display(title_counts.quantile([0.25, 0.4, 0.5, 0.6, 0.75]))

# Popularity cut-off: keep only titles with strictly more reviews than the
# 0.6 quantile of the per-title counts.
# NOTE: despite its name, `third_quantile` holds the 0.6 quantile (it used to be
# taken as position 3 of the list displayed above), not the third quartile (0.75).
third_quantile = title_counts.quantile(0.6)
popular_titles = title_counts.index[title_counts.gt(third_quantile)]
reviews_df = reviews_df[reviews_df["title"].isin(popular_titles)]


0.25      243.0
0.40     1092.0
0.50     2828.0
0.60     6056.0
0.75    15875.5
Name: title, dtype: float64

In [11]:
# Same summary as before, now on the titles that survived the popularity filter.
n_unique_titles = reviews_df["title"].nunique()
print("Number of Unique Animes:", n_unique_titles)
reviews_df["title"].value_counts()  # descending count is the default order


Number of Unique Animes: 1708


Death Note                                197400
Code Geass: Hangyaku no Lelouch           165235
Shingeki no Kyojin                        157033
Sword Art Online                          156430
Toradora!                                 156059
                                           ...  
Di Gi Charat                                6100
Maria-sama ga Miteru 4th                    6081
Mushi-Uta                                   6079
Mutsu Enmei Ryuu Gaiden: Shura no Toki      6072
Street Fighter II V                         6061
Name: title, Length: 1708, dtype: int64

In [12]:
# Spot-check the reviews of a single title.
reviews_df.loc[reviews_df["title"].eq("Code Geass: Hangyaku no Lelouch")]


Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
215,RedvelvetDaisuki,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,127798
920,Damonashu,1575,5,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,35503
1364,bskai,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,190000
1750,Bas_G,1575,9,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,19924
2801,sprite1989,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,264490
...,...,...,...,...,...,...,...
80073458,Scarlet95,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,137029
80074096,TheClockworkGuy,1575,8,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,154990
80074252,skillshot,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,262202
80074414,Qimosabe,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,124127


In [13]:
# Show the reviews of every user except user_id 0.
# NOTE(review): the filtered frame is only displayed, not assigned back, so
# reviews_df itself is left unchanged by this cell.
reviews_df.loc[reviews_df["user_id"].ne(0)]


Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757
...,...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV,238238
80076108,mini_kaila,6030,0,Needless,Needless,TV,238238
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV,238238
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV,238238


In [14]:
# reviews_df = reviews_df.sample(frac=1).head(5000000)


In [15]:
# Reshape the long review table into a title x user rating matrix:
# one row per title, one column per user_id, cells hold my_score
# (NaN where a user has no entry for a title).
pivot_spec = {"index": "title", "columns": "user_id", "values": "my_score"}
final_dataset = reviews_df.pivot(**pivot_spec)
final_dataset.head()


user_id,-1,0,1,2,3,4,5,6,7,8,...,283034,283035,283036,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,0.0
.hack//Sign,4.0,,,,,,,,,,...,4.0,,,,4.0,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,6.0,,,,,,,7.0,,...,,,,,,7.0,8.0,,,
11eyes,,0.0,,,,,,,5.0,,...,,,,,,,,,,


In [16]:
# Treat "no entry" as a rating of 0.
# fillna() replaces the earlier write through `final_dataset.values[...]`, which
# only worked while `.values` happened to return a writable view of the frame;
# under pandas Copy-on-Write that array is read-only and the assignment fails.
# NOTE: fillna() returns a new frame, so two copies of the matrix exist briefly.
final_dataset = final_dataset.fillna(0)
final_dataset.head()


user_id,-1,0,1,2,3,4,5,6,7,8,...,283034,283035,283036,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,8.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Rating vector (one value per user_id column) of a single title, as a 2-D array.
final_dataset.loc[final_dataset.index == "Dragon Ball"].to_numpy()


array([[0., 8., 0., ..., 7., 0., 0.]])

In [18]:
# Rating vector (one value per user_id column) of a single title, as a 2-D array.
final_dataset.loc[final_dataset.index == "Naruto"].to_numpy()


array([[6., 7., 0., ..., 5., 0., 0.]])

In [19]:
# Compressed Sparse Row (CSR) storage keeps only the non-zero ratings.
# Rows follow final_dataset's index (titles); columns follow its user_id columns.
csr_data = csr_matrix(final_dataset.to_numpy())

# Printing a column slice lists one "(row, column)  value" entry per stored rating:
#   row    -> position of the title in final_dataset.index
#   column -> always 0 here, because the slice holds a single column
#   value  -> the rating
# Column 3 is the fourth user_id column of final_dataset.
print(csr_data[:, 3])


  (15, 0)	9.0
  (232, 0)	7.0
  (237, 0)	5.0
  (247, 0)	8.0
  (290, 0)	8.0
  (324, 0)	7.0
  (328, 0)	7.0
  (392, 0)	7.0
  (419, 0)	8.0
  (431, 0)	7.0
  (432, 0)	7.0
  (855, 0)	6.0
  (970, 0)	7.0
  (997, 0)	9.0
  (1019, 0)	5.0
  (1102, 0)	7.0
  (1237, 0)	7.0
  (1271, 0)	7.0
  (1280, 0)	7.0
  (1286, 0)	8.0
  (1500, 0)	6.0
  (1515, 0)	5.0


In [20]:
# Brute-force nearest-neighbour search with cosine distance, fitted on the
# sparse rating matrix (one sample per title row). n_jobs=-1 is passed through
# to scikit-learn to parallelise the search.
knn = NearestNeighbors(
    n_neighbors=20,
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
knn.fit(csr_data)


NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [21]:
def anime_recommender(anime, n_neighbors=11):
    """Return the titles nearest to ``anime`` according to the fitted ``knn`` model.

    Parameters
    ----------
    anime : str
        Title to look up; it must be a label of ``final_dataset.index``.
    n_neighbors : int, default 11
        Number of neighbours requested from the model. The queried title is
        itself a fitted row, so it normally comes back as the first neighbour
        (distance ~0); the default of 11 therefore yields 10 other titles.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour in the order returned by the model, indexed by
        title (index name ``"title"``) with a single ``"Distances"`` column.

    Raises
    ------
    KeyError
        If ``anime`` is not present in ``final_dataset.index``.
    """
    # kneighbors expects a 2-D array with one row per query point.
    query = final_dataset.loc[anime, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query, n_neighbors=n_neighbors)

    # Map the row positions returned by the model back to titles in one step.
    neighbor_titles = final_dataset.index[indices.flatten()]

    anime_recs = pd.DataFrame({"Distances": distances.flatten()}, index=neighbor_titles)
    anime_recs.index.name = "title"

    return anime_recs


In [22]:
# Example query; the queried title itself comes back first, at distance 0.
anime_recommender("Dragon Ball")


Unnamed: 0_level_0,Distances
title,Unnamed: 1_level_1
Dragon Ball,0.0
Dragon Ball Z,0.22331
Dragon Ball GT,0.282797
Naruto,0.508912
Death Note,0.531604
Bleach,0.550518
Fullmetal Alchemist,0.555498
Dragon Ball Kai,0.562159
Pokemon,0.562935
One Piece,0.566727


In [23]:
# Example query; the queried title itself comes back first, at a distance of ~0.
anime_recommender("Bleach")


Unnamed: 0_level_0,Distances
title,Unnamed: 1_level_1
Bleach,8.790746e-13
Naruto,0.3366178
Naruto: Shippuuden,0.3712829
Death Note,0.3900309
Fullmetal Alchemist,0.4248502
Code Geass: Hangyaku no Lelouch,0.4377455
One Piece,0.4519826
Fairy Tail,0.4590226
Fullmetal Alchemist: Brotherhood,0.463695
Soul Eater,0.4643126


In [24]:
# Example query; the queried title itself comes back first, at a distance of ~0.
anime_recommender("One Outs")


Unnamed: 0_level_0,Distances
title,Unnamed: 1_level_1
One Outs,1.119105e-13
Gyakkyou Burai Kaiji: Ultimate Survivor,0.5776778
Touhai Densetsu Akagi: Yami ni Maiorita Tensai,0.6056053
Major S1,0.6090202
Hajime no Ippo,0.6094884
Major S2,0.6148369
Major S3,0.6198591
Hajime no Ippo: New Challenger,0.6202515
Major S4,0.6232441
Major S5,0.6255552


In [25]:
# All titles in the rating matrix, in row order, and how many there are.
anime_list = list(final_dataset.index)
len(anime_list)


1708

In [26]:
# Pre-compute the 10 nearest titles of every title with one batched query.
# Calling kneighbors() without a query matrix returns the neighbours of the
# fitted rows themselves and does not count a row as its own neighbour, so
# there is no need to assume that position 0 of each result is the title itself.
anime_list = final_dataset.index.unique().tolist()
_, neighbor_indices = knn.kneighbors(n_neighbors=10)

# Row i of neighbor_indices belongs to the i-th fitted row, i.e. to
# final_dataset.index[i]; map the row positions back to titles.
predictions_dict = {
    anime: final_dataset.index[row].tolist()
    for anime, row in zip(final_dataset.index, neighbor_indices)
}


## Turning the data into a Pickle to use it in Streamlit


In [27]:
# Spot-check the stored recommendations for one title.
predictions_dict["Naruto"]


['Naruto: Shippuuden',
 'Bleach',
 'Death Note',
 'Fullmetal Alchemist',
 'Code Geass: Hangyaku no Lelouch',
 'Fullmetal Alchemist: Brotherhood',
 'Shingeki no Kyojin',
 'One Piece',
 'Fairy Tail',
 'Sword Art Online']

In [28]:
predictions_file_path = "../anime_predictions_list_knn.pkl"

# Pickle needs a binary-mode file. The context manager flushes and closes the
# handle, so the pickle is complete on disk before anything reads it back.
with open(predictions_file_path, "wb") as predictions_file:
    pickle.dump(predictions_dict, predictions_file)


In [29]:
# Round-trip check: reload the predictions pickle from disk.
# pickle.load can execute arbitrary code, so only use it on trusted files.
with open(predictions_file_path, "rb") as predictions_file:
    output = pickle.load(predictions_file)


In [30]:
# The reloaded dictionary should return the same list as predictions_dict["Naruto"].
output["Naruto"]


['Naruto: Shippuuden',
 'Bleach',
 'Death Note',
 'Fullmetal Alchemist',
 'Code Geass: Hangyaku no Lelouch',
 'Fullmetal Alchemist: Brotherhood',
 'Shingeki no Kyojin',
 'One Piece',
 'Fairy Tail',
 'Sword Art Online']

## Saving and Storing the Model


In [31]:
# It's important to use binary mode when pickling; the context manager makes
# sure the file is flushed and closed once the model has been written.
filename = "../anime_recommender_knn.pkl"
with open(filename, "wb") as model_file:
    pickle.dump(knn, model_file)


In [32]:
# Load the model back from disk (binary mode); the handle is closed afterwards.
# pickle.load can execute arbitrary code, so only use it on trusted files.
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)
