In [1]:
# https://github.com/So-ham/Movie-Recommendation-System/blob/main/movie-recommendation.ipynb

# https://github.com/aniketng21/Movie-Recommendation-System-Using-KNN-Algorithm/blob/master/Movie_Recommendation_System.ipynb
#https://github.com/aniketng21/Movie-Recommendation-System-Using-KNN-Algorithm/blob/master/Movie_Recommendation_System.ipynb

# https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

# _Imports and Opening Datasets_


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [22]:
# Locations of the input files, all under a shared data directory
# relative to this notebook.
data_dir = '../data'
animes_path = data_dir + '/AnimeList.csv'
users_path = data_dir + '/UserList.csv'
reviews_path = data_dir + '/UserAnimeList.parquet'

In [23]:
# Anime metadata: one row per anime, all columns.
animes_df = pd.read_csv(animes_path)

# User ratings: only the three columns needed for the recommender are read.
review_columns = ['username', 'anime_id', 'my_score']
reviews_df = pd.read_parquet(reviews_path, columns=review_columns)

In [24]:
# List the column names available in the anime metadata table.
animes_df.columns

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')

In [25]:
# Attach the anime title and type to every rating row. A left join keeps
# ratings whose anime_id has no match in animes_df (their title/type become NaN).
anime_info = animes_df[['anime_id', 'title', 'title_english', 'type']]
reviews_df = reviews_df.merge(anime_info, how='left', on='anime_id')

In [26]:
# Preview the merged ratings + anime metadata frame (pandas truncates the display).
reviews_df

Unnamed: 0,username,anime_id,my_score,title,title_english,type
0,karthiga,21,9,One Piece,One Piece,TV
1,karthiga,59,7,Chobits,Chobits,TV
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV
...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV
80076108,mini_kaila,6030,0,Needless,Needless,TV
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV


# reviews_df Data Exploration and Preparation


In [27]:
# Derive a numeric user_id: every distinct username gets one integer group number.
# NOTE(review): the pivot output further down shows a user_id of -1, presumably
# rows whose username is missing - TODO confirm and decide whether to drop them.
username_groups = reviews_df.groupby("username")
reviews_df['user_id'] = username_groups.ngroup()

In [28]:
# (rows, columns) of the merged frame after adding user_id.
reviews_df.shape

(80076112, 7)

In [29]:
# First rows, to check the newly added user_id column.
reviews_df.head()

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757


## Analyze the data

In [30]:
# Keep only ratings of TV series; every other anime type is discarded.
is_tv = reviews_df['type'] == 'TV'
reviews_df = reviews_df[is_tv]

In [31]:
# How many distinct titles remain, and how many rating rows each one has
# (most-rated first).
titles = reviews_df['title']
print('Number of Unique Animes:', titles.nunique())
titles.value_counts(ascending=False)

Number of Unique Animes: 4271


Death Note                             197400
Code Geass: Hangyaku no Lelouch        165235
Shingeki no Kyojin                     157033
Sword Art Online                       156430
Toradora!                              156059
                                        ...  
Xiao Hua Xian                               8
Chara to Otamajakushi Shima                 7
Xiao Li Yu Li Xian Ji                       4
Oshi ga Budoukan Ittekuretara Shinu         4
Xiongmao He Xiao Yan Shu                    3
Name: title, Length: 4271, dtype: int64

In [32]:
# Number of rating rows per title; computed once and reused below
# (the frame is large, so repeating value_counts() is expensive).
title_counts = reviews_df['title'].value_counts()

# Show several quantiles of the per-title counts to help pick a popularity cut-off.
display(title_counts.quantile([.25, .4, .5, .6, .75]))

# Popularity threshold: the 0.60 quantile of the per-title counts.
# NOTE: despite its name, `third_quantile` is NOT the third quartile (0.75);
# the cut-off has always been the 0.60 quantile, now selected explicitly
# instead of by position in the list above.
third_quantile = title_counts.quantile(.6)

# Keep only titles with strictly more rating rows than the threshold.
# NOTE: this overwrites reviews_df, so re-running the cell filters the
# already-filtered data again.
reviews_df = reviews_df[reviews_df['title'].isin(title_counts.index[title_counts.gt(third_quantile)])]

0.25      243.0
0.40     1092.0
0.50     2828.0
0.60     6056.0
0.75    15875.5
Name: title, dtype: float64

In [33]:
# Re-check the number of distinct titles and their rating counts after the
# popularity filter.
remaining_titles = reviews_df['title']
print('Number of Unique Animes:', remaining_titles.nunique())
remaining_titles.value_counts()

Number of Unique Animes: 1708


Death Note                                197400
Code Geass: Hangyaku no Lelouch           165235
Shingeki no Kyojin                        157033
Sword Art Online                          156430
Toradora!                                 156059
                                           ...  
Di Gi Charat                                6100
Maria-sama ga Miteru 4th                    6081
Mushi-Uta                                   6079
Mutsu Enmei Ryuu Gaiden: Shura no Toki      6072
Street Fighter II V                         6061
Name: title, Length: 1708, dtype: int64

In [34]:
# Spot check: show every rating row for a single, well-known title.
is_code_geass = reviews_df['title'] == 'Code Geass: Hangyaku no Lelouch'
reviews_df[is_code_geass]

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
215,RedvelvetDaisuki,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,127798
920,Damonashu,1575,5,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,35503
1364,bskai,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,190000
1750,Bas_G,1575,9,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,19924
2801,sprite1989,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,264490
...,...,...,...,...,...,...,...
80073458,Scarlet95,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,137029
80074096,TheClockworkGuy,1575,8,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,154990
80074252,skillshot,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,262202
80074414,Qimosabe,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,124127


In [35]:
# Data curation on username (exploratory only).
# NOTE(review): the expression below is only displayed, never assigned back to
# reviews_df, so this cell does not remove any rows. It is unclear whether the
# intent was to drop user_id 0, user_id -1, or both - TODO confirm.
# Earlier attempts, kept for reference:
#reviews_df = reviews_df[(reviews_df['username'].notnull()) & (reviews_df['username'] != False) & (reviews_df['username'] != True)]
#reviews_df[reviews_df['user_id']==0]
not_user_zero = reviews_df['user_id'] != 0
reviews_df[not_user_zero]

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757
...,...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV,238238
80076108,mini_kaila,6030,0,Needless,Needless,TV,238238
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV,238238
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV,238238


In [36]:
# Down-sample to at most SAMPLE_SIZE random rating rows so the dense pivot
# built afterwards fits in memory.
# A fixed seed makes the sample (and therefore the recommendations) reproducible
# across kernel restarts; sampling n rows directly avoids shuffling the whole
# frame only to keep its head.
SAMPLE_SIZE = 5_000_000
RANDOM_SEED = 42
reviews_df = reviews_df.sample(
    n=min(SAMPLE_SIZE, len(reviews_df)),  # keep everything if fewer rows exist
    random_state=RANDOM_SEED,
)

In [37]:
# Build the title x user ratings table: one row per title, one column per
# user_id, cells hold my_score (NaN where no rating row exists for the pair).
# pivot() raises if the same (title, user_id) pair occurs more than once.
final_dataset = reviews_df.pivot(
    values='my_score',
    index='title',
    columns='user_id',
)
final_dataset.head()

user_id,-1,0,2,3,4,5,7,8,11,12,...,283032,283033,283034,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,
11eyes,,,,,,,,,,,...,,,,,,,,,,


In [42]:
# Spot check: raw rating vector of a single title.
# NOTE(review): the execution count shows this cell was run after the NaN-fill
# cell that follows it, which is why the saved output contains zeros; run
# top-to-bottom it would display NaN for missing ratings.
is_dragon_ball = final_dataset.index == 'Dragon Ball'
final_dataset[is_dragon_ball].values

array([[0., 0., 0., ..., 0., 0., 0.]])

In [39]:
# Replace missing ratings (no rating row for that title/user pair) with 0 so the
# table can be converted to a sparse matrix.
# fillna() is used instead of writing through `final_dataset.values[...]`:
# that trick only works while `.values` is a writable view of the frame's data,
# and it fails (read-only array) or silently does nothing when pandas hands back
# a read-only array or a copy.
# NOTE(review): the raw data also contains my_score == 0 rows, so after this step
# "not rated" and "scored 0" are indistinguishable - confirm this is intended.
final_dataset = final_dataset.fillna(0)
final_dataset.head()

user_id,-1,0,2,3,4,5,7,8,11,12,...,283032,283033,283034,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# CSR (compressed sparse row) stores only the non-zero entries of a matrix.
# The ratings table is mostly zeros, so it is converted to CSR before being
# handed to the nearest-neighbours model.
ratings_matrix = final_dataset.values
csr_data = csr_matrix(ratings_matrix)

In [44]:
# Each printed entry has the form "(row, col)  value":
# left  (row) -> position of the anime title in final_dataset.index
# right (col) -> position of the user_id column in final_dataset.columns
print(csr_data)

  (0, 1659)	7.0
  (0, 2042)	7.0
  (0, 3923)	6.0
  (0, 6069)	8.0
  (0, 7681)	9.0
  (0, 8906)	8.0
  (0, 9544)	9.0
  (0, 10171)	7.0
  (0, 10271)	5.0
  (0, 11025)	4.0
  (0, 12409)	7.0
  (0, 13618)	5.0
  (0, 14402)	7.0
  (0, 15198)	9.0
  (0, 16851)	7.0
  (0, 17442)	2.0
  (0, 18058)	7.0
  (0, 18396)	4.0
  (0, 19077)	6.0
  (0, 19911)	10.0
  (0, 20445)	6.0
  (0, 21280)	7.0
  (0, 22589)	7.0
  (0, 22622)	5.0
  (0, 24788)	8.0
  :	:
  (1707, 127193)	7.0
  (1707, 129310)	2.0
  (1707, 139963)	5.0
  (1707, 153306)	8.0
  (1707, 155274)	6.0
  (1707, 157482)	6.0
  (1707, 160102)	8.0
  (1707, 161098)	7.0
  (1707, 162584)	4.0
  (1707, 163081)	5.0
  (1707, 179413)	4.0
  (1707, 180065)	7.0
  (1707, 183145)	3.0
  (1707, 190176)	6.0
  (1707, 191883)	8.0
  (1707, 192242)	4.0
  (1707, 196947)	8.0
  (1707, 197291)	7.0
  (1707, 197689)	7.0
  (1707, 209474)	7.0
  (1707, 210843)	7.0
  (1707, 211783)	6.0
  (1707, 211790)	7.0
  (1707, 213880)	7.0
  (1707, 215365)	5.0


In [45]:
# Brute-force nearest-neighbour search using cosine distance between title
# rating vectors, on all available cores (n_jobs=-1).
# n_neighbors=20 is only the default; the query cell passes its own value.
knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)


In [46]:
# Fit the neighbour index on the sparse title-by-user ratings matrix.
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [61]:
# Title to get recommendations for; must be a label present in final_dataset.index.
query_index = "Shaman King"

In [62]:
# Rating vector of the query title, reshaped to a single-row 2-D array as
# kneighbors() expects.
query_vector = final_dataset.loc[query_index, :].values.reshape(1, -1)

# Ask for 11 neighbours; the printing cell skips the first hit, treating it as
# the query title itself, which leaves 10 recommendations.
distances, indices = knn.kneighbors(query_vector, n_neighbors=11)

In [63]:
# Flatten the (1, k) result arrays once instead of on every loop iteration.
neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

# Header is printed only when at least one neighbour was returned.
if len(neighbor_distances) > 0:
    print('Recommendations for {0}:\n'.format(query_index))

# Position 0 is skipped (treated as the query title itself); the remaining
# hits are listed with their rank, title and cosine distance.
for rank in range(1, len(neighbor_distances)):
    neighbor_title = final_dataset.index[neighbor_positions[rank]]
    print('{0}: {1}, with distance of {2}:'.format(rank, neighbor_title, neighbor_distances[rank]))

Recommendations for Shaman King:

1: Katsugeki/Touken Ranbu, with distance of 0.9825331387539241:
2: Kyou kara Maou!, with distance of 0.983382865581031:
3: Romeo x Juliet, with distance of 0.9841556734747661:
4: Digimon Savers, with distance of 0.9849970449655567:
5: Ookami to Koushinryou, with distance of 0.986260088613413:
6: Oniichan no Koto nanka Zenzen Suki ja Nai n da kara ne!!, with distance of 0.9862879639087334:
7: Claymore, with distance of 0.986364902139375:
8: Fullmetal Alchemist, with distance of 0.9865867851160205:
9: Working!!!, with distance of 0.9867817344362183:
10: Canvas 2: Niji-iro no Sketch, with distance of 0.9871292365179363:


## Saving and Storing the Model

In [65]:
import pickle

# Persist the fitted nearest-neighbours model to disk.
# Binary mode is required because pickle writes bytes. The `with` block
# guarantees the file is flushed and closed even if dumping fails (the handle
# was previously left open, so the write was not guaranteed to reach disk).
# NOTE(review): only `knn` is saved; the title lookup used when printing
# recommendations (final_dataset.index) is not part of this file.
with open('../anime_recommender_knn.pkl', 'wb') as knnPickle:
    # pickle.dump(object_to_save, destination_file)
    pickle.dump(knn, knnPickle)


In [None]:
# Load the model back from disk. The path must match the one used when saving,
# including the '.pkl' extension (it was missing here, so the load would fail
# with FileNotFoundError).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('../anime_recommender_knn.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)