In [1]:
# https://github.com/So-ham/Movie-Recommendation-System/blob/main/movie-recommendation.ipynb

# https://github.com/aniketng21/Movie-Recommendation-System-Using-KNN-Algorithm/blob/master/Movie_Recommendation_System.ipynb
#https://github.com/aniketng21/Movie-Recommendation-System-Using-KNN-Algorithm/blob/master/Movie_Recommendation_System.ipynb

# https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

# _Imports and Opening Datasets_


In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import sys
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [32]:
animes_path = '../data/AnimeList.csv'
users_path = '../data/UserList.csv'
reviews_path = '../data/UserAnimeList.parquet'

In [33]:
reviews_df = pd.read_parquet(reviews_path, columns=['username', 'anime_id', 'my_score'])
animes_df = pd.read_csv(animes_path)

In [34]:
animes_df.columns

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')

In [35]:
reviews_df = reviews_df.merge(animes_df[['anime_id', 'title', 'title_english', 'type']], on='anime_id', how='left')

In [36]:
reviews_df

Unnamed: 0,username,anime_id,my_score,title,title_english,type
0,karthiga,21,9,One Piece,One Piece,TV
1,karthiga,59,7,Chobits,Chobits,TV
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV
...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV
80076108,mini_kaila,6030,0,Needless,Needless,TV
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV


# Animes_df Data Exploration and Preparation


In [37]:
# create user_id
reviews_df['user_id'] = reviews_df.groupby("username").ngroup()

In [38]:
reviews_df.shape

(80076112, 7)

In [39]:
reviews_df.head()

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757


## Analyze the data

In [40]:
reviews_df = reviews_df[reviews_df.type=='TV']

In [41]:
print('Number of Unique Animes:', reviews_df['title'].nunique())
reviews_df['title'].value_counts(ascending=False)

Number of Unique Animes: 4271


Death Note                             197400
Code Geass: Hangyaku no Lelouch        165235
Shingeki no Kyojin                     157033
Sword Art Online                       156430
Toradora!                              156059
                                        ...  
Xiao Hua Xian                               8
Chara to Otamajakushi Shima                 7
Xiao Li Yu Li Xian Ji                       4
Oshi ga Budoukan Ittekuretara Shinu         4
Xiongmao He Xiao Yan Shu                    3
Name: title, Length: 4271, dtype: int64

In [42]:
#reviews_df['title'].value_counts(ascending=False).quantile(.7)

# you can actually see various quantiles
display(reviews_df['title'].value_counts(ascending=False).quantile([.25, .4, .5, .6, .75]))


third_quantile = reviews_df['title'].value_counts(ascending=False).quantile([.25, .4, .5, .6, .75]).values[3]
title_counts = reviews_df['title'].value_counts()
reviews_df = reviews_df[reviews_df['title'].isin(title_counts.index[title_counts.gt(third_quantile)])]

0.25      243.0
0.40     1092.0
0.50     2828.0
0.60     6056.0
0.75    15875.5
Name: title, dtype: float64

In [43]:
print('Number of Unique Animes:', reviews_df['title'].nunique())
reviews_df['title'].value_counts()

Number of Unique Animes: 1708


Death Note                                197400
Code Geass: Hangyaku no Lelouch           165235
Shingeki no Kyojin                        157033
Sword Art Online                          156430
Toradora!                                 156059
                                           ...  
Di Gi Charat                                6100
Maria-sama ga Miteru 4th                    6081
Mushi-Uta                                   6079
Mutsu Enmei Ryuu Gaiden: Shura no Toki      6072
Street Fighter II V                         6061
Name: title, Length: 1708, dtype: int64

In [44]:
reviews_df[reviews_df['title']=='Code Geass: Hangyaku no Lelouch']

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
215,RedvelvetDaisuki,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,127798
920,Damonashu,1575,5,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,35503
1364,bskai,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,190000
1750,Bas_G,1575,9,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,19924
2801,sprite1989,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,264490
...,...,...,...,...,...,...,...
80073458,Scarlet95,1575,0,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,137029
80074096,TheClockworkGuy,1575,8,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,154990
80074252,skillshot,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,262202
80074414,Qimosabe,1575,10,Code Geass: Hangyaku no Lelouch,Code Geass: Lelouch of the Rebellion,TV,124127


In [45]:
# Data Curation on username
#reviews_df = reviews_df[(reviews_df['username'].notnull()) & (reviews_df['username'] != False) & (reviews_df['username'] != True)]
# -1 or 0
#reviews_df[reviews_df['user_id']==0]
reviews_df[reviews_df['user_id']!=0]

Unnamed: 0,username,anime_id,my_score,title,title_english,type,user_id
0,karthiga,21,9,One Piece,One Piece,TV,222757
1,karthiga,59,7,Chobits,Chobits,TV,222757
2,karthiga,74,7,Gakuen Alice,Gakuen Alice,TV,222757
3,karthiga,120,7,Fruits Basket,Fruits Basket,TV,222757
4,karthiga,178,7,Ultra Maniac,Ultramaniac - Magical Girl,TV,222757
...,...,...,...,...,...,...,...
80076107,mini_kaila,5940,8,Seiken no Blacksmith,The Sacred Blacksmith,TV,238238
80076108,mini_kaila,6030,0,Needless,Needless,TV,238238
80076109,mini_kaila,6500,8,Seikon no Qwaser,The Qwaser of Stigmata,TV,238238
80076110,mini_kaila,7058,8,Uragiri wa Boku no Namae wo Shitteiru,The Betrayal Knows My Name,TV,238238


In [46]:
reviews_df = reviews_df.sample(frac=1).head(5000000)

In [47]:
final_dataset = reviews_df.pivot(index='title', columns='user_id', values='my_score')
final_dataset.head()

user_id,-1,0,1,2,3,4,5,7,8,9,...,283034,283035,283036,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,,,,
11eyes,,,,,,,,,,,...,,,,,,,,,,


In [48]:
final_dataset[final_dataset.index=='Dragon Ball'].values

array([[nan,  8., nan, ..., nan, nan, nan]])

In [49]:
#final_dataset.fillna(0,inplace=True)
final_dataset.values[final_dataset.isna()] = 0
final_dataset.head()

user_id,-1,0,1,2,3,4,5,7,8,9,...,283034,283035,283036,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# A CSR is a compressed sparse row or compressed row storage matrix. It’s just a fancy way of storing only the non-zero entries in a matrix. In this case, I’d assume its somehow encoding the relationship between entities.
csr_data = csr_matrix(final_dataset.values)

In [51]:
# esquerda -> user_id
# direita -> anime_id
print(csr_data)

  (0, 156)	7.0
  (0, 492)	5.0
  (0, 572)	9.0
  (0, 815)	6.0
  (0, 1639)	8.0
  (0, 1988)	10.0
  (0, 2768)	9.0
  (0, 2789)	7.0
  (0, 2858)	7.0
  (0, 3209)	6.0
  (0, 3226)	10.0
  (0, 3289)	7.0
  (0, 3452)	4.0
  (0, 3489)	7.0
  (0, 3698)	7.0
  (0, 3791)	6.0
  (0, 3990)	7.0
  (0, 5044)	4.0
  (0, 5065)	7.0
  (0, 5171)	7.0
  (0, 5996)	10.0
  (0, 6339)	8.0
  (0, 6767)	5.0
  (0, 7084)	7.0
  (0, 7119)	7.0
  :	:
  (1707, 241540)	1.0
  (1707, 241706)	2.0
  (1707, 244281)	9.0
  (1707, 247259)	7.0
  (1707, 247318)	7.0
  (1707, 247539)	6.0
  (1707, 248407)	5.0
  (1707, 248411)	7.0
  (1707, 249677)	6.0
  (1707, 250612)	7.0
  (1707, 250861)	6.0
  (1707, 251083)	8.0
  (1707, 251587)	4.0
  (1707, 251998)	5.0
  (1707, 252554)	6.0
  (1707, 252893)	8.0
  (1707, 253167)	6.0
  (1707, 253478)	1.0
  (1707, 256125)	8.0
  (1707, 256273)	7.0
  (1707, 257362)	7.0
  (1707, 258864)	7.0
  (1707, 260105)	4.0
  (1707, 260325)	8.0
  (1707, 260405)	1.0


In [52]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)


In [53]:
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [54]:
query_index = 'Shaman King'

In [55]:
distances, indices = knn.kneighbors(final_dataset.loc[query_index,:].values.reshape(1, -1), n_neighbors = 11)

In [56]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(query_index))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, final_dataset.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Shaman King:

1: Code Geass: Hangyaku no Lelouch, with distance of 0.9595715566784482:
2: Naruto, with distance of 0.960148654869572:
3: One Piece, with distance of 0.9619422250179691:
4: D.Gray-man, with distance of 0.9631450848777923:
5: Naruto: Shippuuden, with distance of 0.9633403927634206:
6: Digimon Adventure, with distance of 0.9634307188473425:
7: Death Note, with distance of 0.9634672549289259:
8: Bleach, with distance of 0.9634803296563262:
9: Fullmetal Alchemist, with distance of 0.964211901514298:
10: Shijou Saikyou no Deshi Kenichi, with distance of 0.9643764825329101:


In [60]:
anime_list = final_dataset.index.tolist()
len(anime_list)

1708

In [117]:
anime_list = final_dataset.index.unique().tolist()
predictions_dict = {}

for anime in anime_list:

    predictions_list = []
    distances, indices = knn.kneighbors(final_dataset.loc[anime,:].values.reshape(1, -1), n_neighbors = 11)

    for i in range(1, len(distances.flatten())):
        
        predictions_list.append(final_dataset.index[indices.flatten()[i]])

    predictions_dict[anime] = predictions_list

In [118]:
predictions_dict['Dragon Ball']

['Dragon Ball Z',
 'Dragon Ball GT',
 'Naruto',
 'Death Note',
 'Bleach',
 'Pokemon',
 'One Piece',
 'Dragon Ball Kai',
 'Fullmetal Alchemist',
 'Naruto: Shippuuden']

In [119]:
predictions_dict['Naruto']

['Naruto: Shippuuden',
 'Bleach',
 'Death Note',
 'Fullmetal Alchemist: Brotherhood',
 'Soul Eater',
 'Sword Art Online',
 'Fullmetal Alchemist',
 'One Piece',
 'Code Geass: Hangyaku no Lelouch',
 'Fairy Tail']

In [120]:
predictions_file_path = '../anime_predictions_list_knn.pkl'

#anime_predictions_json = json.dump(anime_predictions_list)
predictions_file = open(predictions_file_path, "wb")
pickle.dump(predictions_dict, predictions_file)


In [115]:
predictions_file = open(predictions_file_path, "rb")
output = pickle.load(predictions_file)

In [116]:
output['Naruto']

['Naruto: Shippuuden',
 'Bleach',
 'Death Note',
 'Fullmetal Alchemist: Brotherhood',
 'Soul Eater']

## Saving and Storing the Model

In [28]:
# Its important to use binary mode 
filename = '../anime_recommender_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [29]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [30]:
final_dataset

user_id,-1,0,2,3,4,5,7,8,9,10,...,283034,283035,283036,283037,283038,283039,283040,283041,283042,283043
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iDOLM@STER Xenoglossia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s.CRY.ed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC Kei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# for anime.unique:
# get predictions
# then save dictionary in a pickle 
# access pickle whenver someone asks for an anime