## Two recommendation system included
1)  Anime based, shows upon completion of a particular anime  <br>
2)  User based, shows on subscriber home page

## Program structure
1)  Dataset importing and processing <br>
2)  Anime based recommendation system <br>
3)  User based recommendation system

In [1]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from difflib import get_close_matches


### Import anime_name dataset

In [2]:
animeName = pd.read_csv('../input/anime-recommendation-database-2020/anime.csv', usecols=['MAL_ID','Name'])

display(animeName.shape)
animeName.head(2)

(17562, 2)

Unnamed: 0,MAL_ID,Name
0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira


In [3]:
print("Number of missing anime name: ", animeName.isna().sum()[1])

Number of missing anime name:  0


### Functions for interacting with anime dataset

In [4]:
def getAnimeIdByName(name):
    try:
        return animeName['MAL_ID'][animeName["Name"] == name].item()
    except:
        # if no extact match, get closest 
        closest = get_close_matches(name, animeName["Name"].to_numpy())[0]  # anime name
#         print(f"closest anime = {closest}")
        return getAnimeIdByName(closest)    # return anime ID

def getAnimeNameById(id):
    try:
        return animeName['Name'][animeName['MAL_ID'] == id].item()
    except:
#         print("NO RECORD", inputAnimeName)
        return None
        
# example
id = getAnimeIdByName("Cowboy Bebop ")
print(id)
getAnimeNameById(id)

1


'Cowboy Bebop'

### Import rating complete dataset
rating_complete.csv is a subset of animelist.csv. This dataset only considers animes that the user has watched completely (watching_status==2) and gave it a score (score!=0). This dataset contains 57 Million ratings applied to 16.872 animes by 310.059 users.

In [5]:
rating = pd.read_csv('../input/anime-recommendation-database-2020/rating_complete.csv')

display(rating.shape)
display(rating.head(2))

print("unique user ID: \t{}, unique anime ID: \t{}".format(rating['user_id'].nunique(), rating['anime_id'].nunique()))
print("max user ID: \t\t{}, max anime ID:  \t\t{}".format(rating['user_id'].max(), rating['anime_id'].max()))

(57633278, 3)

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5


unique user ID: 	310059, unique anime ID: 	16872
max user ID: 		353404, max anime ID:  		48456


### Convert to csr_matrix from dataframe
csr_matrix is a "sparse matrix". <br>
Benefit:
1) Lesser storage required due to no need store "0" values. <br>
2) Computing time due to logically designed data structure that only contains non-zero values.

In [6]:
# for 1) anime based recommendation system

matRating = csr_matrix((rating['rating'], (rating['anime_id'], rating['user_id'])))

assert matRating.shape[0] == rating['anime_id'].max()+1   # max. ID of anime
assert matRating.shape[1] == rating['user_id'].max()+1   # max. ID of user

matRating.shape   # (max. ID of anime+1, max. ID of user+1), since ID start from 0

(48457, 353405)

In [7]:
# for 2) user based recommendation system

matUser = csr_matrix((rating['rating'], (rating['user_id'], rating['anime_id'])))

assert matUser.shape[0] == rating['user_id'].max()+1   # max. ID of user
assert matUser.shape[1] == rating['anime_id'].max()+1   # max. ID of anime

matUser.shape   # (max. ID of user+1, max. ID of anime+1), since ID start from 0

(353405, 48457)

### Release unuseful memory

In [8]:
# check memory usage (in GB)
from sys import getsizeof

def printSizeInGB(byte):
    return " ".join((format(byte/1024/1024/1024, ".2f"), "GB"))

def csrMatrixGetsizeof(mat):
    return mat.data.nbytes + mat.indptr.nbytes + mat.indices.nbytes

print("Memory of rating dataframe: ", printSizeInGB(getsizeof(rating)))
print("Memory of csr_matrix of matRating: ", printSizeInGB(csrMatrixGetsizeof(matRating)))
print("Memory of csr_matrix of matUser: ", printSizeInGB(csrMatrixGetsizeof(matUser)))

Memory of rating dataframe:  1.29 GB
Memory of csr_matrix of matRating:  0.64 GB
Memory of csr_matrix of matUser:  0.65 GB


In [9]:
# release memory of rating dataframe
import gc
del rating
gc.collect()

20

## 1) Anime based recommendation system

In [10]:
animeKNN = NearestNeighbors(metric="cosine")
animeKNN.fit(matRating)

NearestNeighbors(metric='cosine')

In [12]:
def getAnimeKNN(inputAnimeId, noOfNeighbors, lengthOfRecommendationList):
#     print("\n noOfNeighbors " , noOfNeighbors)
    
    distances, indices = animeKNN.kneighbors(matRating[inputAnimeId], n_neighbors=noOfNeighbors + 1)  # avoid this.inputAnimeId
    i, d = indices.flatten().tolist(), distances.flatten().tolist()
    
    rawRecommendList = list(zip(i,d))
#     print(rawRecommendList)
    
    # get anime name for recommendation list
    recommendationList = []
    for i, (id, distance) in enumerate(rawRecommendList[1:]):  # index, distance, remove first item(avoid this.inputAnime)
        anime = getAnimeNameById(id)
        if anime:
            recommendationList.append({"Order": i+1, "Anime ID": id, "Anime Name": getAnimeNameById(id)})
        
    if len(recommendationList) < lengthOfRecommendationList:
#         print("len of re list", len(recommendationList))
#         print("lengthOfRecommendationList", lengthOfRecommendationList)
        
        recommendationList = getAnimeKNN(inputAnimeId, noOfNeighbors-len(recommendationList)+noOfNeighbors, lengthOfRecommendationList)
#     display(recommendationList)
    return recommendationList
    

def getAnimeRecommendationList(inputAnimeName, lengthOfRecommendationList):
    
    inputAnimeId = getAnimeIdByName(inputAnimeName)      # id 
#     print(f"input anime id = {inputAnimeId}")
    
    recommendationList = getAnimeKNN(inputAnimeId, lengthOfRecommendationList, lengthOfRecommendationList)
      
    
    return pd.DataFrame(recommendationList[:lengthOfRecommendationList]).drop("Order", axis=1).style.set_properties(**{'text-align': 'left'})


In [13]:
inputAnimeName = "InuYashaa "
lengthOfRecommendationList = 6
getAnimeRecommendationList(inputAnimeName, lengthOfRecommendationList)

Unnamed: 0,Anime ID,Anime Name
0,6811,InuYasha: Kanketsu-hen
1,450,InuYasha Movie 2: Kagami no Naka no Mugenjo
2,452,InuYasha Movie 1: Toki wo Koeru Omoi
3,451,InuYasha Movie 3: Tenka Hadou no Ken
4,449,InuYasha Movie 4: Guren no Houraijima
5,121,Fullmetal Alchemist


In [14]:
getAnimeRecommendationList("Kaguya-sama wa Kokurasetai: Tensai-tachi no Renai Zunousen", 6)

Unnamed: 0,Anime ID,Anime Name
0,40591,Kaguya-sama wa Kokurasetai?: Tensai-tachi no Renai Zunousen
1,37450,Seishun Buta Yarou wa Bunny Girl Senpai no Yume wo Minai
2,38000,Kimetsu no Yaiba
3,32937,Kono Subarashii Sekai ni Shukufuku wo! 2
4,30831,Kono Subarashii Sekai ni Shukufuku wo!
5,37779,Yakusoku no Neverland


## 2) User based recommendation system

In [15]:
userKNN = NearestNeighbors(metric="cosine")
userKNN.fit(matUser)

NearestNeighbors(metric='cosine')

In [16]:
def getUserRecommendationList(inputUserId, lengthOfRecommendationList = 10, noOfNeighbors = 5):
        
    distances, indices = userKNN.kneighbors(matUser[inputUserId], n_neighbors=noOfNeighbors + 1)  # avoid this.inputAnimeId
    i, d = indices.flatten().tolist()[1:], distances.flatten().tolist()[1:]   # skip first user (this.userId)
    
    rawRecommendList = list(zip(i,d))
    
    # get user's rating record
    userRatingList = matUser[88895].toarray()[0]
    
    # get nearest k user's rating
    nearUserId = i
    similarity = pd.Series(d)
    similarity.update(1-similarity)   # 1 = highly similar
    
    # find the highest similarity anime
    nearUserDF = pd.DataFrame(matUser[nearUserId].toarray())   # (row, column) = (K user, anime)
    nearUserDF.replace(0, np.nan, inplace=True)
    nearUserDF = nearUserDF.mul(similarity, axis=0)  # weighted rating matrix 
    recommendScore = nearUserDF.mean(axis=0)   # score of every anime
    recommendScore = recommendScore.dropna().sort_values(ascending=False)  # sort anime list by score
    
    # get anime name for recommendation list
    recommendationList = []
    for i, (id, sim) in enumerate(recommendScore.items()):
        if userRatingList[id] == 0:
            # user have not viewed this anime
            anime = getAnimeNameById(id)
            if anime:
                recommendationList.append({"Order": i+1, "Anime ID": id, "Anime Name": getAnimeNameById(id)})
                
        if len(recommendationList) == lengthOfRecommendationList:
            # stop when enough recommended anime exists 
            break
            
    return pd.DataFrame(recommendationList[:lengthOfRecommendationList]).drop("Order", axis=1).style.set_properties(**{'text-align': 'left'})

In [17]:
userId = 88895
lengthOfRecommendationList = 6
getUserRecommendationList(userId, lengthOfRecommendationList)

Unnamed: 0,Anime ID,Anime Name
0,4280,Kara no Kyoukai 4: Garan no Dou
1,3783,Kara no Kyoukai 3: Tsuukaku Zanryuu
2,37491,Gintama.: Shirogane no Tamashii-hen - Kouhan-sen
3,5204,Kara no Kyoukai 6: Boukyaku Rokuon
4,44,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen
5,5941,Cross Game


In [18]:
getUserRecommendationList(0, 5)

Unnamed: 0,Anime ID,Anime Name
0,1571,Ghost Hunt
1,270,Hellsing
2,459,One Piece Movie 1
3,585,Mimi wo Sumaseba
4,2251,Baccano!
