# Importing necessary packages

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Utility Functions

In [2]:
BYTES_TO_MB_DIV = 0.000001
def mem_usage_df(df):
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")

# Loading in the Data

In [3]:
first_time = True
# If loading in the data for the first time:
if first_time:
    # Read in the data
    anime_list = pd.read_csv('anime.csv')
    rating_list = pd.read_csv('rating.csv')
    # Drop all un-watched/un-rated anime :(
    anime_list.dropna(inplace=True)
    rating_list.dropna(inplace=True)
    # Drop users with less than 50 ratings
    rating_list = rating_list.groupby('user_id').filter(lambda x: len(x) >= 50)

    # New dataframe with users as rows and anime as columns, with ratings as values
    user_item_matrix = rating_list.pivot_table(index='anime_id', columns='user_id', values='rating').fillna(0)
    user_item_matrix1 = user_item_matrix.copy(deep=True)
#     sparse_anime_matrix = scipy.sparse.csr_matrix(anime_matrix.values)
#     # Print memory usage
#     mem_usage_df(anime_matrix)
#     # Print size of matrix
#     print(sparse_anime_matrix.shape)
#     # print new size of matrix in MB
#     print(sparse_anime_matrix.data.nbytes * BYTES_TO_MB_DIV, "MB")
#     # Save to npz file
#     scipy.sparse.save_npz('anime_matrix.npz', sparse_anime_matrix)
# else:
#     # Load in the data
#     sparse_anime_matrix = scipy.sparse.load_npz('anime_matrix.npz')
#     print(sparse_anime_matrix.shape, "[anime_id, user_id]")
#     # print the size of matrix in MB
#     print(sparse_anime_matrix.data.nbytes * BYTES_TO_MB_DIV, "[MB]")

# Preprocess the Data

**Centering the data**: We first remove the mean of the data to center it around zero. This is to better align with the intuitive sense that if a user has not seen an anime, then it isn't necessarily a negative rating and vice versa. Hence, we center around each user's average ratings and provide a sense if they liked a particular anime or not.

In [4]:
# Removing the mean
user_item_matrix_demeaned = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0)

In [5]:
kNN = NearestNeighbors(n_neighbors=20, metric='cosine', n_jobs=8)
kNN.fit(user_item_matrix_demeaned)
distances, indices = kNN.kneighbors(user_item_matrix_demeaned.values, n_neighbors=6)

In [6]:
user_item_matrix.head()

user_id,1,3,4,5,7,11,13,14,17,21,...,73495,73499,73500,73501,73502,73503,73504,73507,73510,73515
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,...,10.0,9.0,0.0,0.0,0.0,9.0,10.0,9.0,0.0,10.0
5,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,...,8.0,0.0,0.0,0.0,0.0,7.0,10.0,8.0,0.0,10.0
6,0.0,0.0,-1.0,8.0,0.0,0.0,-1.0,0.0,7.0,0.0,...,-1.0,9.0,0.0,0.0,0.0,9.0,9.0,9.0,0.0,10.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.0,0.0,9.0,0.0,7.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
uid = 1
user_index = user_item_matrix.columns.tolist().index(uid)
rated_anime = user_item_matrix[uid][user_item_matrix[uid] != 0][user_item_matrix[uid] != -1].index.to_list()
# print(rated_anime)
print(anime_list[anime_list['anime_id'].isin(rated_anime)]['name'].to_list())
for i, anime_id in list(enumerate(user_item_matrix.index)):
    # .index is the anime_ids
    # print(i,anime_id)
    # print(user_item_matrix.iloc[i, user_index])

    # Unrated Movies:
    if user_item_matrix.iloc[i, user_index] == 0:
        similar_movies = list(indices[i])
        movie_distances = list(distances[i])
        # If the movie itself is in the list of similar movies
        if i in similar_movies:
            idx = similar_movies.index(i)
            similar_movies.remove(i)
            movie_distances.pop(idx) 

        movie_similarities = [1 - x for x in movie_distances]
        sum_ = 0

        for m in range(len(similar_movies)):
            sum_ += movie_similarities[m] * user_item_matrix.iloc[similar_movies[m], user_index]
        
        if sum(movie_similarities) != 0:
            user_item_matrix1.iloc[i, user_index] = sum_ / sum(movie_similarities)
        else:
            user_item_matrix1.iloc[i, user_index] = 0
    else:
        pass

recommendations = []

for i in user_item_matrix[user_item_matrix[uid] == 0].index.tolist():
    idx = user_item_matrix.index.tolist().index(i)
    predicted_rating = user_item_matrix1.iloc[idx, user_item_matrix1.columns.tolist().index(uid)]
    recommendations.append((i, predicted_rating))

recommendations.sort(key=lambda x: x[1], reverse=True)

print(anime_list[anime_list['anime_id'].isin([x[0] for x in recommendations[:10]])]['name'].to_list())

['High School DxD New', 'Sword Art Online', 'High School DxD', 'Highschool of the Dead']


In [15]:
print(anime_list[anime_list['anime_id'].isin([x[0] for x in recommendations[:10]])]['name'].to_list())

['Cowboy Bebop', 'Cowboy Bebop: Tengoku no Tobira', 'Fairy Tail', 'Mirai Nikki (TV)', 'High School DxD BorN', 'Btooom!', 'Deadman Wonderland', 'Hagure Yuusha no Aesthetica', 'Freezing Vibration', 'Sword Art Online: Extra Edition']
