In [2]:
# libraries import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
%matplotlib inline


In [3]:
import os
print(os.getcwd())

C:\Users\Ryan Yang\Desktop\MyAnimeList Recommendation Project


In [4]:
# load the data
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv', nrows = 1000000)

In [5]:
# take a first look
print(anime.shape)
anime.head()

(12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
print(rating.shape)
rating.head()

(1000000, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
# note that rating score is -1 if the user watched it but didn't assign a rating
# so we consider it as missing ratings

rating.rating.replace(to_replace={-1:np.nan}, regex=True, inplace=True)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [8]:
# split the data into train and test set
from sklearn.model_selection import train_test_split

rating_train, rating_test = train_test_split(rating, test_size=0.2, random_state=42)

In [9]:
# building collaborative filtering model from scratch

# first we need to calculate the number of unique users and animes
n_users = rating_train.user_id.unique().shape[0]
n_users

9285

In [10]:
n_animes = rating_train.anime_id.unique().shape[0]
n_animes

8085

In [11]:
# In order to perform user-based collaborative filtering model
# we need to create a user-item matrix which can be used
# to calculate the similarity between users and items

# each row represent a user and each column an anime
user_based_matrix = rating_train.pivot_table(index=['user_id'],
                                             columns=['anime_id'],
                                             values='rating')
print(user_based_matrix.shape)
user_based_matrix.head()

(9285, 7687)


anime_id,1,5,6,7,8,15,16,17,18,19,...,33934,33964,34015,34085,34103,34107,34136,34240,34283,34324
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,6.0,,6.0,,,...,,,,,,,,,,


In [12]:
rating = None

In [13]:
import gc
gc.collect()

63

In [14]:
import scipy as sp
#sp.sparse.csr_matrix(user_based_matrix.values)

In [15]:
# we deal with the missing ratings by
# first, subtract the row mean from each rating to standardize, and then normalize
user_matrix_norm = user_based_matrix.apply(lambda x : (x-np.mean(x))/(np.max(x)-np.min(x)),
                                           axis=1)

# second, fill the NaN with 0
user_matrix_norm.fillna(0, inplace=True)

# third, drop the columns with only zeros

In [16]:
user_matrix_norm.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,33934,33964,34015,34085,34103,34107,34136,34240,34283,34324
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.189489,0.0,0.189489,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
item_similarity_cos = cosine_similarity(user_matrix_norm.T)
user_similarity_cos = cosine_similarity(user_matrix_norm)

In [18]:
item_similarity_cos.shape

(7687, 7687)

In [19]:
user_similarity_cos.shape

(9285, 9285)

In [20]:
# now we calculate the similarity using the pairwise_distance function
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(user_matrix_norm, metric = 'cosine')
item_similarity = pairwise_distances(user_matrix_norm.T, metric = 'cosine')

In [21]:
user_matrix_norm.columns

Int64Index([    1,     5,     6,     7,     8,    15,    16,    17,    18,
               19,
            ...
            33934, 33964, 34015, 34085, 34103, 34107, 34136, 34240, 34283,
            34324],
           dtype='int64', name='anime_id', length=7687)

In [22]:
user_similarity.shape

(9285, 9285)

In [23]:
item_similarity.shape

(7687, 7687)

In [24]:
np.array_equal(user_similarity_cos, user_similarity)

False

In [25]:
user_similarity_cos.shape

(9285, 9285)

In [26]:
user_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [27]:
# pairwise distance is 1-cosine similarity according to sklearn
# according to our need, we will be using cosine similarity

In [28]:
# Combine the similarity matricies with the original data frame index and column
user_sim_df = pd.DataFrame(user_similarity_cos, index = user_matrix_norm.index, columns = user_matrix_norm.index)
item_sim_df = pd.DataFrame(item_similarity_cos, index = user_matrix_norm.columns, columns = user_matrix_norm.columns)
 

In [68]:
# a function that returns the top 10 animes with the highest cosine similarity value

def top_animes(anime_id):
    anime_name = anime_id_to_name(anime_id)
    count = 1
    print('Similar shows to {} include:\n'.format(anime_name))
    for item_id in item_sim_df.sort_values(by = anime_id[0], ascending = False).index[1:11]:
        print('No. {}: {}'.format(count, anime_id_to_name([item_id])))
        count +=1

In [69]:
item_sim_df.sort_values(by = 1, ascending = False).index[1:11]

Int64Index([5, 205, 43, 2251, 164, 2001, 199, 467, 44, 6], dtype='int64', name='anime_id')

In [70]:
anime_name

'Series([], )'

In [71]:
top_animes([5])

Similar shows to Cowboy Bebop: Tengoku no Tobira include:

No. 1: Cowboy Bebop
No. 2: Ghost in the Shell
No. 3: Samurai Champloo
No. 4: Toki wo Kakeru Shoujo
No. 5: Mononoke Hime
No. 6: Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
No. 7: Baccano!
No. 8: Ghost in the Shell: Stand Alone Complex
No. 9: Ghost in the Shell: Stand Alone Complex 2nd GIG
No. 10: Planetes


In [72]:
def top_users(user):
    if user not in user_matrix_norm.index:
        return('No data available on user {}'.format(user))
    
    print('Most Similar Users:\n')
    sim_values = user_sim_df.sort_values(by = user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by = user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values)
    for user, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user, sim))

In [73]:
top_users(3)

Most Similar Users:

User #3028, Similarity value: 0.38
User #3681, Similarity value: 0.34
User #2986, Similarity value: 0.34
User #4481, Similarity value: 0.33
User #656, Similarity value: 0.32
User #298, Similarity value: 0.31
User #2411, Similarity value: 0.30
User #2374, Similarity value: 0.30
User #4290, Similarity value: 0.29
User #1966, Similarity value: 0.29


In [74]:
top_users(1)

Most Similar Users:

User #6207, Similarity value: 0.00
User #6229, Similarity value: 0.00
User #6230, Similarity value: 0.00
User #6231, Similarity value: 0.00
User #6232, Similarity value: 0.00
User #6233, Similarity value: 0.00
User #6234, Similarity value: 0.00
User #6235, Similarity value: 0.00
User #6236, Similarity value: 0.00
User #6237, Similarity value: 0.00


In [108]:
# a function that constructs a list of lists containing the highest rated animes for
# each similar user and return the name of the show alnog with the frequency it appears

def similar_user_anime(user):
    if user not in user_matrix_norm.index:
        return('No available data on user {}'.format(user))
    
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = user_matrix_norm.loc[i, :].max()
        if max_score != 0:
            best.append(user_matrix_norm.loc[i,user_matrix_norm.loc[i, :] == max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            anime_name = anime_id_to_name([j])
            if j in most_common:
                most_common[anime_name] += 1
            else:
                most_common[anime_name] = 1
                
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [98]:
max_score = user_matrix_norm.loc[3, :].max()
index = user_matrix_norm.loc[3,user_matrix_norm.loc[3, :] == max_score].index.tolist()
anime_id_to_name(index)

'Fullmetal Alchemist: Brotherhood\n                    Sen to Chihiro no Kamikakushi\n                                       Death Note\n                          Boku dake ga Inai Machi\nAno Hi Mita Hana no Namae wo Bokutachi wa Mada...\n                      Kuroko no Basket 3rd Season\n                               Shokugeki no Souma\n                      Kuroko no Basket 2nd Season\n                               Shingeki no Kyojin\n                       Magi: The Kingdom of Magic\n                                 Kuroko no Basket\n                                    Dragon Ball Z\n                     Magi: The Labyrinth of Magic'

In [99]:
test = {}
test[12] = 12
test[13] = 212
test[42] = 123
test.items()

dict_items([(12, 12), (13, 212), (42, 123)])

In [100]:
# test for the function
sorted(test.items(), key=operator.itemgetter(0), reverse=True)

[(42, 123), (13, 212), (12, 12)]

In [117]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_id, user):
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:1000]
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:1000]
    rating_list = []
    weight_list = []
    
    for j, i in enumerate(sim_users):
        rating = user_based_matrix.loc[i, anime_id]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        else:
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            
    return sum(rating_list)/sum(weight_list)

In [102]:
# a function that convert anime_id to anime name
def anime_id_to_name(anime_id):
    return anime[anime.anime_id.isin(anime_id)].name.to_string(index=False)

In [103]:
anime[anime.anime_id.isin([32281,33709])].name.to_string(index=False)

'Kimi no Na wa.\nGaro: Guren no Tsuki Special'

In [104]:
anime[anime.anime_id.isin([1])].name.to_string(index=False)

'Cowboy Bebop'

In [105]:
top_animes([1])

Similar shows to Cowboy Bebop include:

No. 1: Cowboy Bebop: Tengoku no Tobira
No. 2: Samurai Champloo
No. 3: Ghost in the Shell
No. 4: Baccano!
No. 5: Mononoke Hime
No. 6: Tengen Toppa Gurren Lagann
No. 7: Sen to Chihiro no Kamikakushi
No. 8: Ghost in the Shell: Stand Alone Complex
No. 9: Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
No. 10: Trigun


In [106]:
top_users(3)

Most Similar Users:

User #3028, Similarity value: 0.38
User #3681, Similarity value: 0.34
User #2986, Similarity value: 0.34
User #4481, Similarity value: 0.33
User #656, Similarity value: 0.32
User #298, Similarity value: 0.31
User #2411, Similarity value: 0.30
User #2374, Similarity value: 0.30
User #4290, Similarity value: 0.29
User #1966, Similarity value: 0.29


In [109]:
similar_user_anime(3)

[('Sen to Chihiro no Kamikakushi', 1),
 ('Katekyo Hitman Reborn!', 1),
 ('Kurokami The Animation', 1),
 ('Fullmetal Alchemist: Brotherhood', 1),
 ('Steins;Gate', 1)]

In [118]:
predicted_rating(1, 3)

8.474795129573375

In [145]:
#create a list of every show watched by user #3

watched = user_based_matrix.loc[3, user_based_matrix.loc[3,:]>0].index.tolist()

In [146]:
# make a list of the squared errors between the actual and predicted ratings

errors = []
for i in watched:
    actual = user_based_matrix.loc[3, i]
    predicted = predicted_rating(i, 3)
    errors.append((actual-predicted)**2)

In [147]:
# compute the mean squared error
np.mean(errors)

0.8379175771849112