In [26]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [27]:
# Read the two input tables from the working directory: the title catalogue
# and the per-user rating records.
anime, rating = (pd.read_csv(csv_path) for csv_path in ('anime.csv', 'rating.csv'))

In [30]:
# Turn the -1 placeholder into NaN, then drop every row that has a missing value.
# NOTE(review): -1 presumably marks "watched but not scored" -- confirm against
# the dataset description.
#
# The cleaned column is assigned back explicitly. The previous chained
# `rating.rating.replace(..., inplace=True)` mutated a column accessor, which
# pandas Copy-on-Write treats as a temporary, so the -1 values could survive.
# `regex=True` is dropped because the match is on a numeric value.
rating = rating.assign(rating=rating['rating'].replace(-1, np.nan)).dropna()
rating.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0


In [50]:
# Upper bound on user_id kept for the analysis.
# NOTE(review): presumably chosen to keep the user x title pivot small enough
# to hold in memory -- confirm.
MAX_USER_ID = 10000

# Attach each rating to its title. Both tables carry a `rating` column, so the
# user's own score gets the `_user` suffix and is then renamed to `user_rating`;
# the catalogue's column keeps its plain name and is discarded by the column
# selection below.
merged = (
    rating
    .merge(anime, on='anime_id', suffixes=('_user', ''))
    .rename(columns={'rating_user': 'user_rating'})
    .loc[:, ['user_id', 'name', 'user_rating']]
)
merged_sub = merged[merged['user_id'] <= MAX_USER_ID]
merged_sub.head()


Unnamed: 0,user_id,name,user_rating
0,1,Highschool of the Dead,10.0
1,3,Highschool of the Dead,6.0
2,5,Highschool of the Dead,2.0
3,12,Highschool of the Dead,6.0
4,14,Highschool of the Dead,6.0


In [51]:
# Reshape the long rating records into a user x title matrix: one row per
# user_id, one column per title name, cells holding user_rating.
piv = pd.pivot_table(
    merged_sub,
    values='user_rating',
    index=['user_id'],
    columns=['name'],
)
print(piv.shape)
piv.head()

(9467, 7930)


name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [52]:
# Per-user normalisation: centre each user's ratings on their own mean and
# scale by their own rating range (max - min). Done with vectorised row-aligned
# arithmetic instead of a Python lambda applied row by row.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)

# A user whose ratings are all identical has a range of 0. Mapping that to NaN
# makes the division yield NaN (never +/-inf), so the fillna below reliably
# zeroes the row and the user is then removed by the all-zero filter.
user_range = user_range.replace(0, np.nan)

piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# Unrated cells become 0, i.e. "no deviation from the user's mean".
piv_norm = piv_norm.fillna(0)

# Transpose to title x user so rows are items and columns are users.
piv_norm = piv_norm.T

# Drop users (columns) whose vector is entirely zero.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [53]:
# Store the title x user matrix in compressed sparse row form for the
# similarity computation that follows.
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())

In [54]:
# Pairwise cosine similarity in both orientations: rows of `piv_sparse` are
# titles (item-item), rows of its transpose are users (user-user).
item_similarity, user_similarity = (
    cosine_similarity(matrix) for matrix in (piv_sparse, piv_sparse.T)
)

In [55]:
# Label the raw similarity arrays so they can be queried by title name and by
# user id: piv_norm's index holds the titles, its columns hold the users.
title_labels = piv_norm.index
user_labels = piv_norm.columns

user_sim_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)
item_sim_df = pd.DataFrame(item_similarity, index=title_labels, columns=title_labels)

In [56]:
def top_animes(anime_name):
    """Print the ten titles most similar to ``anime_name``.

    Similarities are read from the module-level ``item_sim_df`` (title x title).

    Parameters
    ----------
    anime_name : label
        A title present in ``item_sim_df``.

    Returns
    -------
    None or str
        ``None`` after printing the ranking; a "no data" message string when
        the title is unknown (mirrors how the user-based helpers report an
        unknown user, instead of raising ``KeyError``).
    """
    if anime_name not in item_sim_df.columns:
        return('No data available on anime {}'.format(anime_name))

    print('Similar shows to {} include:\n'.format(anime_name))

    # Remove the queried title by label rather than assuming it sorts into
    # position 0: ties or rounding in the similarity values could put another
    # title first, which would both hide that title and list the query itself.
    neighbours = (
        item_sim_df[anime_name]
        .drop(anime_name)
        .sort_values(ascending=False)
        .head(10)
    )
    for rank, title in enumerate(neighbours.index, start=1):
        print('No. {}: {}'.format(rank, title))

In [57]:
def top_users(user):
    """Print the ten users most similar to ``user`` with their similarity.

    Similarities are read from the module-level ``user_sim_df`` (user x user).

    Parameters
    ----------
    user : label
        A user id; must be a column of ``piv_norm`` to have any data.

    Returns
    -------
    None or str
        ``None`` after printing the ranking; a "no data" message string when
        the user is not a column of ``piv_norm``.
    """
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))

    print('Most Similar Users:\n')

    # Sort the user's similarity column once, and exclude the user by label
    # instead of assuming they occupy position 0 after sorting (ties or
    # rounding can reorder the top entries).
    neighbours = (
        user_sim_df[user]
        .drop(user)
        .sort_values(ascending=False)
        .head(10)
    )
    # Distinct loop names so the `user` argument is not overwritten.
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [58]:
def similar_user_recs(user):
    """Return up to five titles most often top-scored by ``user``'s neighbours.

    For each of the ten most similar users, every title that attains that
    neighbour's maximum normalised score in ``piv_norm`` counts as one vote.
    Titles are ranked by vote count. Titles the target user has already rated
    are not filtered out.

    Parameters
    ----------
    user : label
        A user id; must be a column of ``piv_norm`` to have any data.

    Returns
    -------
    list of (title, votes) tuples or str
        At most five pairs ordered by descending vote count; a "no data"
        message string when the user is not a column of ``piv_norm``.
    """
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))

    # Exclude the user by label rather than by assuming they sort first.
    sim_users = (
        user_sim_df[user]
        .drop(user)
        .sort_values(ascending=False)
        .head(10)
        .index
    )

    most_common = {}
    for neighbour in sim_users:
        scores = piv_norm.loc[:, neighbour]
        # Every title tied for this neighbour's best score gets a vote.
        for title in scores.index[scores == scores.max()]:
            most_common[title] = most_common.get(title, 0) + 1

    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]


In [59]:
def predicted_rating(anime_name, user, n_neighbors=999):
    """Predict ``user``'s rating of ``anime_name`` from similar users' ratings.

    The prediction is the similarity-weighted average of the raw ratings
    (from ``piv``) given to the title by the ``n_neighbors`` users most similar
    to ``user`` (from ``user_sim_df``); neighbours who did not rate the title
    are ignored.

    Parameters
    ----------
    anime_name : label
        Column of ``piv`` to predict.
    user : label
        User id to predict for; must be a column of ``user_sim_df``.
    n_neighbors : int, optional
        How many of the most similar users to consult. The default of 999
        matches the previously hard-coded ``[1:1000]`` slice.

    Returns
    -------
    float
        The weighted-average rating, or NaN when no consulted neighbour rated
        the title or their similarities sum to zero (previously a
        ``ZeroDivisionError`` / non-finite result).

    Raises
    ------
    KeyError
        If ``user`` or ``anime_name`` is not a known label.
    """
    # Sort the similarity column once; drop the user by label rather than
    # assuming they land in position 0 after sorting.
    sims = (
        user_sim_df[user]
        .drop(user)
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    # One vectorised lookup instead of a scalar .loc call per neighbour.
    ratings = piv.loc[sims.index, anime_name].to_numpy(dtype=float)
    weights = sims.to_numpy(dtype=float)

    rated = ~np.isnan(ratings)
    total_weight = weights[rated].sum()
    if not rated.any() or total_weight == 0:
        return np.nan

    return np.dot(ratings[rated], weights[rated]) / total_weight

In [60]:
# Demo: print the ten titles closest to 'Naruto' by item-item cosine similarity.
top_animes('Naruto')

Similar shows to Naruto include:

No. 1: Naruto: Shippuuden Movie 1
No. 2: Bleach
No. 3: Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!
No. 4: Naruto: Shippuuden Movie 2 - Kizuna
No. 5: Naruto: Shippuuden Movie 6 - Road to Ninja
No. 6: Naruto: Shippuuden Movie 5 - Blood Prison
No. 7: Gunslinger Stratos
No. 8: Sword Gai
No. 9: Dragon Ball Z
No. 10: Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono


In [61]:
# Demo: print the ten users closest to user 7 by user-user cosine similarity.
top_users(7)

Most Similar Users:

User #9746, Similarity value: 0.19
User #2723, Similarity value: 0.19
User #6945, Similarity value: 0.18
User #559, Similarity value: 0.18
User #3, Similarity value: 0.17
User #345, Similarity value: 0.17
User #1106, Similarity value: 0.17
User #6836, Similarity value: 0.17
User #3038, Similarity value: 0.17
User #2715, Similarity value: 0.16


In [62]:
# Demo: titles most often top-scored by user 7's nearest neighbours, as (title, votes) pairs.
similar_user_recs(7)

[('Clannad: After Story', 5),
 ('Code Geass: Hangyaku no Lelouch R2', 5),
 ('Fullmetal Alchemist: Brotherhood', 5),
 ('Code Geass: Hangyaku no Lelouch', 4),
 ('Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.', 3)]

In [63]:
# Demo: similarity-weighted rating estimate for user 7 on 'Cowboy Bebop'.
predicted_rating('Cowboy Bebop', 7)


8.722425091816005

In [45]:
# Bare tuple expression: it only echoes (3, 7) and assigns nothing.
# NOTE(review): looks like leftover scratch work -- consider deleting this cell.
3, 7

(3, 7)

In [64]:
# Titles user 7 has a positive rating for (unrated cells are NaN and fail the
# comparison, so they are left out).
user_7_ratings = piv.loc[7, :]
watched = user_7_ratings[user_7_ratings > 0].index.tolist()


In [65]:
# Squared difference between user 7's recorded rating and the neighbour-based
# prediction, one entry per title in `watched`.
errors = [
    (piv.loc[7, title] - predicted_rating(title, 7)) ** 2
    for title in watched
]

In [49]:
# Root-mean-square error: square root of the mean of the squared errors in `errors`.
np.sqrt(np.mean(errors))

0.9139338720361506