In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("anime.csv")

In [3]:
data.shape

(12294, 7)

In [4]:
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Toy Story (1995),"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Jumanji (1995),"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Waiting to Exhale (1995),"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Father of the Bride Part II (1995),"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
data['genre'].isnull().count()

12294

In [6]:
data["genre"]

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12294, dtype: object

In [7]:
# converting text to feature vector

tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['genre'].values.astype('U'))

In [8]:
# calculate cosine similarity

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
# dictionary - product cosine similarity score

results = {}

for idx, row in data.iterrows():

    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]

    similar_items = [(cosine_similarities[idx][i], data['anime_id'][i]) for i in similar_indices]

    results[row['anime_id']] = similar_items[1:]

In [10]:
results

{32281: [(0.9999999999999999, 546),
  (0.9999999999999999, 547),
  (0.8930570237378614, 14669),
  (0.7094225066968285, 17585),
  (0.7094225066968285, 18053),
  (0.7094225066968285, 1624),
  (0.7094225066968285, 2129),
  (0.7094225066968285, 12175),
  (0.7094225066968285, 6351),
  (0.7094225066968285, 8481),
  (0.7094225066968285, 28725),
  (0.7094225066968285, 9988),
  (0.7094225066968285, 2927),
  (0.7094225066968285, 18045),
  (0.7094225066968285, 2926),
  (0.7094225066968285, 2179),
  (0.7094225066968285, 756),
  (0.6893319460997763, 32262),
  (0.6893319460997763, 26019),
  (0.5783994841551221, 33036),
  (0.5783994841551221, 1734),
  (0.5783994841551221, 1569),
  (0.5783994841551221, 15379),
  (0.5783994841551221, 1725),
  (0.5783994841551221, 14813),
  (0.5783994841551221, 23847),
  (0.5774899515796511, 31716),
  (0.5580859851289044, 22865),
  (0.5495727073101824, 11313),
  (0.5436296444809624, 18881),
  (0.5436296444809624, 8728),
  (0.5374056636864999, 1838),
  (0.537405663686499

In [11]:
# split product name from description
def item(anime_id):  
  return data.loc[data['anime_id'] == anime_id]['genre'].tolist()[0].split(' - ')[0]

In [12]:
# recommend product from results dictionary create earlier
def recommend(item_id, num):    
    recs = results[item_id][:num]
    recommendations = []
    for rec in recs: 
       recommendations.append((item(rec[1]),rec[0]))
    return recommendations

In [13]:
item(5)

'Action, Drama, Mystery, Sci-Fi, Space'

In [14]:
recommend(5,5)

[('Action, Drama, Mystery, Sci-Fi', 0.9134372982328338),
 ('Drama, Mystery, Sci-Fi', 0.7756312969826571),
 ('Action, Drama, Mystery, Sci-Fi, Thriller', 0.7550788987712448),
 ('Mystery, Sci-Fi, Space', 0.6950934785628836),
 ('Drama, Mystery, Sci-Fi, Seinen', 0.6634792803273937)]