In [1]:
import os 
import numpy as np 
import pandas as pd 
import warnings
import scipy as sp 

from sklearn.metrics.pairwise import cosine_similarity

pd.options.display.max_columns

warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

# Preprocessing and Cleaning

In [2]:
for dirname, _, filenames in os.walk('C:\\Users\\udit hasija\\Downloads\\archive (1)'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

C:\Users\udit hasija\Downloads\archive (1)\anime.csv
C:\Users\udit hasija\Downloads\archive (1)\rating.csv


In [3]:
anime_path = 'C:\\Users\\udit hasija\\Downloads\\archive (1)\\anime.csv'
rating_path = 'C:\\Users\\udit hasija\\Downloads\\archive (1)\\rating.csv'

In [4]:
rating_df = pd.read_csv(rating_path)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


In [5]:
anime_df = pd.read_csv(anime_path)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,Synopsis
0,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,26.0,8.75,"Crime is timeless. By the year 2071, humanity ..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Sci-Fi",Movie,1.0,8.38,"Another day, another bounty—such is the life o..."
2,6,Trigun,"Action, Adventure, Sci-Fi",TV,26.0,8.22,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Drama, Mystery, Supernatural",TV,26.0,7.25,Robin Sena is a powerful craft user drafted in...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Supernatural",TV,52.0,6.94,It is the dark century and the people are suff...


In [6]:
print(f"anime set (row, col): {anime_df.shape}\n\nrating set (row, col): {rating_df.shape}")

anime set (row, col): (24905, 7)

rating set (row, col): (1048575, 3)


In [7]:
print("Anime:\n")
print(anime_df.info())
print("\n","*"*50,"\nRating:\n")
print(rating_df.info())

Anime:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  24905 non-null  int64  
 1   name      24905 non-null  object 
 2   genre     24905 non-null  object 
 3   type      24905 non-null  object 
 4   episodes  24294 non-null  float64
 5   rating    15692 non-null  float64
 6   Synopsis  24905 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.3+ MB
None

 ************************************************** 
Rating:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1048575 non-null  int64
 1   anime_id  1048575 non-null  int64
 2   rating    1048575 non-null  int64
dtypes: int64(3)
memory usage: 24.0 MB
None


Handling Missing Values

In [8]:
print("Anime missing values (%):\n")
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100) 
print("\n","*"*50,"\n\nRating missing values (%):\n")
print(round(rating_df.isnull().sum().sort_values(ascending=False)/len(rating_df.index),4)*100)

Anime missing values (%):

rating      36.99
episodes     2.45
anime_id     0.00
name         0.00
genre        0.00
type         0.00
Synopsis     0.00
dtype: float64

 ************************************************** 

Rating missing values (%):

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


In [9]:
print(anime_df['type'].mode())
print(anime_df['genre'].mode())

0    TV
Name: type, dtype: object
0    UNKNOWN
Name: genre, dtype: object


Deleting anime with 0 Rating

In [10]:

# deleting anime with 0 rating
anime_df=anime_df[~np.isnan(anime_df["rating"])]

# filling mode value for genre and type
anime_df['genre'] = anime_df['genre'].fillna(
anime_df['genre'].dropna().mode().values[0])

anime_df['type'] = anime_df['type'].fillna(
anime_df['type'].dropna().mode().values[0])

#checking if all null values are filled
anime_df.isnull().sum()

anime_id     0
name         0
genre        0
type         0
episodes    88
rating       0
Synopsis     0
dtype: int64

# feeture Engineering

In [11]:
rating_df['rating'] = rating_df['rating'].apply(lambda x: np.nan if x==-1 else x)
rating_df.head(20)

Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8
5,1,306,8
6,1,53,7
7,1,47,5
8,1,591,6
9,1,54,7


In [12]:
#step 1
anime_df = anime_df[anime_df['type']=='TV']

#step 2
rated_anime = rating_df.merge(anime_df, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])

#step 3
rated_anime =rated_anime[['user_id', 'name', 'rating']]

#step 4
rated_anime_7500= rated_anime[rated_anime.user_id <= 7500]
rated_anime_7500.head()

Unnamed: 0,user_id,name,rating
0,1,One Piece,8.69
1,20,One Piece,8.69
2,23,One Piece,8.69
3,67,One Piece,8.69
4,70,One Piece,8.69


In [13]:
pivot = rated_anime_7500.pivot_table(index=['user_id'], columns=['name'], values='rating')
pivot.head()

name,"""Oshi no Ko""",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,100-man no Inochi no Ue ni Ore wa Tatteiru 2nd Season,11eyes,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC◆Kei,ēlDLIVE
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,6.95,,,,,,,,...,,,,,,,7.36,,,
4,,6.87,,6.56,,7.19,,,,6.06,...,7.89,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
20,,,,,,,,,,,...,,,,,,,,,,
23,,,,,,7.19,,,,,...,,,,,,,,,,


In [14]:
# step 1
pivot_n = pivot.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

# step 2
pivot_n.fillna(0, inplace=True)

# step 3
pivot_n = pivot_n.T

# step 4
pivot_n = pivot_n.loc[:, (pivot_n != 0).any(axis=0)]

# step 5
piv_sparse = sp.sparse.csr_matrix(pivot_n.values)

In [26]:
pivot_n

user_id,1,4,9,20,23,37,47,48,53,66,...,7478,7483,7484,7487,7490,7494,7495,7496,7498,7499
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Oshi no Ko""",0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
.hack//Roots,0.000000,-0.102473,0.0,0.0,0.0,0.0,0.000000,-0.105598,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
.hack//Sign,-0.177262,0.000000,0.0,0.0,0.0,0.0,-0.272844,-0.086907,0.0,-0.132020,...,0.0,0.0,0.0,0.0,-0.1218,0.0,0.0,0.000000,-0.284268,-0.192614
.hack//Tasogare no Udewa Densetsu,0.000000,-0.181960,0.0,0.0,0.0,0.0,0.000000,-0.178028,0.0,-0.263777,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
009-1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
number24,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
s.CRY.ed,-0.060453,0.000000,0.0,0.0,0.0,0.0,0.000000,0.008888,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.000000
xxxHOLiC,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.153747,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.110354,0.000000,0.095901
xxxHOLiC◆Kei,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.207486,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.172349,0.000000,0.000000


In [38]:
print(piv_sparse)

  (0, 15)	0.4327105508569962
  (0, 80)	0.4205602217251231
  (0, 84)	0.3928094504365693
  (0, 111)	0.3346367874868402
  (0, 155)	0.38032594929146696
  (0, 210)	0.2705362195812113
  (0, 261)	0.3669774881978856
  (0, 274)	0.4669365721997302
  (0, 321)	0.2668799699997116
  (0, 328)	0.4462509099733074
  (0, 339)	0.35125997866313513
  (0, 371)	0.44518507141760105
  (0, 457)	0.4110119505564963
  (0, 517)	0.348999480031534
  (0, 719)	0.44897853185595576
  (0, 752)	0.4277309174216391
  (0, 822)	0.4422608296780485
  (0, 863)	0.46076505556991504
  (0, 865)	0.3829087751219917
  (0, 882)	0.3146314102564105
  (0, 969)	0.38324970275880166
  (0, 1034)	0.4656514120799834
  (0, 1060)	0.4137766554433221
  (0, 1125)	0.28736925724877543
  (0, 1156)	0.38997798233281183
  :	:
  (3853, 4232)	0.29948600174978157
  (3853, 4234)	0.32511982303060116
  (3853, 4254)	0.1817172091144695
  (3853, 4280)	0.2688032287854436
  (3853, 4311)	0.15205861389459588
  (3853, 4312)	0.1702561819429761
  (3853, 4330)	0.305119770580

In [16]:
#model based on anime similarity
anime_similarity = cosine_similarity(piv_sparse)

#Df of anime similarities
ani_sim_df = pd.DataFrame(anime_similarity, index = pivot_n.index, columns = pivot_n.index)

In [33]:
ani_sim_df

name,"""Oshi no Ko""",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,100% Pascal-sensei (TV),100-man no Inochi no Ue ni Ore wa Tatteiru,100-man no Inochi no Ue ni Ore wa Tatteiru 2nd Season,11eyes,...,ef: A Tale of Memories.,gdMen,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,number24,s.CRY.ed,xxxHOLiC,xxxHOLiC◆Kei,ēlDLIVE
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Oshi no Ko""",1.000000,-0.046321,-0.070077,-0.071556,-0.009053,-0.118344,-0.058144,-0.255031,-0.244773,-0.107604,...,0.100860,0.000000,0.000000,0.000000,-0.017592,-0.047057,-0.003957,0.030236,0.039707,-0.063475
.hack//Roots,-0.046321,1.000000,0.414776,0.460855,0.057554,0.120713,0.010247,0.035000,0.042847,0.100430,...,-0.150666,0.012345,0.024282,0.026532,0.075513,0.001986,0.105831,-0.187299,-0.136786,0.037143
.hack//Sign,-0.070077,0.414776,1.000000,0.459410,0.070373,0.138075,0.018292,0.046066,0.036641,0.074597,...,-0.146378,0.011253,0.021672,0.024050,0.090073,0.009077,0.185984,-0.193532,-0.126977,0.027606
.hack//Tasogare no Udewa Densetsu,-0.071556,0.460855,0.459410,1.000000,0.079736,0.087332,0.013694,0.071040,0.048233,0.104403,...,-0.162419,0.027850,0.009074,0.007498,0.095667,0.012812,0.125936,-0.195074,-0.131053,0.034194
009-1,-0.009053,0.057554,0.070373,0.079736,1.000000,0.010130,0.031857,0.014994,0.013670,0.051161,...,-0.122712,0.000000,0.000000,0.005246,0.071655,0.000000,0.003673,-0.135208,-0.096153,0.017186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
number24,-0.047057,0.001986,0.009077,0.012812,0.000000,0.023147,0.000000,0.049773,0.050875,0.066426,...,-0.032183,0.136602,0.000000,0.000000,0.000000,1.000000,-0.003921,-0.047734,-0.074657,0.063067
s.CRY.ed,-0.003957,0.105831,0.185984,0.125936,0.003673,0.051033,0.000000,-0.004577,0.001861,0.006589,...,-0.012579,-0.015876,-0.005047,-0.005194,0.014378,-0.003921,1.000000,-0.073666,-0.039914,-0.000614
xxxHOLiC,0.030236,-0.187299,-0.193532,-0.195074,-0.135208,-0.142462,-0.012428,-0.042912,-0.035359,-0.163141,...,0.229850,-0.012397,-0.009565,-0.014436,-0.112811,-0.047734,-0.073666,1.000000,0.524643,-0.005327
xxxHOLiC◆Kei,0.039707,-0.136786,-0.126977,-0.131053,-0.096153,-0.187318,0.000000,-0.050243,-0.048963,-0.162304,...,0.235263,-0.018992,-0.007328,-0.013763,-0.091582,-0.074657,-0.039914,0.524643,1.000000,0.000000


In [17]:
def anime_recommendation(ani_name):
    """
    This function will return the top 5 shows with the highest cosine similarity value and show match percent
    
    example:
    >>>Input: 
    
    anime_recommendation('Death Note')
    
    >>>Output: 
    
    Recommended because you watched Death Note:

                    #1: Code Geass: Hangyaku no Lelouch, 57.35% match
                    #2: Code Geass: Hangyaku no Lelouch R2, 54.81% match
                    #3: Fullmetal Alchemist, 51.07% match
                    #4: Shingeki no Kyojin, 48.68% match
                    #5: Fullmetal Alchemist: Brotherhood, 45.99% match 

               
    """
    
    number = 1
    print('Recommended because you watched {}:\n'.format(ani_name))
    for anime in ani_sim_df.sort_values(by = ani_name, ascending = False).index[1:6]:
        print(f'#{number}: {anime}, {round(ani_sim_df[anime][ani_name]*100,2)}% match')
        number +=1  

In [18]:
anime_recommendation('Naruto')

Recommended because you watched Naruto:

#1: Fullmetal Alchemist, 57.53% match
#2: Naruto: Shippuuden, 55.49% match
#3: Bleach, 50.08% match
#4: Death Note, 45.33% match
#5: Trigun, 45.09% match


In [19]:
anime_recommendation('Death Note')

Recommended because you watched Death Note:

#1: Fullmetal Alchemist, 59.27% match
#2: Code Geass: Hangyaku no Lelouch, 58.41% match
#3: Cowboy Bebop, 52.7% match
#4: Samurai Champloo, 49.91% match
#5: Darker than Black: Kuro no Keiyakusha, 49.79% match


In [20]:
anime_recommendation("Death Note")

Recommended because you watched Death Note:

#1: Fullmetal Alchemist, 59.27% match
#2: Code Geass: Hangyaku no Lelouch, 58.41% match
#3: Cowboy Bebop, 52.7% match
#4: Samurai Champloo, 49.91% match
#5: Darker than Black: Kuro no Keiyakusha, 49.79% match
