In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv('anime.csv')

In [3]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


# Data Preprocessing:


In [6]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
num_col=[i for i in df.columns if df[i].dtype!='O']
num_col

['anime_id', 'rating', 'members']

In [8]:
cat_col=[i for i in df.columns if df[i].dtype=='O']
cat_col

['name', 'genre', 'type', 'episodes']

In [9]:
df.dropna(subset=['genre', 'type'], inplace=True)

In [10]:
df['rating'].fillna(df['rating'].mean(), inplace=True)


In [11]:
df.isnull().sum()



anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [12]:
df.shape

(12210, 7)

In [13]:
len(df.anime_id.unique())

12210

In [14]:
len(df.name.unique())

12208

In [15]:
len(df.genre.unique())

3260

In [16]:
len(df.type.unique())

6

In [17]:
len(df.episodes.unique())

187

In [18]:
len(df.rating.unique())

599

In [19]:
len(df.members.unique())

6691

In [20]:
df.groupby('name')['rating'].mean().sort_values(ascending=False)

name
Taka no Tsume 8: Yoshida-kun no X-Files    10.00
Spoon-hime no Swing Kitchen                 9.60
Mogura no Motoro                            9.50
Kimi no Na wa.                              9.37
Kahei no Umi                                9.33
                                           ...  
Hametsu no Mars                             2.37
Utsu Musume Sayuri                          2.14
Tenkuu Danzai Skelter+Heaven                2.00
Hi Gekiga Ukiyoe Senya Ichiya               1.92
Platonic Chain: Ansatsu Jikkouchuu          1.67
Name: rating, Length: 12208, dtype: float64

# Feature Extraction:

In [22]:

def clean_genre_column(genres):
    return ','.join(
        [label.strip().lower().replace("'", "").replace('"', '').replace('[', '').replace(']', '') 
         for label in genres.split(',')]
    )

df['genre'] = df['genre'].apply(clean_genre_column)


In [23]:
#Split the cleaned 'genre' column into lists
df['genre'] = df['genre'].str.split(',')

unique_labels = df['genre'].explode().unique()
print(f"Unique labels ({len(unique_labels)}): {unique_labels}")

Unique labels (43): ['drama' 'romance' 'school' 'supernatural' 'action' 'adventure' 'fantasy'
 'magic' 'military' 'shounen' 'comedy' 'historical' 'parody' 'samurai'
 'sci-fi' 'thriller' 'sports' 'super power' 'space' 'slice of life'
 'mecha' 'music' 'mystery' 'seinen' 'martial arts' 'vampire' 'shoujo'
 'horror' 'police' 'psychological' 'demons' 'ecchi' 'josei' 'shounen ai'
 'game' 'dementia' 'harem' 'cars' 'kids' 'shoujo ai' 'hentai' 'yaoi'
 'yuri']


In [24]:
df_cleaned = df.explode('genre') #Exploding the column ensures the list values are flattened before encoding

one_hot_genres = pd.get_dummies(df_cleaned['genre']).groupby(df_cleaned.index).max()
#  Combine the one-hot encoded columns with the original DataFrame
df = pd.concat([df.drop(columns=['genre']), one_hot_genres], axis=1)

In [25]:
df

Unnamed: 0,anime_id,name,type,episodes,rating,members,action,adventure,cars,comedy,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,TV,51,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,OVA,1,4.15,211,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,OVA,1,4.28,183,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,OVA,4,4.88,219,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,1,4.98,175,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df.columns

Index(['anime_id', 'name', 'type', 'episodes', 'rating', 'members', 'action',
       'adventure', 'cars', 'comedy', 'dementia', 'demons', 'drama', 'ecchi',
       'fantasy', 'game', 'harem', 'hentai', 'historical', 'horror', 'josei',
       'kids', 'magic', 'martial arts', 'mecha', 'military', 'music',
       'mystery', 'parody', 'police', 'psychological', 'romance', 'samurai',
       'school', 'sci-fi', 'seinen', 'shoujo', 'shoujo ai', 'shounen',
       'shounen ai', 'slice of life', 'space', 'sports', 'super power',
       'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype='object')

In [27]:
df.drop(columns=['anime_id','type', 'episodes'],inplace=True)

In [28]:
df.head()

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,9.37,200630,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,9.26,793665,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,9.25,114262,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,9.17,673572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,9.16,151266,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
min_max=MinMaxScaler()

In [31]:
df['rating']=min_max.fit_transform(df[['rating']])

In [32]:
df.head()

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,0.92437,200630,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.911164,793665,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,0.909964,114262,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,0.90036,673572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,0.89916,151266,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
df['rating'].unique()

array([0.92436975, 0.91116447, 0.90996399, 0.90036014, 0.89915966,
       0.89795918, 0.89555822, 0.89315726, 0.89195678, 0.88715486,
       0.88595438, 0.8847539 , 0.87755102, 0.87154862, 0.87034814,
       0.86554622, 0.8607443 , 0.85954382, 0.85834334, 0.85714286,
       0.85594238, 0.85354142, 0.85234094, 0.85114046, 0.84993998,
       0.8487395 , 0.84753902, 0.84633854, 0.84513806, 0.84273709,
       0.84153661, 0.84033613, 0.83913565, 0.83793517, 0.83673469,
       0.83433373, 0.83313325, 0.83193277, 0.83073229, 0.82953181,
       0.82833133, 0.82713085, 0.82593037, 0.82472989, 0.82352941,
       0.82232893, 0.82112845, 0.81992797, 0.81872749, 0.81752701,
       0.81632653, 0.81512605, 0.81392557, 0.81272509, 0.81152461,
       0.81032413, 0.80912365, 0.80792317, 0.80672269, 0.80552221,
       0.80432173, 0.80312125, 0.80192077, 0.80072029, 0.79951981,
       0.79831933, 0.79711885, 0.79591837, 0.79471789, 0.79351741,
       0.79231693, 0.79111645, 0.78991597, 0.78871549, 0.78751

In [34]:
df['members']=min_max.fit_transform(df[['members']])


In [35]:
df.head()

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,0.92437,0.197872,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.911164,0.78277,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,0.909964,0.112689,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,0.90036,0.664325,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,0.89916,0.149186,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
array_scaled=df['members'].unique()

In [37]:
print("Min value:", np.min(array_scaled))
print("Max value:", np.max(array_scaled))


Min value: 0.0
Max value: 1.0


# Recommendation system using cosine similarity:

In [39]:
df.columns

Index(['name', 'rating', 'members', 'action', 'adventure', 'cars', 'comedy',
       'dementia', 'demons', 'drama', 'ecchi', 'fantasy', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'magic',
       'martial arts', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'psychological', 'romance', 'samurai', 'school', 'sci-fi',
       'seinen', 'shoujo', 'shoujo ai', 'shounen', 'shounen ai',
       'slice of life', 'space', 'sports', 'super power', 'supernatural',
       'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype='object')

# item based

In [41]:
df

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,0.924370,0.197872,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.911164,0.782770,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,0.909964,0.112689,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,0.900360,0.664325,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,0.899160,0.149186,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,0.297719,0.000203,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,Under World,0.313325,0.000176,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,Violence Gekiga David no Hoshi,0.385354,0.000211,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,0.397359,0.000168,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:

df_melted = df.melt(
    id_vars=['name', 'rating', 'members'],  # Columns to keep
    value_vars=[col for col in df.columns if col not in ['name', 'rating', 'members']],  # Genre columns
    var_name='genre',  # Name for the genre column
    value_name='present'  # Binary value indicating presence
)

# Filter only rows where genre is present (value is 1)
df_filtered = df_melted[df_melted['present'] == 1]

df_pivot = df_filtered.pivot_table(
    index='name', 
    columns='genre', 
    values='rating',  
    aggfunc='mean'  
)

# Merge the 'members' column from the original DataFrame with the pivot table
df_pivot = df_pivot.merge(
    df[['name', 'members']].drop_duplicates(),
    left_index=True,  
    right_on='name', 
    how='left'  
)


df_pivot.set_index('name', inplace=True)

# Check the result
df_pivot.head()


Unnamed: 0_level_0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri,members
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,,,,,,,,,,,...,,,,,,,,,,0.001149
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",,,,,,,,,,,...,,,,,,,,,,0.000107
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,,,,0.647059,,,,,0.647059,,...,,,,,,,,,,0.014149
&quot;Bungaku Shoujo&quot; Memoire,,,,,,,0.704682,,,,...,,,,,,,,,,0.017761
&quot;Bungaku Shoujo&quot; Movie,,,,,,,0.715486,,,,...,,,,,,,,,,0.040417


In [43]:
df_pivot.fillna(0,axis=1,inplace=True) 

In [44]:
df_pivot.head()

Unnamed: 0_level_0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri,members
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001149
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000107
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.647059,0.0,0.0,0.0,0.0,0.647059,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014149
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.704682,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017761
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.715486,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040417


In [45]:
df_pivot.index

Index(['&quot;0&quot;',
       '&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu',
       '&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi',
       '&quot;Bungaku Shoujo&quot; Memoire',
       '&quot;Bungaku Shoujo&quot; Movie', '&quot;Eiji&quot;',
       '&quot;Eiyuu&quot; Kaitai', '.hack//G.U. Returner',
       '.hack//G.U. Trilogy', '.hack//G.U. Trilogy: Parody Mode',
       ...
       's.CRY.ed', 'vivi', 'xxxHOLiC', 'xxxHOLiC Kei',
       'xxxHOLiC Movie: Manatsu no Yoru no Yume', 'xxxHOLiC Rou',
       'xxxHOLiC Shunmuki', 'Üks Uks', 'ēlDLIVE', '◯'],
      dtype='object', name='name', length=12210)

In [46]:
similarities=cosine_similarity(df_pivot)  #internal similarity clacultns
similarities

array([[1.00000000e+00, 7.52304188e-07, 3.56418960e-05, ...,
        3.14435297e-07, 1.93236496e-05, 7.07106452e-01],
       [7.52304188e-07, 1.00000000e+00, 3.36368880e-06, ...,
        2.96746976e-08, 1.82366122e-06, 2.68916413e-07],
       [3.56418960e-05, 3.36368880e-06, 1.00000000e+00, ...,
        1.40589738e-06, 8.63995505e-05, 1.27404459e-05],
       ...,
       [3.14435297e-07, 2.96746976e-08, 1.40589738e-06, ...,
        1.00000000e+00, 7.62222870e-07, 7.07106529e-01],
       [1.93236496e-05, 1.82366122e-06, 8.63995505e-05, ...,
        7.62222870e-07, 1.00000000e+00, 6.90737419e-06],
       [7.07106452e-01, 2.68916413e-07, 1.27404459e-05, ...,
        7.07106529e-01, 6.90737419e-06, 1.00000000e+00]])

In [47]:
similarities[0] 

array([1.00000000e+00, 7.52304188e-07, 3.56418960e-05, ...,
       3.14435297e-07, 1.93236496e-05, 7.07106452e-01])

In [48]:
similarities.shape

(12210, 12210)

In [49]:
def recommended_anime(anime_name):
    if anime_name in df_pivot.index:             #check anime in there in our index
        index= np.where(anime_name==df_pivot.index)[0][0]     #pick the index value of anime [0][0]= [array][first element]
        
        similar_anime= sorted(list(enumerate(similarities[index])),key=lambda x: x[1],reverse=True)[1:6] 
        
        print(f'Recommended_Anime of {anime_name}')
        print('-'*20)
        
        for anime in similar_anime:   #loop in similar_anime to give the anime name from the score
            print(df_pivot.index[anime[0]])
            
        print('anime is not in the list')


In [50]:
recommended_anime('Jumanji (1995)')

In [51]:
recommended_anime('&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi')

Recommended_Anime of &quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi
--------------------
Koihime†Musou OVA
Pugyuru
Kyoukaisenjou no Horizon Specials
Quiz Magic Academy: The Original Animation
Demi-chan wa Kataritai
anime is not in the list


# Interview questions

In [144]:

#1. Can you explain the difference between user-based and item-based collaborative filtering?

In [146]:
#User-based Collaborative Filtering:
#This method recommends items to a user based on the preferences of other users who are similar to them.
#The idea is that if two users have similar preferences or behaviors in the past (e.g., they liked the same anime or movie), then the items liked by one user are recommended to the other.

In [148]:
# item-based Collaborative Filtering:
#This method recommends items based on the similarity between items. It assumes that if a user liked a particular item, they will also like other similar items.
#Instead of focusing on finding similar users, it focuses on finding similarities between items (based on user interactions) and recommends items similar to the ones a user has liked.

In [150]:
# 2. What is collaborative filtering, and how does it work?

In [152]:
# Collaborative Filtering (CF) is a method used in recommendation systems to suggest items (e.g., movies, products, anime) to users based on the preferences or behaviors of other users. 
#It is based on the idea that people who agreed in the past will agree in the future. 
#In other words, CF makes recommendations by gathering preferences from many users and looking for patterns in the data.

In [None]:
# Working:
#Data collection: CF requires historical user-item interaction data, such as ratings, clicks, or purchase history.
#Similarity calculation: For user-based CF, it calculates the similarity between users (e.g., using cosine similarity, Pearson correlation). For item-based CF, it calculates the similarity between items based on user interactions.
#Recommendation generation: Based on the similarity scores, the system recommends items that are similar to the ones the user has interacted with or liked before.