In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
anime = pd.read_csv('anime.csv')

In [3]:
anime.shape

(12294, 7)

In [4]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
anime['genre'] = anime['genre'].fillna('')

In [8]:
anime['rating'] = anime['rating'].fillna(anime['rating'].mean())

In [9]:
anime['episodes'] = anime['episodes'].replace('Unknown', np.nan)

In [10]:
anime['episodes'] = anime['episodes'].astype(float)

In [11]:
anime['episodes'] = anime['episodes'].fillna(anime['episodes'].median())

In [12]:
anime['members'] = anime['members'].fillna(anime['members'].median())

In [13]:
anime['type'] = anime['type'].fillna('Unknown')

In [14]:
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(anime['genre'])

In [15]:
num_features = anime[['rating', 'episodes', 'members']]
scaler = MinMaxScaler()
num_scaled = scaler.fit_transform(num_features)

In [16]:
# Combine TF-IDF genre features and normalized numeric features
combined_features = np.hstack((genre_matrix.toarray(), num_scaled))

In [17]:
#  COSINE SIMILARITY 
similarity_matrix = cosine_similarity(combined_features, combined_features)

In [18]:
#  ANIME INDEX MAPPING 
index_map = pd.Series(anime.index, index=anime['name']).drop_duplicates()

In [19]:
# RECOMMENDATION FUNCTION
def recommend(anime_title, threshold=None, top_n=5):
    if anime_title in index_map.index:
        idx = index_map[anime_title]
        scores = list(enumerate(similarity_matrix[idx]))
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
        sorted_scores = sorted_scores[1:]
        
        if threshold:
            sorted_scores = [s for s in sorted_scores if s[1] >= threshold]
        
        sorted_scores = sorted_scores[:top_n]

        print(f"Recommended anime for '{anime_title}':")
        for i, score in sorted_scores:
            print(f"{index_map.index[i]}  (Similarity: {score:.3f})") 
    else:
        print("Anime not found in dataset")


In [20]:
recommend('Kimi no Na wa.', top_n=5)
print("\n--- Threshold Testing ---")
for t in [0.3, 0.4, 0.5, 0.6]:
    print(f"\nThreshold >= {t}")
    recommend('Kimi no Na wa.', threshold=t, top_n=10)


Recommended anime for 'Kimi no Na wa.':
Wind: A Breath of Heart OVA  (Similarity: 0.963)
Wind: A Breath of Heart (TV)  (Similarity: 0.959)
Aura: Maryuuin Kouga Saigo no Tatakai  (Similarity: 0.958)
Shakugan no Shana II (Second)  (Similarity: 0.918)
Angel Beats!: Another Epilogue  (Similarity: 0.916)

--- Threshold Testing ---

Threshold >= 0.3
Recommended anime for 'Kimi no Na wa.':
Wind: A Breath of Heart OVA  (Similarity: 0.963)
Wind: A Breath of Heart (TV)  (Similarity: 0.959)
Aura: Maryuuin Kouga Saigo no Tatakai  (Similarity: 0.958)
Shakugan no Shana II (Second)  (Similarity: 0.918)
Angel Beats!: Another Epilogue  (Similarity: 0.916)
Shakugan no Shana  (Similarity: 0.915)
Shakugan no Shana S  (Similarity: 0.908)
Harmonie  (Similarity: 0.907)
Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen  (Similarity: 0.902)
Kokoro ga Sakebitagatterunda.  (Similarity: 0.901)

Threshold >= 0.4
Recommended anime for 'Kimi no Na wa.':
Wind: A Breath of Heart OVA  (Similarity: 0.963)
Wind: A Br

### Next 2 cells are resubmission part

In [21]:
# --- Evaluation of the Recommendation System ---

# Example evaluation for a few sample anime
sample_titles = ['Naruto', 'Death Note', 'One Piece']

for title in sample_titles:
    print(f"\nRecommendations for '{title}':")
    recs = recommend(title, top_n=5, threshold=0.3)
    print(recs)


Recommendations for 'Naruto':
Recommended anime for 'Naruto':
Naruto: Shippuuden  (Similarity: 0.991)
Dragon Ball Z  (Similarity: 0.943)
Dragon Ball  (Similarity: 0.916)
Naruto: Shippuuden Movie 4 - The Lost Tower  (Similarity: 0.906)
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono  (Similarity: 0.906)
None

Recommendations for 'Death Note':
Recommended anime for 'Death Note':
Mirai Nikki (TV)  (Similarity: 0.905)
Mousou Dairinin  (Similarity: 0.838)
Death Note Rewrite  (Similarity: 0.834)
Higurashi no Naku Koro ni  (Similarity: 0.833)
Higurashi no Naku Koro ni Kai  (Similarity: 0.828)
None

Recommendations for 'One Piece':
Recommended anime for 'One Piece':
One Piece: Episode of Nami - Koukaishi no Namida to Nakama no Kizuna  (Similarity: 0.940)
Shingeki no Kyojin  (Similarity: 0.939)
One Piece: Episode of Merry - Mou Hitori no Nakama no Monogatari  (Similarity: 0.938)
One Piece: Episode of Sabo - 3 Kyoudai no Kizuna Kiseki no Saikai to Uketsugareru Ishi  (Similarity: 0.935)
Hu

## Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

-> User-Based Collaborative Filtering:
    a) Finds users similar to the target user.

    b) Recommends items liked by those similar users.

    c) Assumes similar users have similar tastes.

    d) Uses user–user similarity.

-> Item-Based Collaborative Filtering:

    a) Finds items similar to items the user already likes.

    b) Recommends those similar items.

    c) Assumes similar items attract similar users.

    d) Uses item–item similarity.


2.  What is collaborative filtering, and how does it work?

 -> Collaborative filtering recommends items based on user behavior and preferences by finding patterns in what users like or rate. It can be user-based, which looks for similar users, or item-based, which looks for similar items. By calculating these similarities, it predicts what a user might like and suggests items that are highly rated or closely related to their past preferences.