In [None]:
# load library
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the anime catalogue from CSV and preview the first five rows.
# NOTE(review): '/content/anime.csv' is an absolute path (looks like a Colab
# upload location - confirm); the file must exist there before this cell runs.
data = pd.read_csv('/content/anime.csv')
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [None]:
# Count the missing entries in every column and print them under a heading
missing_per_column = data.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)


Missing Values:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [None]:
# Fill missing 'genre' and 'type' with "Unknown".
# The filled column is assigned back instead of calling
# data[col].fillna(..., inplace=True): that form mutates a temporary object and
# leaves `data` unchanged when pandas Copy-on-Write is active.
data['genre'] = data['genre'].fillna('Unknown')
data['type'] = data['type'].fillna('Unknown')

# Fill missing ratings with 0 (or you can use mean if preferred)
data['rating'] = data['rating'].fillna(0)


In [None]:
# Convert episodes column from object to numeric.
# Invalid strings like "Unknown" will be converted to NaN.
episodes_numeric = pd.to_numeric(data['episodes'], errors='coerce')

# Fill missing episode values with the median.
# The result is assigned back rather than using fillna(..., inplace=True) on
# data['episodes'], which does not update `data` under pandas Copy-on-Write.
data['episodes'] = episodes_numeric.fillna(episodes_numeric.median())

# Confirm data types are now correct
print(data.dtypes)


anime_id      int64
name         object
genre        object
type         object
episodes    float64
rating      float64
members       int64
dtype: object


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Missing ratings were filled with 0 earlier, so the `rating` statistics include
# those placeholder zeros.
data.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,12294.0,12294.0,12294.0,12294.0
mean,14058.221653,12.095412,6.352786,18071.34
std,11455.294701,46.244062,1.343119,54820.68
min,1.0,1.0,0.0,5.0
25%,3484.25,1.0,5.82,225.0
50%,10260.5,2.0,6.55,1550.0
75%,24794.5,12.0,7.17,9437.0
max,34527.0,1818.0,10.0,1013917.0


In [None]:
# Replace NaNs with empty string so the vectorizer only ever receives strings.
# NOTE(review): 'genre' NaNs were already replaced with "Unknown" in an earlier
# cell, so this is presumably a no-op safety net - confirm before relying on ''.
data['genre'] = data['genre'].fillna('')

In [None]:
# Use TF-IDF to convert genres into numerical features (one row per anime).
# NOTE: the vectorizer's default token pattern splits on punctuation and
# whitespace, so a genre such as "Sci-Fi" is scored as separate word tokens
# rather than as one genre label.
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(data['genre'])

# Only the last expression of a cell is displayed, so show just the matrix.
genre_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 40480 stored elements and shape (12294, 47)>

In [None]:
# Pairwise cosine similarity between the genre vectors of all anime.
# With the second argument omitted the matrix is compared against itself.
cosine_sim = cosine_similarity(genre_matrix)
cosine_sim

array([[1.        , 0.14784981, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14784981, 1.        , 0.1786367 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.1786367 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [None]:
# Create a Series to map anime names to their row positions in `data`.
# Series.drop_duplicates() compares the values (the row positions, which are
# already unique), so it never removed a repeated name. De-duplicate on the
# index labels instead, keeping the first row per name, so that
# anime_indices[name] always yields a single position.
name_to_row = pd.Series(data.index, index=data['name'])
anime_indices = name_to_row[~name_to_row.index.duplicated(keep='first')]
anime_indices

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kimi no Na wa.,0
Fullmetal Alchemist: Brotherhood,1
Gintama°,2
Steins;Gate,3
Gintama&#039;,4
...,...
Toushindai My Lover: Minami tai Mecha-Minami,12289
Under World,12290
Violence Gekiga David no Hoshi,12291
Violence Gekiga Shin David no Hoshi: Inma Densetsu,12292


In [None]:
# Hold out 20% of the rows for testing and keep 80% for training;
# the fixed seed makes the shuffle reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Renumber both partitions from 0 so row labels match row positions
train_data, test_data = (
    part.reset_index(drop=True) for part in (train_data, test_data)
)

# Show the two partitions separated by a divider line
print(train_data, '------------', test_data, sep='\n')

      anime_id                               name  \
0         5342                  Asura Cryin&#039;   
1         9581                       MM! Specials   
2         9810     Nyani ga Nyandaa Nyandaa Kamen   
3         1539  Touch: Cross Road - Kaze no Yukue   
4         4439                  Kurenai Sanshirou   
...        ...                                ...   
9830      4638                           Milkyway   
9831      5272        Tondemo Nezumi Daikatsuyaku   
9832      1262           Macross II: Lovers Again   
9833     22819                     Aikatsu! Movie   
9834      2364          Virus: Virus Buster Serge   

                                                  genre     type  episodes  \
0                           Action, Mecha, Supernatural       TV      13.0   
1                                 Comedy, Ecchi, School  Special       9.0   
2                                                Comedy       TV      83.0   
3                              Romance, Shounen, Sp

In [None]:
# Learn the genre vocabulary from the training rows only and vectorise them
train_genres = train_data['genre'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix_train = tfidf.fit_transform(train_genres)

In [None]:
# Pairwise cosine similarity restricted to the training rows.
# With the second argument omitted the matrix is compared against itself.
cosine_sim_train = cosine_similarity(genre_matrix_train)
cosine_sim_train

array([[1.        , 0.        , 0.        , ..., 0.2513485 , 0.        ,
        0.41260387],
       [0.        , 1.        , 0.35694953, ..., 0.        , 0.24993161,
        0.        ],
       [0.        , 0.35694953, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.2513485 , 0.        , 0.        , ..., 1.        , 0.        ,
        0.47237654],
       [0.        , 0.24993161, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.41260387, 0.        , 0.        , ..., 0.47237654, 0.        ,
        1.        ]])

In [None]:
# The training TF-IDF matrix (genre_matrix_train) and similarity matrix
# (cosine_sim_train) are already built by the two preceding cells. Refitting and
# recomputing them here produced identical values while allocating a second
# dense similarity matrix, so this cell now only builds the name lookup.

# Map anime names to indices in training set.
# Series.drop_duplicates() compares the values (row positions, already unique)
# and therefore never removed a repeated name; de-duplicate on the index labels
# instead so train_indices[name] is always a single position (first occurrence).
train_name_to_row = pd.Series(train_data.index, index=train_data['name'])
train_indices = train_name_to_row[~train_name_to_row.index.duplicated(keep='first')]

In [None]:
def recommend_from_train(title, threshold=0.3, top_n=10):
    """Recommend training-set anime whose genres are similar to ``title``.

    Looks ``title`` up in ``train_indices``, reads its row of
    ``cosine_sim_train`` and returns the names (from ``train_data``) of the
    most similar other rows.

    Parameters
    ----------
    title : str
        Exact anime name to look up.
    threshold : float, default 0.3
        Minimum cosine similarity a candidate must reach to be returned.
    top_n : int, default 10
        Maximum number of names to return; values <= 0 yield an empty list.

    Returns
    -------
    list of str or str
        Names ordered by descending similarity (ties keep row order), or a
        "not found" message string when ``title`` is not in the training data.
    """
    if title not in train_indices:
        return f"'{title}' not found in training data."

    idx = train_indices[title]
    # A name that occurs more than once makes the lookup return a Series of
    # positions instead of a scalar; use the first occurrence so the row
    # lookup and the self-exclusion below operate on a single position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # A non-positive limit means "no recommendations"; without this guard a
    # negative top_n would slice from the end of the ranked list.
    if top_n <= 0:
        return []

    scores = np.asarray(cosine_sim_train[idx])

    # Keep rows that reach the threshold, excluding the queried title itself
    candidates = np.flatnonzero(scores >= threshold)
    candidates = candidates[candidates != idx]

    # Stable sort on the negated scores: highest similarity first, and equal
    # scores stay in ascending row order.
    ranking = np.argsort(-scores[candidates], kind='stable')
    top_indices = candidates[ranking][:top_n]

    return train_data['name'].iloc[top_indices].tolist()


In [None]:
# Example query: up to 5 training-set titles whose genre similarity to "Naruto" is at least 0.2
recommend_from_train("Naruto", threshold=0.2, top_n=5)

['Boruto: Naruto the Movie',
 'Naruto Shippuuden: Sunny Side Battle',
 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi',
 'Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!',
 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono']

In [None]:
# Upper bound advertised in the prompt below
MAX_RECOMMENDATIONS = 5

# Ask user for an anime title
user_input = input("Enter an anime name you like: ")

# Ask user how many recommendations they want.
# Non-numeric input falls back to the maximum instead of raising ValueError.
raw_top_n = input("How many recommendations would you like to see?(Max 5) ")
try:
    top_n = int(raw_top_n)
except ValueError:
    print(f"'{raw_top_n}' is not a whole number; showing {MAX_RECOMMENDATIONS} recommendations.")
    top_n = MAX_RECOMMENDATIONS

# Enforce the range promised by the prompt (at least 1, at most 5)
top_n = max(1, min(top_n, MAX_RECOMMENDATIONS))

# Recommend anime using the training-set-based recommender
recommendations = recommend_from_train(user_input, top_n=top_n)

# Display results: a string is the recommender's "not found" message,
# a list holds the recommended titles.
if isinstance(recommendations, str):
    print(recommendations)
elif not recommendations:
    print(f"No sufficiently similar anime found for '{user_input}'.")
else:
    print(f"\nRecommended anime similar to '{user_input}':\n")
    for i, anime in enumerate(recommendations, 1):
        print(f"{i}. {anime}")

Enter an anime name you like: Naruto
How many recommendations would you like to see?(Max 5) 5

Recommended anime similar to 'Naruto':

1. Boruto: Naruto the Movie
2. Naruto Shippuuden: Sunny Side Battle
3. Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
4. Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!
5. Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono


# Interview Questions

Can you explain the difference between user-based and item-based collaborative filtering?

In [None]:
# User-Based Collaborative Filtering

# Recommends items to a user based on what similar users liked.
# Idea: "If User A and User B both liked Naruto, and User A also liked Bleach, then recommend Bleach to User B."

In [None]:
# Item-Based Collaborative Filtering

# Recommends items that are similar to ones the user already liked, where two items count as similar when the same users tend to like both.
# Idea: "Since you liked Naruto, and people who liked Naruto also liked Bleach, we’ll recommend Bleach."

What is collaborative filtering, and how does it work?

In [None]:
# Collaborative filtering is a recommendation technique that predicts what a user will like from the preferences of many users, rather than from the attributes of the items themselves.

# How does it work?

# 1. It builds a user-item interaction matrix (ratings, clicks, likes, etc.).

# 2. It finds patterns, such as:

    # Which users are similar?

    # Which items are often liked together?

# 3. Based on these patterns, it makes personalized recommendations.