In [36]:
# Import the required libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler


In [37]:
# Read the anime catalogue from the CSV file, then show the resulting table
df = pd.read_csv(filepath_or_buffer="D:\\anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [38]:
# Count how many missing values each column contains
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [39]:
# Fill missing categorical values with the most frequent value (mode) of each column.
# The result is assigned back to the column rather than calling
# df[col].fillna(..., inplace=True): that form is a chained in-place update on an
# intermediate Series, which emits a FutureWarning on pandas 2.x and leaves df
# unchanged when Copy-on-Write is enabled.
df['genre'] = df['genre'].fillna(df['genre'].mode()[0])
df['type'] = df['type'].fillna(df['type'].mode()[0])

In [40]:
# Fill missing ratings with the column mean.
# Assigned back explicitly: a chained df['rating'].fillna(..., inplace=True) does not
# reliably update df (no effect under pandas Copy-on-Write).
df['rating'] = df['rating'].fillna(df['rating'].mean())

In [41]:
# Re-count the nulls per column: now there are no missing values
df.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [42]:
# CountVectorizer converts the categorical genre text into count vectors.
# Each anime's genre field is a comma-separated list, so a custom tokenizer is used
# to keep multi-word genres together as single tokens.
def split_genres(genre_text):
    """Split a comma-separated genre string into individual genre tokens.

    Whitespace around each genre is stripped and empty pieces are dropped, so
    "Action, Comedy" and "Action,Comedy" yield the same tokens.
    """
    return [genre.strip() for genre in genre_text.split(",") if genre.strip()]


# token_pattern=None: the default regex is unused once a tokenizer is supplied, and
# leaving it set makes scikit-learn warn about the ignored parameter.
cv = CountVectorizer(tokenizer=split_genres, token_pattern=None)
genre_matrix = cv.fit_transform(df['genre'])



In [44]:
# Build one feature table: a numeric column per genre, plus the standardised rating
# when a rating column is available.
#
# genre_matrix is a scipy sparse matrix; passing it straight to pd.DataFrame does not
# expand it into one numeric column per genre, so it is densified first.
# Columns are named from the fitted vocabulary (ordered by column position) so they do
# not collide with the rating column's label after the concat.
genre_columns = [f"genre_{term}" for term in sorted(cv.vocabulary_, key=cv.vocabulary_.get)]
genre_features = pd.DataFrame(genre_matrix.toarray(), index=df.index, columns=genre_columns)

if 'rating' in df.columns:
    scaler = StandardScaler()
    ratings_scaled = scaler.fit_transform(df[['rating']])
    # Share df's index so the concat aligns row-for-row with the genre features
    rating_features = pd.DataFrame(ratings_scaled, index=df.index, columns=['rating_scaled'])
    features = pd.concat([genre_features, rating_features], axis=1)
else:
    features = genre_features

In [45]:
# Step 4: Hold out 20% of the rows for evaluation (conceptual only, because a
# content-based recommender has no labelled target to predict).
# The row index is split together with the features so each part can be traced back to df.
(train_features, test_features,
 train_indices, test_indices) = train_test_split(
    features,
    df.index,
    test_size=0.2,
    random_state=42,
)


In [46]:
# Now compute the pairwise cosine similarity between the genre vectors of all anime
cosine_sim = cosine_similarity(X=genre_matrix)

In [47]:
# Step 6: Recommendation function
def recommend_anime(target_index, top_n=5, threshold=0.1):
    """Return the names of the anime whose genres are most similar to a target anime.

    Parameters
    ----------
    target_index : int
        Row position of the target anime in ``cosine_sim`` (and therefore in ``df``).
        Negative values count from the end.
    top_n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty result.
    threshold : float, default 0.1
        Minimum cosine similarity a candidate must reach to be recommended.

    Returns
    -------
    numpy.ndarray
        Values of ``df['name']`` for the recommended rows, most similar first.
        Candidates with equal similarity keep their row order.

    Raises
    ------
    IndexError
        If ``target_index`` is outside the similarity matrix.
    """
    n_items = len(cosine_sim)
    if not -n_items <= target_index < n_items:
        raise IndexError(
            f"target_index {target_index} is out of range for {n_items} anime"
        )
    # Normalise negative positions so the self-exclusion below still matches
    target_pos = target_index % n_items

    # Keep every other anime whose similarity reaches the threshold
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[target_pos])
        if pos != target_pos and score >= threshold
    ]
    # Stable sort: most similar first, ties stay in row order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    recommended_positions = [pos for pos, _ in candidates[:max(top_n, 0)]]
    # Rows of cosine_sim are positional, so select names by position, not by index label
    return df['name'].iloc[recommended_positions].values

In [48]:
# Example usage: recommend titles similar to one chosen anime
target_anime_index = 0  # row of the anime to get recommendations for
print("Recommendations for:", df.at[target_anime_index, 'name'])
print(recommend_anime(target_index=target_anime_index, top_n=5, threshold=0.2))


Recommendations for: Kimi no Na wa.
['Wind: A Breath of Heart OVA' 'Wind: A Breath of Heart (TV)'
 'Aura: Maryuuin Kouga Saigo no Tatakai' 'Kokoro ga Sakebitagatterunda.'
 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen']


In [51]:

# Step 7: Evaluation (conceptual)
# For a real evaluation you need user interaction data (ratings).
# Here we simulate one: the first five held-out rows are treated as the "relevant"
# items and the top-5 recommendations for anime 0 as the "predicted" items.
# This is only illustrative, not a proper evaluation.
recommended_anime = recommend_anime(0, top_n=5, threshold=0.2)
# Vectorised membership tests replace a per-row Python loop with scalar df.loc lookups.
# y_true: 1 for rows whose index label is among the first five test indices.
y_true = df.index.isin(test_indices[:5]).astype(int).tolist()
# y_pred: 1 for rows whose name is among the recommended names.
y_pred = df['name'].isin(recommended_anime).astype(int).tolist()



In [52]:
# Score the simulated predictions against the simulated labels.
# zero_division=0 makes each metric evaluate to 0 (without a warning) when its
# denominator is zero, e.g. when nothing was predicted positive.
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

# Report the scores so the cell shows its result instead of discarding it
print(f"Precision: {precision:.3f} | Recall: {recall:.3f} | F1: {f1:.3f}")


In [54]:
# interview question

#1. Can you explain the difference between user-based and item-based collaborative filtering?
#User-based and item-based collaborative filtering are two approaches to making recommendations based on user interactions,
#such as ratings or purchases. User-based collaborative filtering focuses on finding users who are similar to a target user
#and recommends items that those similar users have liked. For example, if two users have rated many of the same movies
#similarly, the system assumes they have similar tastes and suggests movies liked by one user to the other.
#Item-based collaborative filtering instead compares the items themselves: two items are considered similar when the same
#users tend to rate them alike, and a user is recommended items that are similar to the ones they have already liked.



#2. What is collaborative filtering, and how does it work?
#Collaborative filtering is a popular technique used in recommendation systems to suggest items to users based
#on the preferences or behavior of other users. It works on the principle that users who have agreed in the past will 
#likely agree in the future, or that users with similar tastes will enjoy similar items. There are two main types: 
#user-based and item-based.