In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.sparse import hstack, csr_matrix
from tqdm import tqdm

In [2]:
anime_data = pd.read_csv('anime.csv')

In [3]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
anime_data.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [5]:
from scipy.stats import skew

In [6]:
anime_data['rating'].skew()

-0.5435700688578503

In [7]:
anime_data['rating'].median()

6.57

In [8]:
anime_data.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


approximately symmetric with slight negative skew

In [9]:
anime_data.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [10]:
anime_data['rating'] = anime_data['rating'].fillna(value=anime_data['rating'].median())

In [11]:
anime_data.isnull().sum()

anime_id     0
name         0
genre       62
type        25
episodes     0
rating       0
members      0
dtype: int64

In [12]:
anime_data = anime_data.dropna(subset = ['genre','type'])

In [13]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12210 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12210 non-null  int64  
 1   name      12210 non-null  object 
 2   genre     12210 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12210 non-null  object 
 5   rating    12210 non-null  float64
 6   members   12210 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 763.1+ KB


In [14]:
anime_data = anime_data.drop('anime_id',axis = 1)

In [15]:
anime_data.head(5)

Unnamed: 0,name,genre,type,episodes,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [16]:
anime_data['type'].value_counts()

TV         3777
OVA        3310
Movie      2306
Special    1674
ONA         655
Music       488
Name: type, dtype: int64

In [17]:
anime_community = anime_data.copy()
anime_community.sort_values(['members'],ascending = False)

Unnamed: 0,name,genre,type,episodes,rating,members
40,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917
86,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,25,8.54,896229
804,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
159,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
...,...,...,...,...,...,...
10989,Gan Gan Ganko-chan,"Comedy, Kids, Sci-Fi",TV,Unknown,6.57,19
8367,Chotto Ugoku!? &quot;Futeneko&quot;,Comedy,OVA,1,6.00,17
10464,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.00,13
10444,Sushi Azarashi,Comedy,TV,30,3.00,12


In [18]:
anime_community.sort_values(['rating'],ascending = False)

Unnamed: 0,name,genre,type,episodes,rating,members
10464,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.00,13
10400,Spoon-hime no Swing Kitchen,"Adventure, Kids",TV,Unknown,9.60,47
9595,Mogura no Motoro,Slice of Life,Movie,1,9.50,62
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
9078,Kahei no Umi,Historical,Movie,1,9.33,44
...,...,...,...,...,...,...
8056,Hametsu no Mars,"Horror, Sci-Fi",OVA,1,2.37,27557
8057,Utsu Musume Sayuri,"Comedy, Dementia",OVA,1,2.14,4047
8058,Tenkuu Danzai Skelter+Heaven,"Mecha, Sci-Fi",OVA,1,2.00,7680
12258,Hi Gekiga Ukiyoe Senya Ichiya,"Action, Hentai",Movie,1,1.92,129


In [19]:
filtered = anime_community[anime_community['members'] > 500]
filtered.sort_values(['rating'],ascending = False)

Unnamed: 0,name,genre,type,episodes,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...
8054,Kokuhaku,Horror,ONA,1,2.67,816
8055,Nami,Dementia,Movie,1,2.67,1817
8056,Hametsu no Mars,"Horror, Sci-Fi",OVA,1,2.37,27557
8057,Utsu Musume Sayuri,"Comedy, Dementia",OVA,1,2.14,4047


In [20]:
filtered = anime_community[anime_community['genre'].str.contains('Romance', case=False, na=False)]
top_romance_drama = filtered.sort_values(by='rating', ascending=False)
top_romance_drama.head(10)

Unnamed: 0,name,genre,type,episodes,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
10,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supern...",TV,24,9.06,456749
16,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,22,8.92,416397
21,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...,"Action, Drama, Historical, Martial Arts, Roman...",OVA,4,8.83,129307
25,Suzumiya Haruhi no Shoushitsu,"Comedy, Mystery, Romance, School, Sci-Fi, Supe...",Movie,1,8.81,240297
26,Monogatari Series: Second Season,"Comedy, Mystery, Romance, Supernatural, Vampire",TV,26,8.8,205959
35,Howl no Ugoku Shiro,"Adventure, Drama, Fantasy, Romance",Movie,1,8.74,333186
39,Bakuman. 3rd Season,"Comedy, Drama, Romance, Shounen",TV,25,8.71,133620
45,Kara no Kyoukai 5: Mujun Rasen,"Action, Drama, Mystery, Romance, Supernatural,...",Movie,1,8.68,111074
50,Yojouhan Shinwa Taikei,"Mystery, Psychological, Romance",TV,11,8.65,122531


In [29]:
train_data, test_data = train_test_split(anime_data, test_size=0.2, random_state=42)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [30]:
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
genre_train = vectorizer.fit_transform(train_data['genre'])
genre_test = vectorizer.transform(test_data['genre'])

In [34]:
train_data.head(5)

Unnamed: 0,name,genre,type,episodes,rating,members
0,Fate/kaleid liner Prisma☆Illya Specials,"Comedy, Ecchi, Fantasy, Magic",Special,5,6.88,16886
1,Narara Wondeogongju,"Action, Adventure, Sci-Fi, Space, Super Power",Movie,1,3.25,87
2,Dragon Collection,"Fantasy, Shounen",TV,51,6.39,3229
3,Pokemon Omega Ruby &amp; Alpha Sapphire: Mega ...,"Action, Adventure, Fantasy, Kids",ONA,1,7.04,4068
4,Recorder to Randoseru Mi☆,"Comedy, School, Seinen, Slice of Life",TV,12,6.8,12240


In [37]:
type(genre_train)

scipy.sparse.csr.csr_matrix

In [45]:
print(genre_train[0:4])

  (0, 3)	1
  (0, 7)	1
  (0, 8)	1
  (0, 16)	1
  (1, 0)	1
  (1, 1)	1
  (1, 28)	1
  (1, 35)	1
  (1, 37)	1
  (2, 8)	1
  (2, 32)	1
  (3, 8)	1
  (3, 0)	1
  (3, 1)	1
  (3, 15)	1


In [47]:
genre_train[0:4].toarray()

array([[0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [48]:
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(train_data[['rating', 'members']])
scaled_test = scaler.transform(test_data[['rating', 'members']])

In [50]:
scaled_train

array([[6.13861386e-01, 1.66493739e-02],
       [1.64603960e-01, 8.08748688e-05],
       [5.53217822e-01, 3.17976314e-03],
       ...,
       [5.63118812e-01, 3.08705292e-04],
       [7.26485149e-01, 2.76947112e-03],
       [4.52970297e-01, 3.58906887e-03]])

In [49]:
X_train = hstack([genre_train, csr_matrix(scaled_train)])
X_test = hstack([genre_test, csr_matrix(scaled_test)])

In [58]:
print(X_train.toarray())

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  6.13861386e-01 1.66493739e-02]
 [1.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.64603960e-01 8.08748688e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  5.53217822e-01 3.17976314e-03]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  5.63118812e-01 3.08705292e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.26485149e-01 2.76947112e-03]
 [1.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  4.52970297e-01 3.58906887e-03]]


In [59]:
type(X_train)

scipy.sparse.coo.coo_matrix

In [60]:
X_train = X_train.tocsr()
X_test = X_test.tocsr()

In [62]:
print(X_train.toarray())

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  6.13861386e-01 1.66493739e-02]
 [1.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.64603960e-01 8.08748688e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  5.53217822e-01 3.17976314e-03]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  5.63118812e-01 3.08705292e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.26485149e-01 2.76947112e-03]
 [1.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  4.52970297e-01 3.58906887e-03]]


In [64]:
def recommend_anime(title, top_n=5):
    if title not in train_data['name'].values:
        return f"'{title}' not found in training data."
    idx = train_data[train_data['name'] == title].index[0]
    anime_vector = X_train[idx]
    sim_scores = cosine_similarity(anime_vector, X_train)[0]
    top_indices = sim_scores.argsort()[::-1][1:top_n+1]
    return train_data.iloc[top_indices][['name', 'genre', 'rating', 'members']]

In [66]:
rating_threshold = anime_data['rating'].mean()
y_true, y_pred = [], []

for i in tqdm(range(X_test.shape[0]), desc="Evaluating on Split Data"):
    test_vector = X_test[i]
    sim_scores = cosine_similarity(test_vector, X_train)[0]
    top_indices = sim_scores.argsort()[::-1][:5]
    top_recommendations = train_data.iloc[top_indices]

    true_label = 1 if test_data.iloc[i]['rating'] >= rating_threshold else 0
    pred_label = 1 if any(top_recommendations['rating'] >= rating_threshold) else 0

    y_true.append(true_label)
    y_pred.append(pred_label)

# Output metrics
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Evaluating on Split Data: 100%|████████████| 2442/2442 [00:03<00:00, 618.07it/s]

Precision: 0.78
Recall: 1.00
F1-Score: 0.87





In [67]:
recommend_anime("Naruto")

Unnamed: 0,name,genre,rating,members
5585,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,533578
4904,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5,83515
577,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03,74690
7585,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58,23465
1176,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68,16868


- Loaded and cleaned the dataset, handling missing values in rating, genre, and type.

Explored data distribution, including checking for skewness and basic statistics.

Preprocessed text features (e.g., genre) using CountVectorizer and scaled numerical features with MinMaxScaler.

Combined features and computed cosine similarity to find similar anime titles.

Evaluated the model using precision, recall, and F1-score.

Used libraries like scikit-learn, pandas, and tqdm to manage and process data efficiently.

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering looks for users who have similar preferences and recommends items that those similar users liked. In contrast, item-based collaborative filtering focuses on finding items that are similar to what a user has already liked or rated, and then recommends those similar items. So user-based compares people, while item-based compares items.

2. What is collaborative filtering, and how does it work?

Collaborative filtering is a recommendation method that suggests items to a user based on the preferences or behavior of other users. It works by finding patterns in user-item interactions, like ratings or purchases, and assumes that if two users liked similar things in the past, they will like similar things in the future, or if an item is liked by people with similar tastes, it might be liked by the current user too.