In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the CSV export into `df` and preview its first rows.
DATA_FILE = 'anime (2).csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [2]:
# Report per-column missing-value counts before any rows are removed.
print("Missing values:\n", df.isnull().sum())

# Clean in a single chained pass:
#   1. drop rows lacking genre, type, rating or episodes;
#   2. coerce `episodes` to numeric -- entries that cannot be parsed become NaN
#      (errors='coerce') and are removed by the second dropna;
#   3. split the comma-separated genre string into a list of labels. Values
#      that are not strings are passed through unchanged, so re-running this
#      cell leaves already-split lists intact (`.str.split` would turn them
#      into NaN);
#   4. renumber the rows 0..n-1. recommend_anime() takes a label from
#      `df.index` and uses it as a row number into the positional cosine_sim
#      matrix, which is only valid when labels and positions coincide -- and
#      they stop coinciding as soon as a row has been dropped.
df = (
    df.dropna(subset=['genre', 'type', 'rating', 'episodes'])
    .assign(episodes=lambda frame: pd.to_numeric(frame['episodes'], errors='coerce'))
    .dropna(subset=['episodes'])
    .assign(
        genre=lambda frame: frame['genre'].map(
            lambda value: value.split(', ') if isinstance(value, str) else value
        )
    )
    .reset_index(drop=True)
)
print("Shape after cleaning:", df.shape)
print(df.describe())

Missing values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
Shape after cleaning: (11830, 7)
           anime_id      episodes        rating       members
count  11830.000000  11830.000000  11830.000000  1.183000e+04
mean   13404.150211     12.486729      6.484609  1.851100e+04
std    11110.087616     47.097131      1.019147  5.537144e+04
min        1.000000      1.000000      1.670000  1.200000e+01
25%     3326.250000      1.000000      5.892500  2.322500e+02
50%     9820.500000      2.000000      6.570000  1.589500e+03
75%    23302.500000     12.000000      7.190000  9.832000e+03
max    34519.000000   1818.000000     10.000000  1.013917e+06


In [3]:
# Genre lists -> one indicator column per label found by the binarizer.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genre'])
genres_encoded = pd.DataFrame(genre_matrix, index=df.index, columns=mlb.classes_)

# Type -> dummy columns whose names carry the "type" prefix.
type_encoded = pd.get_dummies(df['type'], prefix='type')

# Numeric columns -> standardised with StandardScaler, keeping df's index
# and the original column names.
numerical = df[['episodes', 'rating', 'members']]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(numerical)
numerical_scaled = pd.DataFrame(scaled_values, index=df.index, columns=numerical.columns)

# Put the three groups side by side to form the feature matrix.
feature_blocks = [genres_encoded, type_encoded, numerical_scaled]
features = pd.concat(feature_blocks, axis=1)
print("Feature shape:", features.shape)

Feature shape: (11830, 52)


In [4]:
# Pairwise cosine similarity between every pair of rows of `features`.
cosine_sim = cosine_similarity(X=features)
sim_shape = cosine_sim.shape
print("Cosine sim matrix shape:", sim_shape)

Cosine sim matrix shape: (11830, 11830)


In [5]:
def recommend_anime(target_name, threshold=0.5, top_n=5):
    """Return the names of the titles most similar to ``target_name``.

    Reads two module-level objects: ``df`` (one row per title, with a
    ``name`` column) and ``cosine_sim`` (square similarity matrix). Row ``i``
    of ``cosine_sim`` is taken to describe the ``i``-th row of ``df`` by
    position.

    Parameters
    ----------
    target_name : str
        Exact value to look up in ``df['name']``. If several rows share the
        name, the first one is used.
    threshold : float, default 0.5
        Minimum similarity score (inclusive) a candidate must reach.
    top_n : int, default 5
        Maximum number of names to return.

    Returns
    -------
    list of str
        Candidate names ordered by decreasing similarity, excluding the
        target row itself. May be shorter than ``top_n`` or empty.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name == target_name``.
    """
    # Resolve the row *position*, not the index label. Once rows have been
    # dropped from df its labels no longer equal row numbers, while
    # cosine_sim, the candidate positions below and .iloc are all positional.
    matches = np.flatnonzero((df['name'] == target_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no anime named {target_name!r} found in df['name']")
    position = int(matches[0])

    scores = np.asarray(cosine_sim[position], dtype=float)
    # Stable descending sort: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    selected = [
        int(row) for row in ranked
        if row != position and scores[row] >= threshold
    ]
    return df['name'].iloc[selected[:top_n]].tolist()

# Demo: up to ten titles scoring at least 0.6 against 'Kimi no Na wa.'.
kimi_recommendations = recommend_anime('Kimi no Na wa.', threshold=0.6, top_n=10)
print(kimi_recommendations)

['Hotarubi no Mori e', 'Suzumiya Haruhi no Shoushitsu', 'Hotaru no Haka', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Kotonoha no Niwa', 'Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku', 'Howl no Ugoku Shiro', 'Toki wo Kakeru Shoujo', 'Koe no Katachi', 'Clannad: Mou Hitotsu no Sekai, Tomoyo-hen']


In [6]:
# Run the same query at three similarity cut-offs, from strictest to loosest,
# printing each result list behind its label.
threshold_runs = (
    ("Threshold 0.7:", 0.7),
    ("Threshold 0.5:", 0.5),
    ("Threshold 0.3:", 0.3),
)
for label, cutoff in threshold_runs:
    print(label, recommend_anime('Steins;Gate', threshold=cutoff, top_n=5))

Threshold 0.7: ['Death Note', 'Code Geass: Hangyaku no Lelouch', 'Shingeki no Kyojin', 'Psycho-Pass', 'Durarara!!']
Threshold 0.5: ['Death Note', 'Code Geass: Hangyaku no Lelouch', 'Shingeki no Kyojin', 'Psycho-Pass', 'Durarara!!']
Threshold 0.3: ['Death Note', 'Code Geass: Hangyaku no Lelouch', 'Shingeki no Kyojin', 'Psycho-Pass', 'Durarara!!']


In [7]:
# Difference between user-based and item-based collaborative filtering
# (one line of output per approach).
print(
    "User-based: Recommends items based on similar users' preferences.",
    "Item-based: Recommends items similar to those the user has liked, using item similarities.",
    sep="\n",
)

User-based: Recommends items based on similar users' preferences.
Item-based: Recommends items similar to those the user has liked, using item similarities.


In [8]:
# What is collaborative filtering, and how does it work?
# Each sentence of the answer is printed on its own line.
answer_lines = (
    "Collaborative filtering predicts user preferences based on patterns from many users.",
    "It works by finding similarities (e.g., cosine) in ratings or behaviors to suggest items.",
)
for sentence in answer_lines:
    print(sentence)

Collaborative filtering predicts user preferences based on patterns from many users.
It works by finding similarities (e.g., cosine) in ratings or behaviors to suggest items.
