In [43]:
##Can you explain the difference between user-based and item-based collaborative filtering?
#User-Based Collaborative Filtering (UBCF)
#Concept:Recommends items based on the preferences of similar users.
#Assumes that users who liked similar things in the past will like similar things in the future.

#How it works: Identify users who are similar to the target user (using similarity metrics like cosine similarity, Pearson correlation, etc.).
#Recommend items that those similar users liked but the target user hasn’t interacted with yet.

#Example: If User A and User B both like Naruto and Attack on Titan, and User B also likes Death Note, then User A might be recommended Death Note.
# Pros:Captures taste patterns among users.

#Cons:Suffers when the number of users grows (less scalable).
#     Cold start problem for new users with few ratings.

##Item-Based Collaborative Filtering (IBCF)
# Concept:Recommends items that are similar to what the user already liked.
#Assumes that if a user likes one item, they will like similar items.

#How it works:Compute similarity between items based on users' rating patterns.
#For a given user, recommend items that are similar to the ones they've already rated highly.

#Example:If a user liked Naruto, and Bleach is similar to Naruto based on user ratings, then recommend Bleach.

#Pros:More scalable (number of items is usually less than users).
#          Better performance with sparse data.

#Cons:Might miss user-specific tastes.

###Summary Table:
#Feature:	           User-Based CF 	                            Item-Based CF
#Based on:	        Similarity between users	                  Similarity between items
#Focus:	            Who is similar to the user?	                  What is similar to what the user liked?
#Works best when:	Users have many overlapping ratings	          Items have many ratings
#Scalability:  	        Less scalable	                               More scalable
#Cold Start Problem:	New users	                                   New items

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# Load and clean data
df = pd.read_csv("anime.csv")
df = df.dropna(subset=["genre", "type", "rating"])
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())
df['genre'] = df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

# Encode genres
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Normalize numeric features
scaler = MinMaxScaler()
numerical_df = pd.DataFrame(scaler.fit_transform(df[['rating', 'episodes', 'members']]), 
                            columns=['rating', 'episodes', 'members'])

# Combine features
features_df = pd.concat([genre_df, numerical_df], axis=1)

# Split dataset and reset indices
train_df, test_df, train_features, test_features = train_test_split(
    df, features_df, test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_features = train_features.reset_index(drop=True)
test_features = test_features.reset_index(drop=True)

# Fit Nearest Neighbors model
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(train_features)

# Store train genres as sets for quick comparison
train_genres = train_df['genre'].apply(set).tolist()

# Evaluation
top_n = 5
precision_list, recall_list, f1_list = [], [], []

for i, row in test_df.iterrows():
    test_genres = set(row['genre'])
    if not test_genres:
        continue  # Skip if genre is empty

    test_vector = test_features.iloc[[i]]
    distances, indices = nn_model.kneighbors(test_vector, n_neighbors=top_n)

    rec_indices = indices[0]
    rec_genres = [train_genres[j] for j in rec_indices]

    # Relevance: at least one overlapping genre
    relevant = [1 if len(test_genres.intersection(g)) > 0 else 0 for g in rec_genres]
    true_positives = sum(relevant)

    precision = true_positives / top_n
    recall = true_positives / len(test_genres)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Show average metrics
print(f"\n📊 Evaluation Results (Top-{top_n} Recommendations):")
print(f"Average Precision: {np.mean(precision_list):.4f}")
print(f"Average Recall:    {np.mean(recall_list):.4f}")
print(f"Average F1 Score:  {np.mean(f1_list):.4f}")



📊 Evaluation Results (Top-5 Recommendations):
Average Precision: 1.0000
Average Recall:    2.3880
Average F1 Score:  1.3025


In [9]:
# Recommendation function
def recommend_anime(title, top_n=10):
    if title not in anime_indices:
        return f"Anime titled '{title}' not found in the dataset."
    
    idx = anime_indices[title]
    distances, indices = nn_model.kneighbors([features_df.iloc[idx]], n_neighbors=top_n+1)
    
    rec_indices = indices[0][1:]
    rec_distances = distances[0][1:]
    
    recommendations = df.iloc[rec_indices][['name', 'genre', 'rating']]
    recommendations['similarity_score'] = 1 - rec_distances
    
    return recommendations.reset_index(drop=True)

# Example: Recommend similar anime to Naruto
print(recommend_anime("Naruto", top_n=5))


                             name                                    genre  \
0      Sousei no Aquarion Special                          [Comedy, Mecha]   
1                Examurai Sengoku  [Action, Martial Arts, Samurai, Sci-Fi]   
2                         Kobato.                 [Comedy, Drama, Fantasy]   
3  R²: Rise R to the Second Power                                  [Music]   
4                   Flip Flappers                         [Comedy, Sci-Fi]   

   rating  similarity_score  
0    6.48          0.997027  
1    6.65          0.969183  
2    8.09          0.968273  
3    5.39          0.962833  
4    7.73          0.962131  


