In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("anime.csv")

# --- Data Preprocessing ---

# Fill missing genres and types.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: the chained in-place form is
# deprecated in recent pandas and has no effect once Copy-on-Write is enabled.
df['genre'] = df['genre'].fillna('Unknown')
df['type'] = df['type'].fillna('Unknown')

# Convert episodes to numeric; anything non-numeric (e.g. 'Unknown') is coerced
# to NaN and then replaced with the median episode count
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

# Fill missing ratings with the mean
df['rating'] = df['rating'].fillna(df['rating'].mean())

# Split the comma-separated genre string into a list (non-string values -> [])
df['genre_list'] = df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

# One-hot encode genres: one 0/1 column per distinct genre label
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(df['genre_list']), columns=mlb.classes_, index=df.index)

# Normalize numerical features to [0, 1] so they are on the same scale as the
# one-hot genre columns
scaler = MinMaxScaler()
numerical_features = df[['rating', 'episodes', 'members']]
numerical_scaled = pd.DataFrame(scaler.fit_transform(numerical_features), columns=numerical_features.columns, index=df.index)

# Combine genre and numeric features into one matrix (one row per anime)
features = pd.concat([genre_encoded, numerical_scaled], axis=1)

# Create title to index mapping.
# Series.drop_duplicates() removes duplicate *values* (the row indices, which
# are already unique), so it never removed repeated titles. De-duplicate on the
# index (the title) instead, keeping the first occurrence, so that a lookup
# always yields a single scalar row index.
# Note: df keeps read_csv's default index, so these labels coincide with the
# row positions that .iloc expects downstream.
title_to_index = pd.Series(df.index, index=df['name'])
title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]

# --- Recommendation Function ---

def recommend_anime(anime_title, top_n=5):
    """Recommend the anime most similar to ``anime_title``.

    Similarity is the cosine similarity between the row of the module-level
    ``features`` matrix belonging to ``anime_title`` and every other row.

    Parameters
    ----------
    anime_title : str
        Title to look up in the module-level ``title_to_index`` mapping.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 give an
        empty result.

    Returns
    -------
    pandas.DataFrame or str
        A DataFrame with columns ``name``, ``rating``, ``members`` and
        ``similarity``, ordered from most to least similar, with a fresh
        0..n-1 index. If the title is not in the mapping, a human-readable
        "not found" message string is returned instead.
    """
    if anime_title not in title_to_index:
        return f"Anime titled '{anime_title}' not found in the dataset."

    # If the mapping still contains a repeated title, the lookup yields several
    # row indices; take the first so the target is always exactly one row.
    idx = int(np.atleast_1d(title_to_index[anime_title])[0])
    target_vector = features.iloc[idx].values.reshape(1, -1)

    # Compute cosine similarity only between the target and all rows
    # (avoids building the full pairwise matrix)
    sim_scores = cosine_similarity(target_vector, features).flatten()

    # Rank rows by descending similarity. The sort is stable so ties resolve
    # deterministically (by row order).
    ranked = np.argsort(-sim_scores, kind='stable')

    # Exclude the query row by its index rather than assuming it sorts first:
    # another row with an identical feature vector also scores 1.0 and could
    # otherwise take its place, leaving the query itself in the results.
    ranked = ranked[ranked != idx]
    similar_indices = ranked[:max(top_n, 0)]

    # Return recommended anime with scores
    recommendations = df.iloc[similar_indices][['name', 'rating', 'members']].copy()
    recommendations['similarity'] = sim_scores[similar_indices]

    return recommendations.reset_index(drop=True)

# --- Example Usage ---
# Demo call: swap "Steins;Gate" for any other title that exists in the dataset
print(recommend_anime(anime_title="Steins;Gate", top_n=5))


                                                name    rating  members  \
0         Steins;Gate Movie: Fuka Ryouiki no Déjà vu  8.610000   192424   
1              Steins;Gate: Oukoubakko no Poriomania  8.460000   159548   
2  Steins;Gate: Kyoukaimenjou no Missing Link - D...  8.340000    38147   
3                                      Steins;Gate 0  6.473902    60999   
4                                      Under the Dog  6.550000    29922   

   similarity  
0    0.965256  
1    0.959809  
2    0.936681  
3    0.928466  
4    0.772356  


In [43]:
##Can you explain the difference between user-based and item-based collaborative filtering?
#User-Based Collaborative Filtering (UBCF)
#Concept:Recommends items based on the preferences of similar users.
#Assumes that users who liked similar things in the past will like similar things in the future.

#How it works: Identify users who are similar to the target user (using similarity metrics like cosine similarity, Pearson correlation, etc.).
#Recommend items that those similar users liked but the target user hasn’t interacted with yet.

#Example: If User A and User B both like Naruto and Attack on Titan, and User B also likes Death Note, then User A might be recommended Death Note.
# Pros:Captures taste patterns among users.

#Cons:Scales poorly as the number of users grows (user-to-user similarities become expensive to compute).
#     Cold start problem for new users with few ratings.

##Item-Based Collaborative Filtering (IBCF)
# Concept:Recommends items that are similar to what the user already liked.
#Assumes that if a user likes one item, they will like similar items.

#How it works:Compute similarity between items based on users' rating patterns.
#For a given user, recommend items that are similar to the ones they've already rated highly.

#Example:If a user liked Naruto, and Bleach is similar to Naruto based on user ratings, then recommend Bleach.

#Pros:More scalable (there are usually fewer items than users).
#          Often performs better with sparse data.

#Cons:Might miss user-specific tastes.

###Summary Table:
#Feature:	           User-Based CF 	                            Item-Based CF
#Based on:	        Similarity between users	                  Similarity between items
#Focus:	            Who is similar to the user?	                  What is similar to what the user liked?
#Works best when:	Users have many overlapping ratings	          Items have many ratings
#Scalability:  	        Less scalable	                               More scalable
#Cold Start Problem:	New users	                                   New items