In [31]:
import pandas as pd

# Load the dataset
anime_df = pd.read_csv("anime.csv")

# Display the structure
print("Dataset Shape:", anime_df.shape)
print("Columns:", anime_df.columns)
print(anime_df.head())

# Handle missing values.
# Rows lacking a name, genre or rating are removed. The index is then rebuilt
# so that row labels are contiguous (0..n-1) and equal to row positions: the
# similarity matrix built later is addressed by position, while the
# name -> index lookup stores labels, so the two must agree.
anime_df = (
    anime_df
    .dropna(subset=["name", "genre", "rating"])
    .reset_index(drop=True)
)
# "Unknown" episode counts are stored as 0 so the column can be cast to int.
anime_df["episodes"] = anime_df["episodes"].replace("Unknown", 0).astype(int)


Dataset Shape: (12294, 7)
Columns: Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse as sp

# Encode each genre string as a bag of genre tokens. The string is split on
# ", " so multi-word genres stay intact; token_pattern=None because a custom
# tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=lambda text: text.split(", "), token_pattern=None)
genre_matrix = vectorizer.fit_transform(anime_df["genre"])

# Rescale the rating column with a min-max scaler (fit on this same data).
scaler = MinMaxScaler()
rating_scaled = scaler.fit_transform(anime_df["rating"].to_frame())

# Final features: the genre columns followed by one scaled-rating column.
feature_matrix = sp.hstack([genre_matrix, rating_scaled])


In [53]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix (one row and one column per row of feature_matrix)
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# Map anime names to their row labels in anime_df.
# Series.drop_duplicates() compares *values* (the row labels, which are already
# unique), so it never removed repeated names. De-duplicate on the index
# instead, keeping the first row for each title, so that
# anime_indices[title] always yields a single label.
anime_indices = pd.Series(anime_df.index, index=anime_df["name"])
anime_indices = anime_indices[~anime_indices.index.duplicated(keep="first")]

# Recommendation function
def recommend_anime(title, top_n=10):
    """Return the names of the ``top_n`` anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact anime name, looked up in ``anime_indices``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The ``name`` entries of the most similar rows of ``anime_df``, ordered
        from most to least similar, or the string ``"Anime not found!"`` when
        ``title`` is not a key of ``anime_indices``.
    """
    if title not in anime_indices:
        return "Anime not found!"

    label = anime_indices[title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(label, pd.Series):
        label = label.iloc[0]

    # anime_indices stores row *labels*, but cosine_sim and .iloc below are
    # addressed by row *position*; translate explicitly so the two cannot
    # drift apart when anime_df's index has gaps (e.g. after dropna).
    pos = anime_df.index.get_loc(label)

    # Drop the query row by position rather than assuming it sorts first:
    # another row with an identical feature vector ties at similarity 1.0 and
    # could otherwise push the query itself into the results.
    ranked = sorted(
        ((other, score) for other, score in enumerate(cosine_sim[pos]) if other != pos),
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_positions = [other for other, _ in ranked[:top_n]]
    return anime_df["name"].iloc[top_positions]


In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the anime_df rows, with a fixed seed so the split is repeatable.
# Within this cell only `test` is used (to draw the titles that get evaluated);
# `train` is not referenced here.
# NOTE(review): cosine_sim is built from features of all of anime_df, so this
# split does not keep the test rows out of the recommender - confirm intended.
train, test = train_test_split(anime_df, test_size=0.2, random_state=42)

# Sample-based evaluation: check if recommended genres overlap
def evaluate_recommendations(sample_title):
    """Score how well the recommendations for ``sample_title`` match its genres.

    For every recommended anime, the score is the fraction of the query's
    genres that the recommendation shares; the function returns the mean of
    those fractions.

    Parameters
    ----------
    sample_title : str
        Exact anime name; must be a key of ``anime_indices``.

    Returns
    -------
    float
        Mean genre-overlap fraction in [0, 1]; ``0.0`` when there are no
        recommendations to score.

    Raises
    ------
    KeyError
        If ``sample_title`` is not present in ``anime_indices``.
    """
    if sample_title not in anime_indices:
        raise KeyError(f"Anime not found: {sample_title!r}")

    label = anime_indices[sample_title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(label, pd.Series):
        label = label.iloc[0]
    original_genre = set(anime_df.loc[label, 'genre'].split(', '))

    recommended = recommend_anime(sample_title)
    if len(recommended) == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    # recommend_anime returns a slice of anime_df["name"], so its index already
    # holds the row labels of the recommended anime. Using it directly avoids a
    # second lookup by name, which is ambiguous when a name is repeated.
    rec_genre_strings = anime_df.loc[recommended.index, 'genre']
    matches = [
        len(original_genre & set(genres.split(', '))) / len(original_genre)
        for genres in rec_genre_strings
    ]
    return sum(matches) / len(matches)

# Draw five titles from the held-out frame (seeded, so the draw is repeatable),
# score each one that the name lookup recognises, and report the mean score.
sample_titles = test['name'].sample(n=5, random_state=42)
scores = list(
    map(
        evaluate_recommendations,
        filter(lambda name: name in anime_indices, sample_titles),
    )
)
print("Average Genre Overlap Score:", sum(scores) / len(scores))


Average Genre Overlap Score: 0.95


In [37]:
# Demo: print a header, then the recommendations for one title.
# recommend_anime returns the string "Anime not found!" when the title is not a
# key of anime_indices, in which case that message is printed instead of names.
print("Recommended Anime for 'Naruto':")
print(recommend_anime("Naruto"))


Recommended Anime for 'Naruto':
615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
7628                              Kyutai Panic Adventure!
Name: name, dtype: object


In [None]:

from sklearn.model_selection import train_test_split

# Read the per-user ratings file (rating.csv must be present alongside the
# notebook) and keep only rows whose rating is strictly positive.
# NOTE(review): presumably non-positive values mark unrated entries - confirm
# against the dataset description.
ratings_df = pd.read_csv("rating.csv")
ratings_df = ratings_df.loc[ratings_df["rating"].gt(0)]

# 80/20 split of the individual rating rows, seeded for repeatability.
train_data, test_data = train_test_split(ratings_df, random_state=42, test_size=0.2)

# Users x anime table of training ratings for collaborative filtering;
# combinations with no training rating are NaN.
user_item_matrix = pd.pivot_table(
    train_data,
    values='rating',
    index='user_id',
    columns='anime_id',
)


In [None]:

from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

# Missing ratings become 0 so the similarity can be computed on a complete matrix.
user_item_filled = user_item_matrix.fillna(0)
# Cosine similarity between users = 1 - cosine distance.
user_similarity = 1 - pairwise_distances(user_item_filled, metric='cosine')

# User-based CF prediction: for each user, the similarity-weighted sum of all
# users' (zero-filled) ratings, divided by that user's total absolute
# similarity. keepdims=True gives a column vector so each row is divided by
# its own total.
user_pred_ratings = (
    user_similarity.dot(user_item_filled.to_numpy())
    / np.abs(user_similarity).sum(axis=1, keepdims=True)
)


In [None]:

from sklearn.metrics import precision_score, recall_score, f1_score

# Ratings at or above this value are labelled 1 ("positive"); lower ones 0.
threshold = 7

def get_binary_matrix(df, user_col='user_id', item_col='anime_id', rating_col='rating'):
    """Return a copy of ``df`` with an added 0/1 ``binary`` column.

    ``binary`` is 1 where ``df[rating_col]`` is greater than or equal to the
    module-level ``threshold`` and 0 otherwise. ``user_col`` and ``item_col``
    are accepted but not used by this function. The input frame is not
    modified.
    """
    is_positive = df[rating_col].ge(threshold)
    return df.assign(binary=is_positive.astype(int))

# Label each held-out rating as positive (1) or not (0).
test_binary = get_binary_matrix(test_data)
# Held-out ratings with the anime name attached; not used by the metrics below.
test_merged = test_binary.merge(anime_df[['anime_id', 'name']], on='anime_id', how='left')

# Placeholder: simulate binary predictions with random 0/1 values for evaluation.
# Replace this with actual thresholded predictions from collaborative filtering.
# The generator is seeded so the printed metrics are identical on every run.
test_binary['pred'] = np.random.default_rng(42).integers(0, 2, size=len(test_binary))

# Evaluation metrics
precision = precision_score(test_binary['binary'], test_binary['pred'])
recall = recall_score(test_binary['binary'], test_binary['pred'])
f1 = f1_score(test_binary['binary'], test_binary['pred'])

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")



### What is Collaborative Filtering?

Collaborative Filtering is a recommendation technique that suggests items based on user behavior. It assumes that users who agreed in the past will agree in the future.

**Types of Collaborative Filtering:**

- **User-Based Collaborative Filtering:** Recommends items liked by similar users.
- **Item-Based Collaborative Filtering:** Recommends items that are similar to items the user has liked.

**Comparison:**

| Criteria               | User-Based                        | Item-Based                          |
|------------------------|-----------------------------------|-------------------------------------|
| Similarity Target      | Between users                     | Between items                       |
| Data Requirements      | Users with overlapping rated items| Items co-rated by many users        |
| Stability              | Less stable, users change habits  | More stable, item relationships last|

