In [2]:
#Import all libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

Data Preprocessing

In [5]:
# Load the anime dataset; 'anime.csv' is resolved relative to the notebook's working directory
anime = pd.read_csv('anime.csv')
# A bare trailing expression makes the notebook render the frame as this cell's output
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [6]:
# Missing-value policy: a row is discarded when any of these descriptive columns is empty.
# Dropping is preferred over imputation here because an invented genre, type or rating
# would not meaningfully describe the title for recommendation purposes.
required_columns = ['genre', 'type', 'rating']
anime_clean = anime.dropna(subset=required_columns)

In [7]:
# Reset index after dropping missing values.
# drop=True discards the old row labels instead of keeping them as an extra column.
# Later cells combine anime_clean column-wise with a separately built feature frame and
# look rows up by position, so a gap-free 0..n-1 index keeps labels and positions identical.
anime_clean = anime_clean.reset_index(drop=True)

Feature Extraction

In [8]:
# Each 'genre' cell holds a single comma-separated string; convert it to a list of genre names
def _split_genres(raw_genres):
    """Split a comma-separated genre string into a list of whitespace-stripped names."""
    return [piece.strip() for piece in raw_genres.split(',')]

anime_clean['genre_list'] = anime_clean['genre'].apply(_split_genres)

In [9]:
# One-hot encode the genre lists with MultiLabelBinarizer: one 0/1 column per distinct genre
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(anime_clean['genre_list'])
# Reuse anime_clean's index explicitly. A frame built from a bare array gets a fresh 0..n-1
# index, and a column-wise concat aligns on index labels, so relying on the default would
# silently misalign rows (and introduce NaNs) whenever anime_clean's index is not 0..n-1.
genre_df = pd.DataFrame(genre_features, columns=mlb.classes_, index=anime_clean.index)

In [10]:
# Min-max scale the two numeric columns into new 'norm_*' columns.
# The single scaler object is re-fitted on every pass, so once this cell finishes it
# only remembers the range of the last column ('members').
gen_rescale = MinMaxScaler()
for raw_col, scaled_col in (('rating', 'norm_rating'), ('members', 'norm_members')):
    anime_clean[scaled_col] = gen_rescale.fit_transform(anime_clean[[raw_col]])

In [11]:
# Build the similarity feature matrix: genre indicator columns followed by the two
# scaled numeric columns, placed side by side (axis=1 aligns rows on the index)
scaled_numeric = anime_clean[['norm_rating', 'norm_members']]
features = pd.concat([genre_df, scaled_numeric], axis=1)


Recommendation Function

In [15]:
# Function that takes target anime name and threshold, recommends similar titles
def recommend_anime(target_title, threshold=0.5, top_n=5):
    """Recommend the anime whose feature vectors are most similar to a given title.

    Similarity is the cosine similarity between rows of the notebook-level
    ``features`` matrix. Result rows are taken from the notebook-level
    ``anime_clean`` frame, so the two must have their rows in the same order.

    Parameters
    ----------
    target_title : str
        Title to look up in ``anime_clean['name']``. The match is exact but
        case-insensitive; if several rows share the title, the first one is used.
    threshold : float, default 0.5
        Minimum cosine similarity a candidate must reach to be kept.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``name``, ``genre``, ``rating`` and ``members`` columns of up to
        ``top_n`` rows, most similar first (ties keep their original row order).
        If the title is not found, a message string is returned instead.
    """
    # Work with row *positions* rather than index labels, so the .iloc lookups below
    # stay correct whatever index anime_clean happens to carry.
    title_mask = (anime_clean['name'].str.lower() == target_title.lower()).to_numpy()
    match_positions = np.flatnonzero(title_mask)
    if len(match_positions) == 0:
        return f"Anime '{target_title}' not found."
    idx = match_positions[0]
    target_vector = features.iloc[[idx]].to_numpy()  # shape (1, n_features)

    # Compute cosine similarity with all anime (the target itself included)
    sims = cosine_similarity(features, target_vector).flatten()

    # Rank by descending similarity; a stable sort keeps tie order reproducible.
    ranked = np.argsort(-sims, kind='stable')

    # Exclude the target by position. Slicing off rank 0 is not safe: another title with
    # an equal similarity score (e.g. an identical feature vector) can sort ahead of the
    # target, which would leave the target in the list and drop a genuine neighbour.
    ranked = ranked[ranked != idx]

    # Keep only candidates that reach the similarity threshold
    ranked = ranked[sims[ranked] >= threshold]

    # Return the top_n recommendations
    recommendations = anime_clean.iloc[ranked]
    return recommendations[['name', 'genre', 'rating', 'members']].head(top_n)

# Interview Questions
 1. User-based vs Item-based Collaborative Filtering:
   - User-based filtering finds users whose preferences are similar to the target user's and recommends items those users liked.
   - Item-based filtering finds items similar to the ones the user has already viewed or rated and recommends those similar items.
 2. Collaborative Filtering:
   - A recommendation technique built on the user-item interaction matrix; it infers a user's missing preferences from similarities between users or between items.
   - It works by leveraging shared behaviour patterns across users/items, without needing any content or attribute data about the items themselves.

## Summary
Data Preprocessing: The anime dataset was loaded and inspected. Rows with a missing genre, type, or rating were dropped rather than imputed, because imputed values are not meaningful for these descriptive features, and the index was then reset so that row positions stay aligned with the feature matrix. The 'episodes' column was left as loaded, since it is not used as a similarity feature.

Feature Extraction: The comma-separated genre strings were split into lists and encoded into binary indicator vectors using a MultiLabelBinarizer. The numeric features rating and number of members were each scaled to the [0, 1] range using MinMaxScaler. All features were then combined into a single feature matrix suitable for similarity computation.

Recommendation System Design: A function was implemented to recommend anime based on cosine similarity. For a given anime title, the function computes cosine similarity scores between the target and all other anime, then recommends the most similar ones above a chosen threshold. The recommendation list size can be adjusted by changing the threshold or the number of results returned.

Performance Analysis: The system allows experimentation with similarity thresholds to control recommendation quality and list size. Areas for improvement include incorporating more user interaction data or advanced feature engineering for better recommendations.

Interview Questions: The differences between user-based and item-based collaborative filtering were explained, as well as the general working of collaborative filtering, highlighting how similarities are computed and used for recommendations.

This approach demonstrates a practical, step-by-step method for building a content-based recommendation system using cosine similarity, covering data cleaning, feature engineering, and the recommendation logic.