In [1]:
# ANIME RECOMMENDATION SYSTEM

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [3]:
# 1. Load & Explore Data

In [4]:
# Load dataset
anime_df = pd.read_csv(r'C:\Users\admin\Downloads\Excelr Assignment\Recommendation System\Recommendation System\anime.csv')
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [5]:
anime_df.shape

(12294, 7)

In [6]:
anime_df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [7]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [8]:
# 2. Data Preprocessing

In [9]:
# Handle missing values in 'genre' and 'rating'
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [10]:
# Convert 'episodes' to numeric (replace 'Unknown' with 0)
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', 0)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'])
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [11]:
# 3. Feature Extraction

In [12]:
# We use genres + rating + members as features
anime_df['features'] = anime_df['genre'] + " " + anime_df['type']

In [13]:
# Convert text features into numerical vectors (TF-IDF)
# Ensure no NaN in 'features' column
anime_df['features'] = anime_df['features'].fillna('')

# Convert to string type explicitly
anime_df['features'] = anime_df['features'].astype(str)

# Now TF-IDF works fine
vectorizer = TfidfVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(anime_df['features'])


In [14]:
# 4. Recommendation System (Cosine Similarity)

In [15]:
# Compute cosine similarity between anime
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

In [16]:
# Function to recommend similar anime based on cosine similarity
def recommend_anime(title, top_n=5):
    """
    Recommend top_n similar anime based on cosine similarity.
    """
    # Check if title exists
    if title not in anime_df['name'].values:
        return "Anime not found in database."
    # Get index of the anime
    idx = anime_df[anime_df['name'] == title].index[0]


    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))


    # Sort by similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


    # Get top_n recommendations (excluding itself)
    sim_scores = sim_scores[1:top_n+1]


    # Get anime indices
    anime_indices = [i[0] for i in sim_scores]


    return anime_df[['name', 'genre', 'rating']].iloc[anime_indices]

In [17]:
# Example Recommendation
print("\nRecommended Anime for 'Naruto':\n", recommend_anime('Naruto'))


Recommended Anime for 'Naruto':
                                              name  \
841                                        Naruto   
1343                                  Naruto x UT   
1796                               Rekka no Honoo   
486                      Boruto: Naruto the Movie   
1472  Naruto: Shippuuden Movie 4 - The Lost Tower   

                                                  genre  rating  
841   Action, Comedy, Martial Arts, Shounen, Super P...    7.81  
1343  Action, Comedy, Martial Arts, Shounen, Super P...    7.58  
1796  Action, Adventure, Martial Arts, Shounen, Supe...    7.44  
486   Action, Comedy, Martial Arts, Shounen, Super P...    8.03  
1472  Action, Comedy, Martial Arts, Shounen, Super P...    7.53  


In [18]:
# 5. Evaluation

In [19]:
# For evaluation, we simulate by splitting data into train/test
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

In [20]:
# Define simple precision metric: proportion of recommendations with the same genre


def evaluate_recommender(test_sample=50):
    """
    Evaluate the recommendation system by checking if recommended anime
    share at least one genre with the target anime.
    
    Metric: Precision = Correct Recommendations / Total Recommendations
    """
    correct = 0
    total = 0

    # Pick a sample of anime titles from the test set
    for title in test_df['name'].sample(test_sample, random_state=42):
        try:
            recs = recommend_anime(title, top_n=5)
            true_genres = set(anime_df[anime_df['name'] == title]['genre'].values[0].split(', '))

            # Compare genres of recommended anime with the target anime
            for rec_genres in recs['genre']:
                rec_genres_set = set(rec_genres.split(', '))
                if len(true_genres.intersection(rec_genres_set)) > 0:
                    correct += 1
                total += 1
        except Exception as e:
            # Skip if any error occurs (e.g., anime not found)
            continue

    precision = correct / total if total > 0 else 0
    return precision


# Run evaluation
print("\nEvaluation Precision:", evaluate_recommender())



Evaluation Precision: 1.0


In [21]:
# Interview Questions and Answers