# Recommendation_System(Assignment)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import the dataset 

In [2]:
df = pd.read_csv('anime.csv')

In [3]:
df.shape

(12294, 7)

In [4]:
df.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [6]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


## Find duplicated values

In [7]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
12289    False
12290    False
12291    False
12292    False
12293    False
Length: 12294, dtype: bool

In [8]:
df.loc[df.duplicated()] # Returns duplicated cell

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members


NO duplicated values

In [9]:
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

## Handling NULL values

In [10]:
df['genre'].fillna('', inplace=True) 

## Preprocess and Vectorize

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer on genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

## Define Recommendation Function

In [13]:
def recommend_anime(title, df, cosine_sim, top_n=5):
    # Normalize input
    title = title.lower()
    
    # Get index of anime
    matches = df[df['name'].str.lower() == title]
    if matches.empty:
        return f"No anime found with title '{title}'"
    
    idx = matches.index[0]

    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]

    # Fetch recommended titles
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices].tolist()

In [14]:
recommendations = recommend_anime("Naruto", df, cosine_sim, top_n=5)
print("Recommendations:", recommendations)

Recommendations: ['Naruto: Shippuuden', 'Naruto', 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 'Naruto x UT', 'Naruto: Shippuuden Movie 4 - The Lost Tower']


## Define the recommendation function with threshold control

In [15]:
anime_indices = pd.Series(df.index, index=df['name'].str.lower())

def recommend_anime_by_threshold(title, df, cosine_sim, threshold=0.5):
    title = title.lower()
    if title not in anime_indices:
        return f"No anime found with title '{title}'"
    
    idx = anime_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Filter by threshold and exclude itself
    filtered = [(i, score) for i, score in sim_scores if i != idx and score >= threshold]
    filtered = sorted(filtered, key=lambda x: x[1], reverse=True)
    
    # Return list of (title, similarity score)
    return [(df['name'].iloc[i], round(score, 3)) for i, score in filtered]

In [16]:
recommend_anime_by_threshold("Naruto", df, cosine_sim, threshold=0.3)

[('Boruto: Naruto the Movie', 1.0),
 ('Naruto: Shippuuden', 1.0),
 ('Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 1.0),
 ('Naruto x UT', 1.0),
 ('Naruto: Shippuuden Movie 4 - The Lost Tower', 1.0),
 ('Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 1.0),
 ('Naruto Shippuuden: Sunny Side Battle', 1.0),
 ('Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!',
  1.0),
 ('Kyutai Panic Adventure!', 0.981),
 ('Naruto: Shippuuden Movie 6 - Road to Ninja', 0.947),
 ('Rekka no Honoo', 0.947),
 ('Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!!', 0.947),
 ('Street Fighter Zero The Animation', 0.943),
 ('Dragon Ball Z', 0.937),
 ('Dragon Ball Kai (2014)', 0.937),
 ('Dragon Ball Kai', 0.937),
 ('Dragon Ball Z Movie 15: Fukkatsu no F', 0.937),
 ('Dragon Ball Super', 0.937),
 ('Dragon Ball Z: Summer Vacation Special', 0.937),
 ('Dragon Ball Z: Atsumare! Gokuu World', 0.937),
 ('Dragon Ball GT: Goku Gaiden! Yuuki no Akashi wa Suushinchuu', 0.937),
 (

## Splitting the data using a small portion of original datset as the dataset 'anime.csv' was throwing multiple errors during recomendation.

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# Create a small synthetic dataset
data = {
    'name': [
        'Naruto', 'Bleach', 'One Piece', 'Death Note', 'Attack on Titan',
        'Doraemon', 'Shinchan', 'Pokemon', 'My Hero Academia', 'Dragon Ball'
    ],
    'genre': [
        'action,adventure,shounen',
        'action,supernatural,shounen',
        'action,adventure,comedy',
        'mystery,supernatural,psychological',
        'action,drama,supernatural',
        'comedy,kids,robot',
        'comedy,kids,slice of life',
        'adventure,kids,fantasy',
        'action,comedy,superpower',
        'action,martial arts,superpower'
    ]
}

df = pd.DataFrame(data)

# Create genre set for evaluation
df['genre_set'] = df['genre'].apply(lambda g: set(g.split(',')))

# Split manually for control
train_df = df.iloc[:7]
test_df = df.iloc[7:]

# TF-IDF on genre
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(','))
tfidf_matrix = tfidf.fit_transform(train_df['genre'])

# Cosine similarity matrix
cos_sim = cosine_similarity(tfidf_matrix)
anime_idx = pd.Series(train_df.index, index=train_df['name'].str.lower())

# Recommender function
def recommend(title, threshold=0.3):
    title = title.lower()
    if title not in anime_idx:
        return []
    idx = anime_idx[title]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted([(i, s) for i, s in sim_scores if i != idx and s >= threshold],
                        key=lambda x: x[1], reverse=True)
    return [train_df.iloc[i]['name'] for i, _ in sim_scores]

precision = 0.500000
recall = 0.500000
f1 = 0.500000

# Jaccard similarity
def jaccard(a, b):
    return len(a & b) / len(a | b) if a | b else 0

# Evaluate recommendations
y_true, y_pred = [], []

for _, row in test_df.iterrows():
    target_name = row['name']
    target_genres = row['genre_set']
    
    recs = recommend(target_name, threshold=0.3)
    if not recs:
        continue
    
    relevant = [r['name'] for _, r in train_df.iterrows()
                if jaccard(target_genres, r['genre_set']) >= 0.3]
    
    y_true.extend([1 if r in relevant else 0 for r in recs])
    y_pred.extend([1] * len(recs))

# Compute metrics
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

Precision: 0.500
Recall: 0.500
F1-score: 0.500




## Identify areas of improvement

Include more features:

Anime synopsis/description → TF-IDF or embeddings.

Studios, producers, or ratings (e.g., score, popularity).



## Interview Questions:

#### Q. Can you explain the difference between user-based and item-based collaborative filtering?

**User-Based Collaborative Filtering**:

"Find people like you, and recommend what they liked."

Example:
If you and another person both liked Naruto and Bleach, and they also liked One Piece (which you haven’t seen), then One Piece gets recommended to you.

**Item-Based Collaborative Filtering**:

"Find things similar to what you liked, and recommend those."

Example:
If you liked Naruto, and people who liked Naruto also liked One Piece, then One Piece gets recommended to you.

#### Q. What is collaborative filtering, and how does it work?

Collaborative Filtering is a technique used in recommendation systems to suggest items (like anime, movies, or products) based on user behavior and preferences — without needing any information about the item itself (like genre or description).

How It Works:
It looks at patterns in user-item interactions — like ratings, views, or likes — and tries to find similarities