In [1]:
# Import necessary libraries
import os
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer

In [2]:
# -----------------------------------
# 1. Data Preprocessing
# -----------------------------------

In [3]:
# Load dataset
# The CSV location can be overridden with the ANIME_CSV environment variable so the
# notebook is not tied to a single machine; the original local path stays as the fallback.
DATA_PATH = Path(os.environ.get("ANIME_CSV", r"C:\Users\GURU\Downloads\Recommendation System\Recommendation System\anime.csv"))
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
# Handle missing values
# Assign the result back instead of using inplace=True: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which does not update df
# under Copy-on-Write.
df['genre'] = df['genre'].fillna('Unknown')
# Reset the index after dropping rows so that index labels stay equal to row positions;
# the feature tables and the similarity matrix built later are addressed by position.
df = df.dropna(subset = ['name']).reset_index(drop = True)

In [5]:
# Explore the dataset: print a heading, then the per-column dtypes and non-null counts.
print("Dataset Info:")
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
# Count the missing values in each column.
df.isnull().sum()

anime_id      0
name          0
genre         0
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
# Print a heading, then display summary statistics for the numeric columns.
print("\nDataset Description:")
df.describe()


Dataset Description:


Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [8]:
# Preview the first five rows.
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [9]:
# Count how many titles fall into each 'type' category
# (value_counts skips missing values by default, so rows without a type are not listed).
print("\nAnime Types Distribution:")
print(df['type'].value_counts())


Anime Types Distribution:
type
TV         3787
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Name: count, dtype: int64


In [10]:
# -----------------------------------
# 2. Feature Extraction
# -----------------------------------

In [11]:
# Split genres into lists
# df['genre'] is overwritten in place, so a second run of this cell sees lists rather than
# strings. The isinstance guard leaves already-split values untouched, keeping the cell
# idempotent instead of raising AttributeError on re-run.
df['genre'] = df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)

In [12]:
# Convert genres into numerical format
# One-hot encode the per-row genre lists: one 0/1 column per distinct genre label.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre'])
# No index is passed, so genre_df gets a default 0..n-1 index and corresponds to df by row position.
genre_df = pd.DataFrame(genre_encoded, columns = mlb.classes_)


In [13]:
# Normalize numerical features ('rating' and 'members')
# MinMaxScaler ignores NaN when fitting and keeps it as NaN when transforming. A later
# zero-fill would then place every unrated title at the scaled minimum (i.e. "worst rated").
# Impute missing ratings with the median first so unrated titles sit at a neutral value.
scaler = MinMaxScaler()
numeric_input = df[['rating', 'members']].fillna({'rating': df['rating'].median()})
numerical_features = scaler.fit_transform(numeric_input)
# Default 0..n-1 index: rows correspond to df by position.
numerical_df = pd.DataFrame(numerical_features, columns = ['rating','members'])

In [14]:
# Combine all features
# Column-wise concat: rows are matched on the index labels of genre_df and numerical_df.
final_features = pd.concat([genre_df, numerical_df] , axis=1)

In [15]:
# Replace NaN with 0
# cosine_similarity rejects NaN input, so any remaining gaps are zero-filled here.
# NOTE(review): in a min-max-scaled column 0 is the column minimum, so a zero-filled
# value is treated as the lowest observed value rather than as "unknown".
final_features = final_features.fillna(0)

In [16]:
# -----------------------------------
# 3. Recommendation System
# -----------------------------------

In [17]:
# Compute cosine similarity
# Pairwise similarity between every pair of rows of final_features. The result is a dense
# n x n matrix, so memory use grows quadratically with the number of titles.
cosine_sim = cosine_similarity(final_features)

In [18]:
# Define recommendation function
def recommend_anime(title,df,cosine_sim, top_n =5):
    """Return the names of the ``top_n`` anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['name']``.
    df : pandas.DataFrame
        Frame with a ``name`` column whose row order matches the rows and
        columns of ``cosine_sim``.
    cosine_sim : 2-D array
        Square pairwise similarity matrix addressed by row position.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str, or str
        Names ordered by descending similarity to ``title``, or the string
        ``"Anime not found!"`` when ``title`` does not occur in ``df['name']``.
    """
    # Use row positions, not index labels: cosine_sim is positional, and df's
    # labels stop matching positions as soon as rows are dropped or reordered.
    positions = (df['name'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        return "Anime not found!"
    idx = int(positions[0])

    ranked = sorted(enumerate(cosine_sim[idx]), key = lambda pair: pair[1], reverse = True)
    # Drop the query row explicitly rather than assuming it sorts first; a row
    # with identical features can tie with it and would otherwise be discarded
    # while the query title itself is returned.
    anime_indices = [pos for pos, _ in ranked if pos != idx][:max(top_n, 0)]
    return df['name'].iloc[anime_indices].tolist()


In [19]:
# Example recommendation
# Print a heading, then the five titles ranked closest to 'Steins;Gate' by cosine similarity.
print("\nRecommendations for 'Steins;Gate':")
print(recommend_anime('Steins;Gate', df, cosine_sim, top_n=5))


Recommendations for 'Steins;Gate':
['Steins;Gate Movie: Fuka Ryouiki no Déjà vu', 'Steins;Gate: Oukoubakko no Poriomania', 'Steins;Gate: Kyoukaimenjou no Missing Link - Divide By Zero', 'Steins;Gate 0', 'Under the Dog']


In [20]:
# -----------------------------------
# 4. Evaluation
# -----------------------------------

In [21]:
# Split dataset into train and test (for simulation)
# random_state is fixed so the 80/20 split is reproducible.
# NOTE(review): in the cells visible here, train_df/test_df are only used for the size
# printout; the manual evaluation that follows queries the full df.
train_df, test_df = train_test_split(df, test_size = 0.2, random_state =42)

In [22]:
# Report how many rows ended up in each partition.
print(f"\n Training size : {len(train_df)}, Testing size : {len(test_df)}")


 Training size : 9835, Testing size : 2459


In [23]:
# Simple manual evaluation
# Fetch the top-5 recommendations for one hand-picked title so they can be compared
# against a reference list.
target_anime = 'Steins;Gate'
recammened = recommend_anime(target_anime, df, cosine_sim, top_n = 5)

In [24]:
# Assume true similar animes (for demo only)
# Hand-written reference list, not derived from data; metrics computed against it are illustrative only.
true_smilar = ['Steins;Gate Movie: Fuka Ryouiki no Déjà vu','Chaos;Head', 'Erased']

In [25]:
# Convert to sets
# recommend_anime returns the string "Anime not found!" when the lookup fails; set() on
# that string would yield a set of single characters and meaningless metrics, so anything
# that is not a list is treated as "no recommendations".
recommended_set = set(recammened) if isinstance(recammened, list) else set()
true_similar_set = set(true_smilar)

In [26]:
# Calculate Precision, Recall, F1-score
# hits = recommended titles that also appear in the reference list.
# Each metric falls back to 0 when its denominator would be zero.
hits = len(recommended_set & true_similar_set)
precision = hits / len(recommended_set) if recommended_set else 0
recall = hits / len(true_similar_set) if true_similar_set else 0
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

In [27]:
# Report the three metrics, each formatted to two decimals.
print("\nEvaluation Metrics:")
for label, score in (("Precision", precision), ("Recall", recall), ("f1", f1)):
    print(f"{label} : {score:.2f}")


Evaluation Metrics:
Precision : 0.20
Recall : 0.33
f1 : 0.25


In [28]:
#Interview Questions (Answers)

In [29]:

#1. Difference between user-based and item-based collaborative filtering?

###   User-Based Collaborative Filtering
#Recommends items by finding similar users.
#Example: If User A likes X and Y, and User B likes X, suggest Y to B.
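#Sensitive to the number of users: similarity must be computed across all users, so it scales poorly and degrades when user histories are sparse.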

###   Item-Based Collaborative Filtering 
#Recommends items by finding similar items.
#Example: If people who like X also like Y, recommend Y to anyone who liked X.
#More stable because item-to-item similarities change less often than individual users' tastes.

In [30]:
#2. What is collaborative filtering, and how does it work?

#Collaborative filtering is a technique used in recommendation systems that makes automatic 
#predictions about the interests of a user by collecting preferences from many users (collaboration).

#It works on the assumption that if users agreed in the past, they will agree in the future.

#It is based either on users (user-user filtering) or on items (item-item filtering).
