# Recommendation System

In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.simplefilter('ignore')

## Data Preprocessing:

In [25]:
df=pd.read_csv("anime.csv")
df.shape

(12294, 7)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [27]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


##  Feature Extraction :

In [28]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [29]:
# checking for null values
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [30]:
# Fill missing ratings with average rating
df['rating'].fillna(df['rating'].mean(), inplace=True)

In [31]:
# Fill missing genres with 'Unknown'
df['genre'].fillna('Unknown', inplace=True)


In [32]:
# Drop rows with missing titles (critical field)
df.dropna(subset=['name'], inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


## Recommendation System:

In [33]:
# Convert genres into TF-IDF vectors
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre'])
genre_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 40480 stored elements and shape (12294, 47)>

In [34]:
# Normalize ratings and members
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(df[['rating', 'members']])
numeric_features

array([[9.24369748e-01, 1.97872202e-01],
       [9.11164466e-01, 7.82770102e-01],
       [9.09963986e-01, 1.12689267e-01],
       ...,
       [3.85354142e-01, 2.11063682e-04],
       [3.97358944e-01, 1.67667411e-04],
       [4.54981993e-01, 1.35120208e-04]])

In [35]:
# Combine features: concatenate genre matrix with numeric features
import numpy as np
feature_matrix = np.hstack((genre_matrix.toarray(), numeric_features))
feature_matrix


array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.24369748e-01, 1.97872202e-01],
       [2.94649234e-01, 3.17606646e-01, 0.00000000e+00, ...,
        0.00000000e+00, 9.11164466e-01, 7.82770102e-01],
       [2.50631444e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.09963986e-01, 1.12689267e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.85354142e-01, 2.11063682e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.97358944e-01, 1.67667411e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.54981993e-01, 1.35120208e-04]])

In [36]:
# Generate similarity scores between all anime.

cosine_sim = cosine_similarity(feature_matrix, feature_matrix)
cosine_sim


array([[1.        , 0.53235245, 0.46247873, ..., 0.24157166, 0.24807781,
        0.27820617],
       [0.53235245, 1.        , 0.51682949, ..., 0.20971947, 0.2153503 ,
        0.24148453],
       [0.46247873, 0.51682949, 1.        , ..., 0.24118659, 0.24768518,
        0.27776899],
       ...,
       [0.24157166, 0.20971947, 0.24118659, ..., 1.        , 0.99994581,
        0.99824985],
       [0.24807781, 0.2153503 , 0.24768518, ..., 0.99994581, 1.        ,
        0.99881138],
       [0.27820617, 0.24148453, 0.27776899, ..., 0.99824985, 0.99881138,
        1.        ]])

In [37]:
def recommend_anime(title, top_n=5, threshold=0.5):
    # Find index of the anime
    idx = df[df['name'] == title].index[0]
    
    # Get similarity scores for this anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Filter by threshold
    sim_scores = [s for s in sim_scores if s[1] >= threshold]
    
    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top_n recommendations (excluding the anime itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Return titles of recommended anime
    recommended = [df.iloc[i[0]]['name'] for i in sim_scores]
    return recommended

# Example usage:
recommend_anime("Naruto", top_n=5, threshold=0.6)


['Naruto: Shippuuden',
 'Dragon Ball Z',
 'Dragon Ball',
 'Naruto: Shippuuden Movie 4 - The Lost Tower',
 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono']

## Interview Questions :

### 1. Can you explain the difference between user-based and item-based collaborative filtering
1. User-Based Collaborative Filtering:
This approach recommends items to a user based on the preferences of similar users.
It calculates the similarity between users, often using metrics like cosine similarity, Pearson correlation, or Jaccard similarity.
2. Item-Based Collaborative Filtering:This approach recommends items based on the similarity between items rather than users.
It calculates the similarity between items, typically based on user interactions or ratings. Common metrics include cosine similarity and adjusted cosine similarity.


### 2.What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation technique that predicts a user's preferences based on the preferences and behaviors of other users. It operates on the idea that users who have agreed in the past will likely agree in the future.
Collaborative Filtering recommends items by learning patterns from the past behavior of similar users or similar items.