In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#  Data Preprocessing

In [58]:
anime_df = pd.read_csv('anime.csv')
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [60]:
anime_df.shape

(12294, 7)

In [62]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [64]:
anime_df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [66]:
anime_df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [72]:
# Handle missing values
anime_df['genre'].fillna("Unknown", inplace=True)
anime_df['type'].fillna("Unknown", inplace=True)
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

In [74]:
anime_df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [76]:
# One-hot encode 'genre' and 'type'
anime_df['genre'] = anime_df['genre'].apply(lambda x: x.split(', ') if x != "Unknown" else ["Unknown"])

In [82]:
mlb = MultiLabelBinarizer()
mlb

In [84]:
genre_encoded = mlb.fit_transform(anime_df['genre'])
genre_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [86]:
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)
genre_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# encode type colum to numeric column.

In [90]:
type_encoded = pd.get_dummies(anime_df['type'], prefix='type')

In [22]:
# Normalize numerical features
scaler = MinMaxScaler()

In [92]:
anime_df[['rating', 'members']] = scaler.fit_transform(anime_df[['rating', 'members']])

In [100]:
# Combine features
anime_df = pd.concat([anime_df, genre_df, type_encoded], axis=1)

In [98]:
# Step 2: Feature Extraction
features = anime_df.drop(columns=['anime_id', 'name', 'genre', 'type', 'episodes']).values
features

array([[0.9243697478991597, 0.19787220192679442, 0, ..., False, False,
        False],
       [0.9111644657863145, 0.7827701023362974, 1, ..., False, True,
        False],
       [0.9099639855942379, 0.11268926691862803, 1, ..., False, True,
        False],
       ...,
       [0.38535414165666265, 0.00021106368205524737, 0, ..., False,
        False, False],
       [0.397358943577431, 0.00016766741097846755, 0, ..., False, False,
        False],
       [0.4549819927971189, 0.00013512020767088265, 0, ..., False, False,
        False]], dtype=object)

#  Recommendation System

In [102]:
# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(features)

In [103]:
# Step 3: Recommendation System
def get_recommendations(anime_index, similarity_matrix, threshold=0.5):
    similarity_scores = list(enumerate(similarity_matrix[anime_index]))
    similar_anime = [i for i, score in similarity_scores if score >= threshold and i != anime_index]
    return similar_anime


In [122]:
# Assuming `anime_df` is the DataFrame and `similarity_matrix` is already computed
target_anime_index = 1  # Index of the anime you want recommendations for
recommendations = get_recommendations(target_anime_index ,similarity_matrix, threshold=0.5)

In [114]:
# Display recommended anime names
recommended_anime = anime_df.iloc[recommendations]['name']

In [116]:
print("Recommended Anime:")
print(recommended_anime)

Recommended Anime:
2                                                 Gintama°
6                                   Hunter x Hunter (2011)
8        Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                                 Gintama&#039;: Enchousen
12                                                 Gintama
                               ...                        
11055                      Peace Maker Kurogane (Shinsaku)
11066                                         Sakura Quest
11072                  Senki Zesshou Symphogear 4th Season
11073                  Senki Zesshou Symphogear 5th Season
11074                           Sentai Hero Sukiyaki Force
Name: name, Length: 1094, dtype: object


#  Evaluation

In [41]:
#split data into training and testing
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

In [118]:
train_features = train_df.drop(columns=['anime_id', 'name', 'genre', 'type', 'episodes']).values

In [120]:
test_features = test_df.drop(columns=['anime_id', 'name', 'genre', 'type', 'episodes']).values

In [47]:
precision_scores,recall_scores,f1_scores= [],[],[]
for idx, test_anime in test_df.iterrows():
    test_genres = set(test_anime['genre'])
    similar_anime_indices = get_recommendations(idx, similarity_matrix)
    y_true = [1 if set(train_df.iloc[i]['genre']).intersection(test_genres) else 0 for i in range(len(train_df))]
    y_pred = [1 if i in similar_anime_indices else 0 for i in range(len(train_df))]
    if np.sum(y_pred) > 0:
        precision_scores.append(precision_score(y_true, y_pred))
        recall_scores.append(recall_score(y_true, y_pred))
        f1_scores.append(f1_score(y_true, y_pred))

average_precision = np.mean(precision_scores)
average_recall=np.mean(recall_scores)
average_f1= np.mean(f1_scores)


In [139]:
print('Precision Score\n\n:',precision_scores)
print('Recall Scores\n\n:',recall_scores)
print('f1_scores\n\n:',f1_scores)

Precision Score

: [0.12404580152671756, 0.668918918918919, 0.5513126491646778, 0.37986270022883295, 0.09027777777777778, 0.41101356743814843, 0.5517241379310345, 0.40222772277227725, 0.4779270633397313, 0.5280373831775701, 0.3295774647887324, 0.0964332892998679, 0.43482587064676614, 0.45537065052950076, 0.5459459459459459, 0.0945945945945946, 0.5601190476190476, 0.28300094966761635, 0.17488789237668162, 0.562874251497006, 0.11428571428571428, 0.0, 0.3977455716586151, 0.3184634448574969, 0.29809725158562367, 0.5, 0.4476138233680746, 0.4162234042553192, 0.12446351931330472, 0.06072874493927125, 0.28214285714285714, 0.37649063032367974, 0.3601496725912067, 0.5428342674139311, 0.06198347107438017, 0.1282051282051282, 0.5145985401459854, 0.38258575197889183, 0.0962566844919786, 0.48848684210526316, 0.09550561797752809, 0.08074534161490683, 0.30420280186791193, 0.096045197740113, 0.2544642857142857, 0.52303961196443, 0.1357142857142857, 0.03608247422680412, 0.5342066957787481, 0.47248182762

In [53]:
print(f'Average Precision: {average_precision:.2f}')
print(f'Average Recall: {average_recall:.2f}')
print(f'Average F1 Score: {average_f1:.2f}')

Average Precision: 0.35
Average Recall: 0.09
Average F1 Score: 0.14


# ______________________________________________________

# Interview Questions:

# Question 1 : Can you explain the difference between user-based and item-based collaborative filtering?


### Both user-based and item-based collaborative filtering are techniques used in recommendation systems. The key difference lies in the perspective from which recommendations are generated.

## 1. User-Based Collaborative Filtering:
### Definition: Recommendations are made based on the similarity between users.
## How It Works:
### Identify similar users to the target user using a similarity measure (e.g., cosine similarity, Pearson correlation).
### Recommend items that the similar users have liked but the target user has not interacted with.
## Example:
### If User A and User B have similar preferences, and User B likes a movie that User A has not seen, that movie is recommended to User A.
## Advantages:
### Intuitive and works well when there are many users with overlapping preferences.
## Disadvantages:
### Struggles with cold start for new users (insufficient interaction data).
### Performance may degrade with large numbers of users.


## Item-Based Collaborative Filtering
### Definition: Recommendations are made based on the similarity between items.
## How It Works:
### Identify similar items to those that the target user has interacted with.
### ecommend items similar to those the user has liked or interacted with.
## Example:
### If a user likes Movie A, and Movie B is similar to Movie A (based on ratings from all users), Movie B is recommended.
## Advantages:
### More scalable as the number of items is typically smaller than the number of users.
### Works well for new users as long as they interact with a few items.
## Disadvantages:
### Struggles with cold start for new items with little to no interaction history.

# _________________________________________________

# Question 2: What is collaborative filtering, and how does it work?

## Definition
### Collaborative filtering is a technique used in recommendation systems to suggest items based on patterns of user-item interactions. It relies on the principle that users who interacted similarly with certain items in the past are likely to have similar preferences.

## Types of Collaborative Filtering:
## 1. User-Based Collaborative Filtering:
### Focuses on finding users with similar preferences or behaviors.
## 2.Item-Based Collaborative Filtering:
### Focuses on finding items that are similar based on user interactions.

## How Collaborative Filtering Works
## Steps:
## Data Collection:

### Gather user-item interaction data, typically in the form of a ratings matrix.

## Similarity Computation:

### Calculate similarity between users or items using metrics like:
### 1.Cosine Similarity: Measures the cosine of the angle between vectors.
### 2.Pearson Correlation: Measures linear correlation between vectors.
### 3.Jaccard Similarity: Measures overlap between sets.

## Prediction Generation:

### For user-based filtering: Predict the rating a user would give to an item by averaging the ratings of similar users.
### For item-based filtering: Predict the rating a user would give to an item based on the ratings the user has given to similar items.

## Recommendation:

### Recommend the top 𝑁 items with the highest predicted ratings that the user has not interacted with.

# ---- The End -----