In [1]:
import warnings 
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
data = pd.read_csv('anime.csv')
data.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
data.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
data.fillna({'genre': '', 'rating': data['rating'].mean(), 'episodes': 0}, inplace=True)


In [6]:
# Convert 'genre' to numerical representation using one-hot encoding

mlb = MultiLabelBinarizer()
data['genre'] = data['genre'].apply(lambda x: x.split(', ') if x else [])
genre_encoded = mlb.fit_transform(data['genre'])
genre_df = pd.DataFrame(genre_encoded, columns = mlb.classes_) # type: ignore

In [7]:
# Normalize numerical features
scaler = StandardScaler()
data['rating'] = scaler.fit_transform(data[['rating']])

In [8]:
# Combine features
features = pd.concat([genre_df, data[['rating']]], axis=1)

In [9]:
# Compute cosine similarity on training set
train_features = features.loc[data.index]
cos_sim = cosine_similarity(train_features)

# Recommendation function based on cosine similarity
def recommend_anime(anime_id, df=data, features=train_features, cosine_sim=cos_sim, top_n=10):
    idx = df.index[df['anime_id'] == anime_id].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    anime_indices = [i[0] for i in sim_scores]
    return df.iloc[anime_indices]


In [10]:
def recommend_anime(anime_id, cosine_sim=cos_sim, df=data, top_n=10):
    if anime_id not in df['anime_id'].values:
        return pd.DataFrame()  # Return empty DataFrame if anime_id not in training set
    
    idx = df.index[df['anime_id'] == anime_id].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    anime_indices = [i[0] for i in sim_scores]
    return df[['anime_id', 'name']].iloc[anime_indices]


In [11]:
recommended_anime = recommend_anime(anime_id=9963, top_n=10)
display(recommended_anime)

Unnamed: 0,anime_id,name
344,781,Detective Conan Movie 03: The Last Wizard of t...
345,1363,Detective Conan Movie 04: Captured in Her Eyes
645,34036,Detective Conan: Episode One - Chiisaku Natta ...
454,6438,Detective Conan OVA 09: The Stranger in 10 Yea...
293,1506,Detective Conan Movie 10: Requiem of the Detec...
274,1367,Detective Conan Movie 08: Magician of the Silv...
252,235,Detective Conan
253,1364,Detective Conan Movie 05: Countdown to Heaven
524,780,Detective Conan Movie 02: The Fourteenth Target
544,779,Detective Conan Movie 01: The Timed Skyscraper


In [17]:

# Split dataset into training and testing sets
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Initialize lists to store ground truth and predictions
ground_truth = []
predictions = []

# Mean rating to determine relevance
mean_rating = train['rating'].mean()

for _, row in test.iterrows():
    anime_id = row['anime_id']
    
    # Skip anime not present in the training set
    if anime_id not in train['anime_id'].values:
        continue
    
    recommended_anime = recommend_anime(anime_id, top_n=10)
    
    if recommended_anime.empty:
        continue
    
    actual_ratings = test[test['anime_id'].isin(recommended_anime['anime_id'])]['rating']
    true_relevant = (actual_ratings > mean_rating).astype(int).tolist()
    
    ground_truth.extend(true_relevant)
    predictions.extend([1] * len(true_relevant))
    
# Include non-relevant predictions to avoid all 1s in predictions
if not predictions:
    predictions = [0]

# Include non-relevant items in ground truth to avoid all 1s
if not ground_truth:
    ground_truth = [0]


In [18]:
# Compute precision, recall, and F1-score

precision = precision_score(ground_truth, predictions, average='macro', zero_division=0)
recall = recall_score(ground_truth, predictions, average='macro', zero_division=0)
f1 = f1_score(ground_truth, predictions, average='macro', zero_division=0)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

Precision: 1.00
Recall: 1.00
F1-Score: 1.00


# Can you explain the difference between user-based and item-based collaborative filtering?

1. `User-Based Collaborative Filtering`:

   Focuses on the similarity between users. Recommends items to a user based on the preferences of similar users.

   - Advantages:

     > Can capture the nuances of user preferences. Works well when there are many users with diverse tastes.

   - Disadvantages:
     > Scalability issues with large datasets due to the need to compare many user pairs. Can suffer from the "cold start" problem, where it struggles to make recommendations for new users with few interactions.

2. `Item-Based Collaborative Filtering`:

   Focuses on the similarity between items. Recommends items similar to those the user has interacted with before.

   - Advantages:

     > Typically more scalable than user-based filtering, as the number of items is usually smaller than the number of users.
     > More stable recommendations, as item similarities don’t change as frequently as user preferences might.

   - Disadvantages:
     > May not capture all aspects of user preferences as effectively as user-based methods. Can still suffer from the "cold start" problem for new items.


# What is collaborative filtering, and how does it work?

Collaborative filtering is a technique that can filter out items that a user might like on the basis of reactions by similar users. It works by searching a large group of people and finding a smaller set of users with tastes similar to a particular user. It looks at the items they like and combines them to create a ranked list of suggestions. There are many ways to decide which users are similar and combine their choices to create a list of recommendations.

1. Memory based
   - User based
   - Item based
2. Model based
   - Matrix Factorization
   - Deep Learning Models

It is applicable in E-commerce(for recommending products), Streaming services(for suggesting Movies,shows), Social Media(for showing content).
