In [20]:
import pandas as pd

# Read the anime catalogue from disk; adjust ANIME_CSV_PATH if the file lives elsewhere.
ANIME_CSV_PATH = 'anime.csv'
anime_df = pd.read_csv(ANIME_CSV_PATH)


In [21]:
# Replace every missing value with an empty string, so that later string
# handling of the text columns never has to deal with NaN.
anime_df = anime_df.fillna(value='')


In [22]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
anime_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  12294 non-null  int64 
 1   name      12294 non-null  object
 2   genre     12294 non-null  object
 3   type      12294 non-null  object
 4   episodes  12294 non-null  object
 5   rating    12294 non-null  object
 6   members   12294 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 672.5+ KB
None


In [23]:
# Parse episode counts and ratings as numbers; values that cannot be parsed
# become NaN (errors='coerce').
for numeric_column in ('episodes', 'rating'):
    anime_df[numeric_column] = pd.to_numeric(anime_df[numeric_column], errors='coerce')

In [24]:
# Count the missing values in each column
missing_per_column = anime_df.isna().sum()
print(missing_per_column)


anime_id      0
name          0
genre         0
type          0
episodes    340
rating      230
members       0
dtype: int64


In [25]:
# Keep only the rows that have both an episode count and a rating
required_columns = ['episodes', 'rating']
anime_df.dropna(subset=required_columns, inplace=True)


In [26]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the `type` column: learn the distinct values first, then
# replace each value with its integer code.
label_encoder = LabelEncoder().fit(anime_df['type'])
anime_df['type'] = label_encoder.transform(anime_df['type'])


In [27]:
# One-hot encode genres: a title can carry several ', '-separated genres
from sklearn.preprocessing import MultiLabelBinarizer


def _split_genres(genre_text):
    """Turn a ', '-separated genre string into a list; an empty string gives []."""
    if genre_text == '':
        return []
    return genre_text.split(', ')


# Each cell of `genre` becomes a list of genre names
anime_df['genre'] = anime_df['genre'].apply(_split_genres)

# Build one 0/1 indicator column per genre, aligned to anime_df's index
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genre'])
genre_encoded = pd.DataFrame(genre_matrix, index=anime_df.index, columns=mlb.classes_)

# Swap the raw genre column for the indicator columns
anime_df = pd.concat([anime_df.drop(columns='genre'), genre_encoded], axis=1)


In [28]:
# Preview the first five rows of the encoded frame
anime_df.head(n=5)

Unnamed: 0,anime_id,name,type,episodes,rating,members,Action,Adventure,Cars,Comedy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,0,1.0,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,5,64.0,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,5,51.0,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,5,24.0,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,5,51.0,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Min-max scale the numeric features so they share a common range
from sklearn.preprocessing import MinMaxScaler

numeric_features = ['episodes', 'rating', 'members']
scaler = MinMaxScaler()
anime_df[numeric_features] = scaler.fit_transform(anime_df[numeric_features])

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Columns that make up each title's feature vector: the encoded type, the
# numeric columns, and one indicator column per genre.
genre_columns = list(mlb.classes_)
features = ['type', 'episodes', 'rating', 'members'] + genre_columns

# Pairwise cosine similarity between every pair of rows.
# NOTE(review): `type` looks like a raw integer code while the other features
# appear to lie in 0-1, so it may dominate the similarity; confirm this is intended.
feature_matrix = anime_df[features]
cosine_sim = cosine_similarity(feature_matrix)


In [48]:
def get_recommendations(title, cosine_sim=cosine_sim, threshold=0.2): #try to change threshold 
    """Return the names of up to ten titles most similar to `title`.

    Parameters
    ----------
    title : str
        Exact value to look up in ``anime_df['name']``. If several rows share
        the name, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``anime_df``. The default is bound when
        this function is defined, so re-run this cell after recomputing the
        matrix.
    threshold : float
        Only titles whose similarity is strictly greater than this value are
        considered.

    Returns
    -------
    pandas.Series
        Names of the best matches, most similar first, excluding the queried
        row itself. May hold fewer than ten entries (or none).

    Raises
    ------
    ValueError
        If `title` does not occur in ``anime_df['name']``.
    IndexError
        If the title's row position lies outside `cosine_sim`.
    """
    # Work with row *positions*, not index labels: cosine_sim is addressed by
    # position, and the labels only coincide with positions after a
    # reset_index(). Using labels here silently picked the wrong row otherwise.
    positions = (anime_df['name'] == title).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise ValueError(f"Title '{title}' not found in the dataset.")

    pos = int(positions[0])
    if pos >= cosine_sim.shape[0]:
        raise IndexError(f"Index {pos} out of bounds for cosine_sim of shape {cosine_sim.shape}")

    # Exclude the queried row explicitly instead of assuming it sorts first:
    # ties at the top score (duplicates, float rounding) or a self-similarity
    # at or below the threshold would otherwise return the query itself or
    # discard the best real match.
    scored = [
        (other, score)
        for other, score in enumerate(cosine_sim[pos])
        if other != pos and score > threshold
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [other for other, _ in scored[:10]]
    return anime_df['name'].iloc[top_positions]


In [49]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(test_data, cosine_sim, threshold=0.1):
    """Measure how often recommendations for held-out titles are held-out titles too.

    For every name in ``test_data['name']`` the recommender is queried, and
    each recommended title is marked 1 if its name also occurs in
    ``test_data['name']`` and 0 otherwise. These marks are scored against a
    target vector of all ones.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame with a ``name`` column holding the titles to query.
    cosine_sim : 2-D array
        Similarity matrix forwarded to ``get_recommendations``.
    threshold : float
        Similarity cut-off forwarded to ``get_recommendations``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``, micro-averaged. ``(0.0, 0.0, 0.0)`` is
        returned when no recommendation was produced at all.

    Notes
    -----
    NOTE(review): because every target is 1 and the averaging is 'micro', the
    three numbers appear to reduce to the same hit rate; treat them as a single
    overlap measure rather than as independent metrics.
    """
    # Built once: set membership is O(1), whereas testing against
    # test_data['name'].values rescans the whole array for every prediction.
    test_names = set(test_data['name'])

    predicted_labels = []
    for true_anime in test_data['name']:
        recommendations = get_recommendations(true_anime, cosine_sim=cosine_sim, threshold=threshold)
        predicted_labels.extend(1 if anime in test_names else 0 for anime in recommendations)

    # Nothing was recommended (empty test set, or threshold too strict):
    # there is nothing to score, so report zeros instead of scoring empty lists.
    if not predicted_labels:
        return 0.0, 0.0, 0.0

    # Every recommendation is scored against a target of 1.
    true_labels = [1] * len(predicted_labels)

    precision = precision_score(true_labels, predicted_labels, average='micro')
    recall = recall_score(true_labels, predicted_labels, average='micro')
    f1 = f1_score(true_labels, predicted_labels, average='micro')

    return precision, recall, f1


In [50]:
from sklearn.model_selection import train_test_split

# Renumber the rows 0..n-1, then hold out 20% of the titles with a fixed seed
anime_df = anime_df.reset_index(drop=True)
train_data, test_data = train_test_split(anime_df, random_state=42, test_size=0.2)


In [51]:
# Sanity check: compare the number of titles with the similarity matrix shape
row_count = anime_df.shape[0]
similarity_shape = cosine_sim.shape
print(f"anime_df size: {row_count}")
print(f"cosine_sim shape: {similarity_shape}")


anime_df size: 11876
cosine_sim shape: (11876, 11876)


In [52]:
# Show the top matches for a single title
print("Example recommendation for 'Naruto':")
naruto_recommendations = get_recommendations('Naruto', cosine_sim=cosine_sim, threshold=0.1)
print(naruto_recommendations)


Example recommendation for 'Naruto':
2445                 Naruto Shippuuden: Sunny Side Battle
1098    Boruto: Naruto the Movie - Naruto ga Hokage ni...
174                                Katekyo Hitman Reborn!
7583                              Kyutai Panic Adventure!
1336                                          Naruto x UT
580                                                Bleach
205                                         Dragon Ball Z
177                                 Boku no Hero Academia
586                                       Dragon Ball Kai
2601                                           Medaka Box
Name: name, dtype: object


In [53]:
# Score the recommender on the held-out titles and report each metric
precision, recall, f1 = evaluate_recommendations(test_data, cosine_sim, threshold=0.1)

for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1-score", f1)):
    print(f"{metric_name}: {metric_value}")


Precision: 0.19452861952861952
Recall: 0.19452861952861952
F1-score: 0.19452861952861952


1. Can you explain the difference between user-based and item-based collaborative filtering?

Collaborative filtering is a technique used in recommendation systems in which recommendations are based on users' past interactions with items (for example, their ratings). The main idea is that users who had similar preferences in the past will have similar preferences in the future.

User-Based Collaborative Filtering:

In user-based collaborative filtering, the system finds users that are similar to the target user based on their historical interactions or preferences (e.g., ratings).
It then recommends items that these similar users liked, assuming that if users liked the same items in the past, they will like similar items in the future.
Example: If User A and User B have rated movies similarly, then the movies that User B liked but User A has not yet seen will be recommended to User A.

Pros:

Simple and intuitive.
Works well when you have a good amount of user data.

Cons:

Can suffer from the cold start problem (difficulty recommending for new users or new items with little data).
It can be computationally expensive for large datasets because it requires comparing the target user with every other user in the system.

Item-Based Collaborative Filtering:

In item-based collaborative filtering, the system looks for items that are similar to the ones the target user has already interacted with.
Instead of finding similar users, it finds items that have been rated similarly by users and recommends those items.
Example: If User A likes Movie X, the system will recommend other movies that were liked by users who also liked Movie X, even if those users' tastes otherwise differ from User A's.

Pros:

It is more scalable than user-based filtering because item similarities don’t change often.
It works better when each item has more interaction data than each user does, for example when there are far more users than items, so that every item has been rated by many users.

Cons:

It can struggle with recommending new or rare items (the cold start problem for items).


2. What is collaborative filtering, and how does it work?

Collaborative Filtering is a popular technique used in recommendation systems, where recommendations are made based on the historical interactions of users or items. The core idea is that if users have agreed on liking certain items in the past, they will likely agree in the future as well.

There are two primary types of collaborative filtering:

Memory-Based Collaborative Filtering:

In this approach, the system uses the entire dataset of user-item interactions to make predictions. It computes similarities between users or items using metrics like cosine similarity, Pearson correlation, or Euclidean distance.
User-Based Memory Filtering: Finds similar users and recommends items based on those.
Item-Based Memory Filtering: Finds similar items and recommends them to users who have already interacted with a given item.
Example: If User A and User B have similar ratings for certain movies, the system will recommend to User A the movies that User B has rated highly but User A has not yet seen.

Model-Based Collaborative Filtering:

In this approach, the system creates a predictive model based on the user-item interaction matrix. This model is used to predict ratings or preferences.
Techniques include Matrix Factorization (e.g., Singular Value Decomposition or SVD), Neural Networks, or Factorization Machines.
The model is trained on historical data to learn latent factors (hidden features) that explain user-item interactions.