In [19]:
# Read the anime catalogue from the CSV file into a DataFrame and display it
import pandas as pd

df = pd.read_csv(filepath_or_buffer="anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [20]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


Based on the available columns in the dataset, we can use the following features for computing similarity:

- **genre**: Genres can be used to find similar anime based on their categories.
- **rating**: The average rating of an anime can indicate its overall quality and help find similar, highly-rated shows.
- **members**: The number of members who have added the anime to their list can indicate popularity.

In [21]:
# One-hot encode the comma-separated `genre` strings into one 0/1 column per genre.
# Make sure to run the preceding cells to load and clean the data before running this cell.
# The guard keeps the cell re-runnable: once `genre` has been replaced by the
# indicator columns, executing the cell again is a no-op instead of a KeyError.
if 'genre' in df.columns:
    genre_dummies = df['genre'].str.get_dummies(sep=', ')
    # Drop the raw text column, append the indicator columns, then reset to a
    # contiguous 0-based index so that row labels equal row positions.
    df = pd.concat([df.drop(columns='genre'), genre_dummies], axis=1).reset_index(drop=True)
df

Unnamed: 0,anime_id,name,type,episodes,rating,members,Action,Adventure,Cars,Comedy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,TV,51,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,OVA,1,4.15,211,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12013,5543,Under World,OVA,1,4.28,183,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12014,5621,Violence Gekiga David no Hoshi,OVA,4,4.88,219,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,1,4.98,175,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
from sklearn.preprocessing import StandardScaler

# Columns that are standardised before being used as similarity features
numerical_features = ['rating', 'members']
scaler = StandardScaler()

# Fit the scaler on these columns, then overwrite them with the scaled values
df[numerical_features] = scaler.fit(df[numerical_features]).transform(df[numerical_features])

display(df.head())

Unnamed: 0,anime_id,name,type,episodes,rating,members,Action,Adventure,Cars,Comedy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,Movie,1,2.824474,3.292044,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,2.717032,14.00241,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,2.707265,1.732216,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,2.629126,11.833499,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,TV,51,2.619358,2.400518,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Task
Select features for similarity calculation, calculate cosine similarity, and design a recommendation function.

## Select features for similarity calculation

### Subtask:
Choose the columns from the dataframe that will be used to calculate cosine similarity.


**Reasoning**:
Identify the feature columns for similarity calculation and store them in a list.



In [23]:
# Names of the one-hot genre columns produced by the encoding step
genre_columns = list(genre_dummies.columns)

# Similarity features: every genre indicator plus the scaled rating and members
features = [*genre_columns, 'rating', 'members']

## Calculate cosine similarity

### Subtask:
Compute the cosine similarity matrix based on the selected features.


**Reasoning**:
Compute the cosine similarity matrix using the selected features.



In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Restrict the frame to the columns chosen as similarity features
features_df = df.loc[:, features]

# Pairwise cosine similarity of every row against every row (square matrix)
cosine_sim = cosine_similarity(X=features_df, Y=features_df)

## Design recommendation function

### Subtask:
Create a Python function that takes an anime title as input and returns a list of recommended anime based on the cosine similarity matrix.


**Reasoning**:
Define a function to recommend anime based on cosine similarity.



In [25]:
def recommend_anime(title, df, cosine_sim, top_n=10, threshold=0):
    """Return the names of the anime most similar to ``title``.

    Args:
        title: Exact value of the ``name`` column to look up. If several rows
            share the name, the first one is used.
        df: DataFrame with a ``name`` column whose row order matches the
            rows/columns of ``cosine_sim``. Its index labels are not used.
        cosine_sim: Square similarity matrix; ``cosine_sim[i][j]`` is the
            similarity between row ``i`` and row ``j`` of ``df``.
        top_n: Maximum number of names to return.
        threshold: Minimum similarity score (inclusive) a candidate must reach.

    Returns:
        A list of at most ``top_n`` names ordered from most to least similar,
        excluding the queried anime itself. An empty list (after printing a
        message) when ``title`` is not present in ``df``.
    """
    # Locate the title by *position*, not by index label: the matrix is indexed
    # positionally and the names are read back with ``.iloc``, so using labels
    # would pick the wrong row whenever the index is not 0..n-1.
    matches = (df['name'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Anime '{title}' not found in the dataset.")
        return []
    idx = int(matches[0])

    # Keep every other row whose score reaches the threshold
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx and score >= threshold
    ]

    # list.sort is stable, so rows with equal scores keep their original order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    best_positions = [pos for pos, _ in candidates[:top_n]]
    return df['name'].iloc[best_positions].tolist()

In [26]:
# Demo: titles similar to 'Death Note', keeping only similarity scores of at least 0.5
recommendations_with_threshold = recommend_anime(
    title='Death Note', df=df, cosine_sim=cosine_sim, threshold=0.5
)
print("Recommendations with threshold (0.5):", recommendations_with_threshold, sep="\n")

Recommendations with threshold (0.5):
['Mirai Nikki (TV)', 'Durarara!!', 'Steins;Gate', 'Shingeki no Kyojin', 'Another', 'Sword Art Online', 'Angel Beats!', 'Tokyo Ghoul', 'Bleach', 'Psycho-Pass']


In [27]:
# Demo: same query relying on the default threshold of 0
recommendations_no_threshold = recommend_anime(
    title='Death Note', df=df, cosine_sim=cosine_sim
)
print("\nRecommendations without threshold (0):", recommendations_no_threshold, sep="\n")


Recommendations without threshold (0):
['Mirai Nikki (TV)', 'Durarara!!', 'Steins;Gate', 'Shingeki no Kyojin', 'Another', 'Sword Art Online', 'Angel Beats!', 'Tokyo Ghoul', 'Bleach', 'Psycho-Pass']


In [28]:
from sklearn.model_selection import train_test_split

# Keep only the feature columns by discarding identifier/metadata columns
# (errors='ignore' skips any of them that are already absent)
features_for_split = df.drop(
    columns=['anime_id', 'name', 'type', 'episodes'],
    errors='ignore',
)

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test = train_test_split(
    features_for_split,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (9613, 45)
Testing set shape: (2404, 45)


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Define a function to get a hypothetical set of "relevant" anime for evaluation
# In a real-world scenario with user data, this would be based on user interactions (e.g., ratings, watch history)
# For this content-based system without user data, we'll make a simplified assumption:
# Let's assume that for each anime in the test set, the "relevant" anime are those that are highly similar (e.g., similarity score above a certain threshold)
# in the original dataset (before the train-test split).
# This is a simplified approach for demonstration purposes and might not reflect true relevance.
def get_relevant_anime(anime_title, df, cosine_sim, similarity_threshold=0.7):
    """
    Gets a hypothetical set of "relevant" anime for evaluation based on a similarity threshold.

    Args:
        anime_title: The title of the anime to find relevant anime for. If several
            rows share the title, the first one is used.
        df: The DataFrame containing anime data (should be the one used to compute
            cosine_sim). Rows are matched to the matrix by position, so the index
            labels of df are not used.
        cosine_sim: The cosine similarity matrix.
        similarity_threshold: The minimum similarity score (inclusive) to consider
            an anime "relevant".

    Returns:
        A set of relevant anime titles, excluding the queried row itself. An empty
        set when anime_title is not present in df.

    Raises:
        IndexError: If cosine_sim has fewer rows than the position of the title
            in df, i.e. the matrix was not computed from this DataFrame.
    """
    # Positional lookup: cosine_sim rows and the .iloc read below are positional,
    # so an index label would be wrong whenever the index is not 0..n-1.
    matches = (df['name'] == anime_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return set()
    idx = int(matches[0])

    # Every other row whose similarity reaches the threshold counts as relevant
    relevant_positions = [
        pos
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx and score >= similarity_threshold
    ]
    return set(df['name'].iloc[relevant_positions].tolist())

# Function to evaluate recommendations for a list of anime
def evaluate_recommendations(anime_list, df, cosine_sim, top_n=10, recommendation_threshold=0, relevance_threshold=0.7):
    """Average precision, recall and F1 of the recommender over a list of titles.

    For each title the recommendations from ``recommend_anime`` are compared, as
    sets of names, with the "relevant" set from ``get_relevant_anime``.

    Args:
        anime_list: Iterable of anime titles to evaluate.
        df: DataFrame passed through to both helper functions.
        cosine_sim: Similarity matrix passed through to both helper functions.
        top_n: Maximum number of recommendations requested per title.
        recommendation_threshold: Minimum similarity for a recommendation.
        relevance_threshold: Minimum similarity for an anime to count as relevant.

    Returns:
        A tuple ``(avg_precision, avg_recall, avg_f1)`` of floats. A title whose
        recommended or relevant set is empty (including a title that is not
        found) contributes 0 to the affected metric. All three values are 0.0
        when ``anime_list`` is empty.
    """
    precisions = []
    recalls = []
    f1_scores = []

    for anime_title in anime_list:
        recommended_set = set(
            recommend_anime(anime_title, df, cosine_sim, top_n=top_n, threshold=recommendation_threshold)
        )
        relevant_set = get_relevant_anime(
            anime_title, df, cosine_sim, similarity_threshold=relevance_threshold
        )

        true_positives = len(recommended_set & relevant_set)
        false_positives = len(recommended_set - relevant_set)
        false_negatives = len(relevant_set - recommended_set)

        # Each ratio falls back to 0 when its denominator is 0
        predicted_total = true_positives + false_positives
        relevant_total = true_positives + false_negatives
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / relevant_total if relevant_total > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # np.mean([]) returns nan and emits a RuntimeWarning; report zeros instead,
    # consistent with the zero-denominator convention used above.
    if not precisions:
        return 0.0, 0.0, 0.0

    return float(np.mean(precisions)), float(np.mean(recalls)), float(np.mean(f1_scores))

# Evaluate on at most 100 titles taken from the held-out rows. Intersecting the
# test labels with df.index keeps only labels that exist in the frame used for
# the cosine similarity calculation.
test_anime_titles = df.loc[X_test.index.intersection(df.index), 'name'].head(100).tolist()


# Score the recommender on those titles
avg_precision, avg_recall, avg_f1 = evaluate_recommendations(
    test_anime_titles,
    df,
    cosine_sim,
    top_n=10,
    recommendation_threshold=0.5,
    relevance_threshold=0.7,
)

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1:.4f}")

Average Precision: 0.9960
Average Recall: 0.1446
Average F1-score: 0.2048


Based on the results:

Average Precision: 0.9960 - This is very high, suggesting that when the system recommends an anime, it is almost always considered "relevant" according to our simplified definition of relevance (high similarity based on content features).

Average Recall: 0.1446 - This is quite low, indicating that the system is only recommending a small fraction of the anime that are considered "relevant". It's missing many potentially relevant recommendations.

Average F1-score: 0.2048 - This score is a harmonic mean of precision and recall and is low, which reflects the low recall.

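Performance Analysis: The high precision is largely a by-product of the evaluation design: "relevant" anime are defined with the same cosine similarity that ranks the recommendations, so the top-ranked items almost always clear the 0.7 relevance threshold (the recommendation threshold is 0.5). The low recall follows mainly from the cap of top_n = 10 recommendations per title: whenever more than 10 anime score at least 0.7, recall cannot exceed 10 divided by the size of the relevant set. The system is therefore good at finding very similar anime but, with only ten slots, fails to capture a broader range of potentially relevant items. The definition of "relevant" here is based purely on content similarity, which might not fully align with what a user would consider relevant in a real-world scenario.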

Areas of Improvement:

Refine Relevance Definition: The current evaluation's definition of relevance is based on content similarity. A better evaluation would involve actual user data (if available) to define relevance based on user interactions (e.g., high ratings, watch history).

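Adjust Thresholds and top_n: In this evaluation recall is bounded by top_n divided by the number of relevant anime, so increasing top_n is the most direct way to raise recall. Raising relevance_threshold shrinks the relevant set and therefore tends to raise recall, whereas lowering it enlarges the relevant set and lowers recall. Lowering recommendation_threshold only matters when fewer than top_n anime pass it, and the extra items it admits fall below the relevance threshold, so it can decrease precision without improving recall.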

Feature Engineering: Explore including other features like anime popularity over time, studio, or voice actors, if available, to enrich the content representation.

Explore Other Recommendation Techniques: Content-based filtering has limitations. Consider incorporating collaborative filtering techniques (based on user behavior) or hybrid approaches to improve recommendations.

Hyperparameter Tuning: If using a machine learning model for recommendations, tune its hyperparameters. (Although not applicable to the current cosine similarity approach directly, it's relevant for other methods).

User Evaluation: The best way to evaluate a recommendation system is through user studies to see if the recommendations are actually helpful and relevant to users.
This evaluation provides a starting point, but the low recall highlights the need to explore ways to recommend a wider variety of relevant anime.

In [None]:
# Question: Can you explain the difference between user-based and item-based collaborative filtering?
# The answer below is a bare triple-quoted string (the cell's only expression), not a comment.
# UBCF = user-based collaborative filtering, IBCF = item-based collaborative filtering.
'''
Similarity Basis:
UBCF finds similar users, while IBCF finds similar items

Computational Cost:
UBCF can be more computationally intensive for real-time recommendations in large user bases, while IBCF benefits from offline
pre-computation of item similarities.

Stability:
Item similarities in IBCF are typically more stable than user similarities in UBCF.

Dataset Sparsity:
IBCF can sometimes handle sparse datasets (where users have interacted with few items) more effectively than UBCF, as reliable
item-item patterns can still emerge.
'''

In [None]:
#2. What is collaborative filtering, and how does it work?
'''
Data Collection:
The process begins by collecting data on user-item interactions. This typically involves ratings, purchases, views, or other
explicit or implicit feedback from users on various items. This data is often represented in a user-item matrix, where rows
represent users and columns represent items, and the cells contain the user's interaction with the item (e.g., a rating).

Finding Similarities:
User-Based Collaborative Filtering: This approach identifies users who have similar tastes or preferences. Similarity is
calculated by comparing the ratings or interactions of different users across common items. For example, if User A and User B
both rated several movies similarly, they are considered similar users.

Item-Based Collaborative Filtering: This approach identifies items that are frequently liked or interacted with by the same
users. Similarity is calculated by comparing the rating patterns of different items. For example, if users who liked Movie X
also frequently liked Movie Y, then Movie X and Movie Y are considered similar items.

Generating Recommendations:
User-Based: Once similar users are identified, the system recommends items to the target user that their similar "neighbors"
have liked but the target user has not yet interacted with.

Item-Based: When a user interacts with an item, the system finds items similar to that one and recommends them to the user.