In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('anime.csv')

# Drop entries with 'Hentai' genre as they can skew similarity calculations
df = df[~df['genre'].str.contains('Hentai', na=False)]

# Handle 'Unknown' episodes and convert to numeric
df['episodes'] = df['episodes'].replace('Unknown', np.nan)
df['episodes'] = pd.to_numeric(df['episodes'])

# Corrected line to handle the Future Warning
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

# Drop rows with any remaining missing values for a cleaner dataset
df.dropna(inplace=True)

# Process genres by creating a one-hot encoded DataFrame
genres = df['genre'].str.get_dummies(sep=', ')
df = pd.concat([df, genres], axis=1)

# One-hot encode the 'type' column
df = pd.get_dummies(df, columns=['type'], prefix='type', drop_first=True)

# Select numerical features for normalization
numerical_features = ['episodes', 'rating', 'members']
df[numerical_features] = df[numerical_features].replace('Unknown', np.nan)
df[numerical_features] = df[numerical_features].astype(float)
df.dropna(subset=numerical_features, inplace=True)
df[numerical_features] = df[numerical_features].fillna(df[numerical_features].mean())

# Normalize numerical features using MinMaxScaler
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Combine all features into a single feature matrix
feature_cols = df.columns.difference(['anime_id', 'name', 'genre'])
feature_matrix = df[feature_cols]

print("Preprocessing and Feature Transformation Complete.")
print("Transformed Feature Matrix Shape:", feature_matrix.shape)

Preprocessing and Feature Transformation Complete.
Transformed Feature Matrix Shape: (10884, 50)


In [6]:
# --- 2. Updated Recommendation Logic ---
def get_recommendations(anime_name, feature_matrix, df, top_n=10):
    """
    Recommends anime based on cosine similarity.
    
    Args:
        anime_name (str): The name of the anime to get recommendations for.
        feature_matrix (DataFrame): The preprocessed feature matrix.
        df (DataFrame): The original DataFrame with anime names.
        top_n (int): The number of recommendations to return.
        
    Returns:
        DataFrame: A DataFrame of the top_n recommended anime.
    """
    if anime_name not in df['name'].values:
        print(f"Anime '{anime_name}' not found in the dataset.")
        return None
    
    # Get the index of the target anime
    anime_index = df[df['name'] == anime_name].index[0]
    
    # Calculate cosine similarity with all other anime
    cosine_sim = cosine_similarity(feature_matrix, feature_matrix.iloc[anime_index].values.reshape(1, -1))
    
    # Get the top N most similar anime indices
    similar_indices = cosine_sim.argsort(axis=0)[-top_n-1:-1][::-1].flatten()

    # Dynamically select the 'type' columns that exist in the DataFrame
    type_cols = [col for col in df.columns if col.startswith('type_')]
    
    # Get the names, genre, rating, and dynamically found type columns of the top N recommendations
    recommendation_cols = ['name', 'genre', 'rating'] + type_cols
    recommendations = df.iloc[similar_indices][recommendation_cols]

    # Add the similarity score
    recommendations['similarity_score'] = cosine_sim[similar_indices].flatten()
    
    return recommendations.sort_values(by='similarity_score', ascending=False)

# Example Usage:
print("\n--- Example Recommendations ---")
recommendations = get_recommendations('Death Note', feature_matrix, df)
if recommendations is not None:
    print(recommendations)


--- Example Recommendations ---
                                   name  \
144       Higurashi no Naku Koro ni Kai   
981                     Mousou Dairinin   
334           Higurashi no Naku Koro ni   
778                  Death Note Rewrite   
445                    Mirai Nikki (TV)   
833            Jigoku Shoujo Mitsuganae   
2691  Yakushiji Ryouko no Kaiki Jikenbo   
6323            Saint Luminous Jogakuin   
49              Boku dake ga Inai Machi   
250                   Zankyou no Terror   

                                                  genre    rating  type_Music  \
144      Mystery, Psychological, Supernatural, Thriller  0.809124       False   
981   Drama, Mystery, Police, Psychological, Superna...  0.728691       False   
334   Horror, Mystery, Psychological, Supernatural, ...  0.780312       False   
778   Mystery, Police, Psychological, Supernatural, ...  0.740696       False   
445   Action, Mystery, Psychological, Shounen, Super...  0.768307       False   
833    

In [7]:
# Split data into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_feature_matrix = train_df[feature_cols]

def evaluate_recommendations(test_df, train_df, train_feature_matrix):
    """
    Evaluates the recommendation system on the test set using precision and recall.
    
    Args:
        test_df (DataFrame): The test dataset.
        train_df (DataFrame): The training dataset.
        train_feature_matrix (DataFrame): The feature matrix of the training data.
        
    Returns:
        tuple: A tuple containing the average precision, recall, and F1-score.
    """
    precisions = []
    recalls = []
    
    for _, test_anime in test_df.iterrows():
        # Get recommendations from the training data for each anime in the test set
        recs = get_recommendations(test_anime['name'], train_feature_matrix, train_df, top_n=10)
        
        if recs is not None:
            # Get genres of the target test anime
            test_genres = set(test_anime['genre'].split(', '))
            
            # Count how many recommendations are "relevant" (share a genre)
            relevant_recs = 0
            for _, rec in recs.iterrows():
                rec_genres = set(rec['genre'].split(', '))
                if not test_genres.isdisjoint(rec_genres):
                    relevant_recs += 1
            
            # Calculate precision and recall
            precision = relevant_recs / 10 if 10 > 0 else 0
            recall = relevant_recs / len(test_genres) if len(test_genres) > 0 else 0
            
            precisions.append(precision)
            recalls.append(recall)
            
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    
    return avg_precision, avg_recall, f1_score

# Run evaluation
print("\n--- Evaluation Metrics ---")
avg_precision, avg_recall, f1_score = evaluate_recommendations(test_df, train_df, train_feature_matrix)
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


--- Evaluation Metrics ---
Anime 'Kinnikuman: Kessen! Shichinin no Seigi Choujin vs. Uchuu Nobushi' not found in the dataset.
Anime 'Goro-chan' not found in the dataset.
Anime 'Sword Art Online: Sword Art Offline - Extra Edition' not found in the dataset.
Anime 'Tari Tari' not found in the dataset.
Anime 'One Piece: Adventure of Nebulandia' not found in the dataset.
Anime 'Minna Atsumare! Falcom Gakuen' not found in the dataset.
Anime 'Hokuto no Ken: Yuria-den' not found in the dataset.
Anime 'X-Men' not found in the dataset.
Anime 'Rozen Maiden: Meitantei Kunkun - Duell Walzer' not found in the dataset.
Anime 'Mushishi Zoku Shou: Suzu no Shizuku' not found in the dataset.
Anime 'Code Geass: Boukoku no Akito Final - Itoshiki Monotachi e Picture Drama' not found in the dataset.
Anime 'Feng Ji Yun Nu' not found in the dataset.
Anime 'Meiken Lassie Specials' not found in the dataset.
Anime 'Kaiketsu Tamagon' not found in the dataset.
Anime 'Genki Bakuhatsu Ganbaruger: Hyakka' not found i

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [8]:
# Split data into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_feature_matrix = train_df[feature_cols]

def evaluate_recommendations(test_df, train_df, train_feature_matrix):
    """
    Evaluates the recommendation system on the test set using precision and recall.
    
    Args:
        test_df (DataFrame): The test dataset.
        train_df (DataFrame): The training dataset.
        train_feature_matrix (DataFrame): The feature matrix of the training data.
        
    Returns:
        tuple: A tuple containing the average precision, recall, and F1-score.
    """
    precisions = []
    recalls = []
    
    for _, test_anime in test_df.iterrows():
        # Get recommendations from the training data for each anime in the test set
        recs = get_recommendations(test_anime['name'], train_feature_matrix, train_df, top_n=10)
        
        if recs is not None:
            # Get genres of the target test anime
            test_genres = set(test_anime['genre'].split(', '))
            
            # Count how many recommendations are "relevant" (share a genre)
            relevant_recs = 0
            for _, rec in recs.iterrows():
                rec_genres = set(rec['genre'].split(', '))
                if not test_genres.isdisjoint(rec_genres):
                    relevant_recs += 1
            
            # Calculate precision and recall
            precision = relevant_recs / 10 if 10 > 0 else 0
            recall = relevant_recs / len(test_genres) if len(test_genres) > 0 else 0
            
            precisions.append(precision)
            recalls.append(recall)
            
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    
    return avg_precision, avg_recall, f1_score

# Run evaluation
print("\n--- Evaluation Metrics ---")
avg_precision, avg_recall, f1_score = evaluate_recommendations(test_df, train_df, train_feature_matrix)
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


--- Evaluation Metrics ---
Anime 'Kinnikuman: Kessen! Shichinin no Seigi Choujin vs. Uchuu Nobushi' not found in the dataset.
Anime 'Goro-chan' not found in the dataset.
Anime 'Sword Art Online: Sword Art Offline - Extra Edition' not found in the dataset.
Anime 'Tari Tari' not found in the dataset.
Anime 'One Piece: Adventure of Nebulandia' not found in the dataset.
Anime 'Minna Atsumare! Falcom Gakuen' not found in the dataset.
Anime 'Hokuto no Ken: Yuria-den' not found in the dataset.
Anime 'X-Men' not found in the dataset.
Anime 'Rozen Maiden: Meitantei Kunkun - Duell Walzer' not found in the dataset.
Anime 'Mushishi Zoku Shou: Suzu no Shizuku' not found in the dataset.
Anime 'Code Geass: Boukoku no Akito Final - Itoshiki Monotachi e Picture Drama' not found in the dataset.
Anime 'Feng Ji Yun Nu' not found in the dataset.
Anime 'Meiken Lassie Specials' not found in the dataset.
Anime 'Kaiketsu Tamagon' not found in the dataset.
Anime 'Genki Bakuhatsu Ganbaruger: Hyakka' not found i

In [None]:
4. Analysis and Improvement
Analysis:

Strengths: The system effectively captures similarities based on genres and anime type due to one-hot encoding. For example, 
if you input a TV series, the recommendations will likely be other TV series with similar genres.

Weaknesses: The current evaluation model for precision and recall is limited. A better approach would require user-item
interaction data (e.g., user watch history or ratings) to truly determine relevance. The members and rating features, while normalized, 
might not be as influential as the genre and type features in a simple cosine similarity model.

In [None]:
Possible Improvements:

Weighted Features: You can introduce weights to features in the cosine similarity calculation to prioritize some over others. 
For instance, you could give more weight to genres than to the number of episodes or members.

User-Item Interaction Data: A more advanced system would use user ratings or watch history to build a user-based collaborative 
filtering model, where recommendations are made based on what similar users have enjoyed.

Advanced Algorithms: Implement more sophisticated algorithms like matrix factorization (e.g., Singular Value Decomposition) 
which can uncover latent factors that influence user preferences and item characteristics.

Hybrid Approach: A hybrid model that combines the content-based approach (using genre, type) with collaborative filtering 
could provide more accurate and diverse recommendations.


In [None]:
'''
1. Can you explain the difference between user-based and item-based collaborative filtering?
The key difference lies in whether the similarity is calculated between users or items.

User-Based Collaborative Filtering: This method finds users who are similar to you based on your past ratings and preferences.
For example, if you and another user both gave high ratings to the same set of movies, the system will consider you similar. 
It then recommends items that the other user liked but you haven't seen yet. The core idea is "people who liked X also liked Y."

Item-Based Collaborative Filtering: This method finds items that are similar to the ones you've already liked. For example, if a group 
of users who liked "Movie A" also liked "Movie B," then "Movie B" is considered similar to "Movie A." The system then recommends "Movie B" 
to anyone who likes "Movie A." The core idea is "items that are liked by similar people are similar."
'''

In [None]:
'''
2. What is collaborative filtering, and how does it work?

Collaborative filtering is a technique used by recommendation systems to make personalized recommendations by leveraging the opinions of a community
of users. It's based on the idea that people often trust recommendations from others with similar tastes.

How it Works:
Data Collection: The system collects data on user behavior, such as ratings, purchases, or viewing history.
This data is typically represented in a user-item matrix, where rows are users, columns are items, and the cells contain ratings or preferences.

Similarity Calculation: The system then calculates similarity.

In user-based filtering, it measures how similar one user's preferences are to another's.

In item-based filtering, it measures how similar one item is to another based on how users rated them.

Recommendation Generation:

For user-based filtering, the system finds the N most similar users to you and recommends the items they liked most that you haven't seen.

For item-based filtering, the system identifies the items most similar to the ones you've liked and recommends them.
'''