In [3]:
# ---------------------------------------------
# STEP 1: DATA PREPROCESSING
# ---------------------------------------------

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("anime.csv")

# Basic exploration
print("Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nMissing values:\n", df.isnull().sum())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

# Handle missing values: text columns get the placeholder 'Unknown', numeric
# columns get a statistic computed from their own non-missing values.
# The result is re-bound instead of mutated in place so that re-running this
# cell always starts from the freshly loaded frame.
df = df.fillna({
    'genre': 'Unknown',
    'type': 'Unknown',
    'rating': df['rating'].mean(),
    'members': df['members'].median()
})

# ---------------------------------------------
# STEP 2: FEATURE EXTRACTION
# ---------------------------------------------

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# --- 2A. Genre: multi-label string -> binary indicator columns ---
# Every 'genre' cell is split on ", " into a list of individual labels.
df['genre_list'] = df['genre'].map(lambda labels: labels.split(', '))

# The binarizer learns the label set from the lists; mlb.classes_ supplies the
# column names for the resulting indicator matrix.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genre_list'])
genre_df = pd.DataFrame(data=genre_matrix, columns=mlb.classes_)

# --- 2B. Numeric columns rescaled with MinMaxScaler ---
numeric_features = df.loc[:, ['rating', 'members']]
scaler = MinMaxScaler()
numeric_scaled = scaler.fit_transform(numeric_features)
numeric_df = pd.DataFrame(
    data=numeric_scaled,
    columns=['rating_scaled', 'members_scaled'],
)

# --- 2C. Place both feature groups side by side ---
final_features = pd.concat([genre_df, numeric_df], axis="columns")
print("Final feature matrix shape:", final_features.shape)

# ---------------------------------------------
# STEP 3: SIMILARITY MATRIX
# ---------------------------------------------

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the feature matrix; entry
# [i, j] compares feature row i with feature row j.
similarity_matrix = cosine_similarity(final_features)
print("Similarity matrix shape:", np.shape(similarity_matrix))

# ---------------------------------------------
# STEP 4: RECOMMENDATION FUNCTION
# ---------------------------------------------

def recommend_anime(title, df, similarity_matrix, threshold=0.3, top_n=20):
    """Return the titles most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['name']``. If several rows carry the
        same name, the first one is used.
    df : pandas.DataFrame
        Frame with a ``name`` column. Row *i* by position must correspond to
        row/column *i* of ``similarity_matrix``; the index labels of ``df``
        are not used.
    similarity_matrix : numpy.ndarray
        Square array of pairwise similarity scores.
    threshold : float, default 0.3
        Minimum score (inclusive) a candidate needs to be kept.
    top_n : int or None, default 20
        Maximum number of rows returned; ``None`` returns every candidate
        that passes the threshold.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'Recommended Anime'`` and ``'Similarity Score'`` ordered by
        descending score. ``None`` is returned (after printing a message)
        when ``title`` does not occur in ``df['name']``.
    """
    # Resolve the query row by POSITION. The similarity matrix and the
    # ``iloc`` lookup below are positional, so using the index label here
    # would break as soon as ``df`` has a non-default index.
    match_positions = np.flatnonzero((df['name'] == title).to_numpy())
    if match_positions.size == 0:
        print("Anime not found!")
        return None

    anime_idx = match_positions[0]
    scores = similarity_matrix[anime_idx]

    # Keep candidates at or above the threshold, excluding the query itself.
    valid_indices = np.flatnonzero(scores >= threshold)
    valid_indices = valid_indices[valid_indices != anime_idx]

    # Highest score first, then cap the list length.
    descending = np.argsort(scores[valid_indices])[::-1]
    sorted_indices = valid_indices[descending][:top_n]

    return pd.DataFrame({
        'Recommended Anime': df.iloc[sorted_indices]['name'].values,
        'Similarity Score': scores[sorted_indices]
    })

# Demo call: up to ten matches for one title at the 0.3 threshold.
# The header is printed before the lookup so any message emitted by the
# lookup itself appears underneath it.
print("\nRecommendations for 'Naruto':")
naruto_recommendations = recommend_anime(
    "Naruto",
    df,
    similarity_matrix,
    threshold=0.3,
    top_n=10,
)
print(naruto_recommendations)


# ---------------------------------------------
# STEP 5: EXPERIMENT WITH DIFFERENT THRESHOLDS
# ---------------------------------------------

def experiment_thresholds(title, thresholds, top_n=None):
    """Count how many recommendations ``title`` gets at each threshold.

    Uses the module-level ``df`` and ``similarity_matrix`` and delegates the
    lookup to ``recommend_anime``.

    Parameters
    ----------
    title : str
        Title passed through to ``recommend_anime``.
    thresholds : iterable of float
        Similarity thresholds to try, in the order they should be reported.
    top_n : int or None, default None
        Cap forwarded to ``recommend_anime``. The default ``None`` disables
        the cap so the count reflects every title that passes the threshold;
        with a finite cap all counts saturate at that value and the effect of
        the threshold is hidden.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``"Threshold"`` and
        ``"Recommendations"``. A title that is not found counts as 0.
    """
    rows = []
    for t in thresholds:
        recs = recommend_anime(title, df, similarity_matrix, threshold=t, top_n=top_n)
        count = len(recs) if recs is not None else 0
        rows.append([t, count])
    return pd.DataFrame(rows, columns=["Threshold", "Recommendations"])

# Thresholds 0.1, 0.2, ..., 0.8. linspace takes an explicit element count, so
# the number of thresholds cannot shift with floating-point rounding of a
# fractional step; rounding to one decimal removes artefacts such as
# 0.30000000000000004 from the values that are compared and reported.
threshold_list = np.round(np.linspace(0.1, 0.8, 8), 1)

print("\nThreshold Experiment:")
threshold_results = experiment_thresholds("Naruto", threshold_list)
print(threshold_results)


Shape: (12294, 7)

Columns: Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

Missing values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 480.3+ KB
None
Final feature matrix shape: (12294, 46)
Similarity matrix shape: (12294, 12294)

Recommendations for 'Naruto':
                                   Recommended Anime  Similarity Score
0                 

In [5]:
"""The recommendation system is dominated by genre features, while rating and popularity have very little impact.
Similarity scores are sensitive to thresholds, and the system lacks semantic understanding because text descriptions are not used.
Adding TF-IDF text features, applying feature weights, and including more metadata (studio, year, type) would improve accuracy.
Hybrid models, dimensionality reduction, and better evaluation metrics can further enhance recommendation quality."""

'The recommendation system is dominated by genre features, while rating and popularity have very little impact.\nSimilarity scores are sensitive to thresholds, and the system lacks semantic understanding because text descriptions are not used.\nAdding TF-IDF text features, applying feature weights, and including more metadata (studio, year, type) would improve accuracy.\nHybrid models, dimensionality reduction, and better evaluation metrics can further enhance recommendation quality.'

In [7]:
"""User-based collaborative filtering finds similar users and recommends items they liked. Item-based collaborative filtering finds similar items to those a user already enjoyed. User-based works well in smaller datasets but struggles with many users. Item-based scales better, as item relationships are more stable over time."""

'User-based collaborative filtering finds similar users and recommends items they liked. Item-based collaborative filtering finds similar items to those a user already enjoyed. User-based works well in smaller datasets but struggles with many users. Item-based scales better, as item relationships are more stable over time.'