# Install Required Packages

In [None]:
# Install necessary packages using pip
!pip install scikit-surprise
!pip install --upgrade scikit-learn

# Import Libraries

In [4]:
# Import required libraries for data processing, visualization, and modeling

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text processing in content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For collaborative filtering using the Surprise library
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate, train_test_split

import warnings
# Suppress every warning category for the rest of the session.
# This also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Confirmation message printed once the import cell has run to the end.
print("Libraries imported successfully.")

Libraries imported successfully.


# Clone GitHub Repo

In [1]:
# Clone the GitHub repository
!git clone https://github.com/SomersInias/AI-Anime-Recommendation.git

Cloning into 'AI-Anime-Recommendation'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 9 (delta 0), reused 6 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 25.10 MiB | 9.59 MiB/s, done.


# Data Ingestion – Load the Datasets

In [5]:
# Load the datasets.
# Both CSV files are read from the repository cloned in the previous cell;
# DATA_DIR is the single place to edit if the clone lives somewhere else.
DATA_DIR = '/content/AI-Anime-Recommendation/datasets'

anime_df = pd.read_csv(f'{DATA_DIR}/anime.csv')
rating_df = pd.read_csv(f'{DATA_DIR}/rating.csv')

print("Anime dataset shape:", anime_df.shape)
print("Rating dataset shape:", rating_df.shape)

Anime dataset shape: (12294, 7)
Rating dataset shape: (7813737, 3)


# Data Cleaning and Preprocessing

In [None]:
# Data cleaning:
# - Remove exact duplicate rows from both tables.
# - Drop ratings equal to -1 (if used as a flag for 'not rated').
# - Replace missing genres with an empty string.
#
# Each step assigns a new frame instead of mutating in place, so re-running the
# cell gives the same result. The index is rebuilt afterwards so that row labels
# equal row positions: later cells take values from anime_df.index and use them
# as offsets into the cosine-similarity matrix, which is only valid when the two
# agree.

anime_df = anime_df.drop_duplicates().reset_index(drop=True)
anime_df['genre'] = anime_df['genre'].fillna('')

rating_df = rating_df.drop_duplicates()
rating_df = rating_df[rating_df['rating'] != -1].reset_index(drop=True)

print("Cleaned anime dataset shape:", anime_df.shape)
print("Cleaned rating dataset shape:", rating_df.shape)

# Exploratory Data Analysis

In [None]:
# Histogram of the individual user ratings (rating table).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(rating_df['rating'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of User Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()

# Histogram of the per-anime rating stored in the anime metadata table.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(anime_df['rating'], bins=20, kde=True, color='salmon', ax=ax)
ax.set_title("Distribution of Anime Ratings (Metadata)")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()

# Genre frequencies: the 'genre' column holds comma-separated labels, so split
# it, put one label per row, trim whitespace, and count the non-empty labels.
anime_genres = (
    anime_df['genre']
    .str.split(',')
    .explode()
    .str.strip()
)
has_label = anime_genres != ''
genre_counts = anime_genres[has_label].value_counts().head(10)

# Horizontal bar chart of the ten most frequent genres.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette='viridis', ax=ax)
ax.set_title("Top 10 Anime Genres")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
plt.show()

# Build the Collaborative Filtering Model (SVD)

In [None]:
# Fix the random seed so the cross-validation folds, the reported RMSE and the
# fitted factors are the same on every run. Seeding NumPy's global generator is
# the approach described in the Surprise FAQ; the algorithm additionally gets
# its own random_state.
SEED = 42
np.random.seed(SEED)

# Reader describing the rating scale, assumed here to run from 1 to 10.
reader = Reader(rating_scale=(1, 10))

# Load the (user, item, rating) triples into Surprise's Dataset format.
data = Dataset.load_from_df(rating_df[['user_id', 'anime_id', 'rating']], reader)

# Evaluate the SVD model using 5-fold cross-validation.
svd = SVD(random_state=SEED)
cv_results = cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

# Train the SVD model on the full dataset; this fitted `svd` is what the
# recommendation functions below call.
trainset = data.build_full_trainset()
svd.fit(trainset)
print("SVD model training complete.")

# Collaborative Filtering – Generate Recommendations Using SVD

In [None]:
# Function to generate SVD-based recommendations for a given user_id.
def get_svd_recommendations(user_id, n=10):
    """Return the top-n unseen anime for a user, ranked by SVD-predicted rating.

    Reads the module-level ``rating_df``, ``anime_df`` and fitted ``svd`` model.

    Parameters:
        user_id: The ID of the user for whom to generate recommendations.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``anime_id``, ``name`` and ``predicted_rating``,
        sorted by predicted rating (highest first). The columns are present
        even when there is nothing to recommend.
    """
    # First name found for each anime_id (mirrors taking the first match).
    catalogue = anime_df.drop_duplicates(subset='anime_id')
    name_by_id = dict(zip(catalogue['anime_id'], catalogue['name']))

    # Set membership keeps the candidate filter linear in the number of anime.
    already_rated = set(rating_df.loc[rating_df['user_id'] == user_id, 'anime_id'])

    # Candidates: anime that appear in the ratings, that the user has not rated,
    # and that have a name in anime_df. Filtering on the name *before* the top-n
    # cut keeps a high-scoring id without metadata from using up one of the n slots.
    predictions = [
        (aid, svd.predict(user_id, aid).est)
        for aid in rating_df['anime_id'].unique()
        if aid not in already_rated and aid in name_by_id
    ]

    # Sort candidates based on the predicted rating (highest first).
    predictions.sort(key=lambda x: x[1], reverse=True)

    recs = [
        {'anime_id': aid, 'name': name_by_id[aid], 'predicted_rating': pred_rating}
        for aid, pred_rating in predictions[:n]
    ]
    return pd.DataFrame(recs, columns=['anime_id', 'name', 'predicted_rating'])

# Test the SVD recommendations for a sample user.
# random_state pins the draw so the same user (and therefore the same outputs
# in the cells below) comes up on every run.
sample_user = rating_df['user_id'].sample(1, random_state=42).iloc[0]
print("Generating SVD recommendations for user_id:", sample_user)
svd_recs = get_svd_recommendations(sample_user, n=10)
svd_recs

In [None]:
# This cell used to be an exact copy of the previous one: it redefined
# get_svd_recommendations (shadowing the identical definition) and drew a second
# random sample_user, so svd_recs and every later cell described a different
# user than the one printed above. The function, sample_user and svd_recs from
# the previous cell are reused instead; this cell only shows the result again.
svd_recs

# Build the Content-Based Filtering Model Using TF-IDF

In [None]:
# For content-based filtering, we vectorize the 'genre' column using TF-IDF.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])

# Cosine similarity between every pair of anime. Row/column i corresponds to the
# i-th row of anime_df, because the matrix was fitted on anime_df['genre'] in order.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Cosine similarity matrix computed.")

# Reverse mapping from anime name to its row position for easy lookup.
# - Positions, not index labels, are stored: cosine_sim is addressed by position.
# - Series.drop_duplicates() removes repeated *values*, which are all distinct
#   here, so it never removed repeated names. Filtering on the index keeps the
#   first row for each name and makes indices[title] always return a scalar.
indices = pd.Series(range(len(anime_df)), index=anime_df['name'])
indices = indices[~indices.index.duplicated(keep='first')]


# Content-Based Filtering – Generate Recommendations Based on an Anime Title

In [None]:
# Function to get content-based recommendations for a given anime title.
def get_content_recommendations(title, n=10):
    """Return the n anime whose genre vectors are most similar to ``title``.

    Reads the module-level ``indices`` (name -> row), ``cosine_sim`` and ``anime_df``.

    Parameters:
        title: Anime name, matched exactly against the ``indices`` lookup.
        n: Number of recommendations to return.

    Returns:
        DataFrame with columns ``anime_id``, ``name``, ``genre`` and ``rating``,
        most similar first, or None (after printing a message) when the title
        is unknown.
    """
    if title not in indices:
        print("Anime title not found in the dataset.")
        return None

    idx = indices[title]
    # A name that occurs more than once makes the lookup return a Series;
    # use the first occurrence so idx is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Leave the query anime out by its index rather than by dropping the first
    # sorted entry: other anime with an identical genre vector tie with it at the
    # top score, and the stable sort would then put a lower-indexed one first,
    # leaving the query itself in the results.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    anime_indices = [i for i, _ in sim_scores[:n]]
    return anime_df.iloc[anime_indices][['anime_id', 'name', 'genre', 'rating']]

# Demo run: query the content-based recommender with the first title in anime_df.
sample_title = anime_df['name'].iat[0]
print("Generating content-based recommendations for:", sample_title)
content_recs = get_content_recommendations(title=sample_title, n=10)
content_recs


# Hybrid Recommendation Approach

In [None]:
# A simple hybrid approach combines the SVD predicted ratings (collaborative) with content similarity.
# 'alpha' determines the weight for the collaborative part (with 1 - alpha for content similarity).

def hybrid_recommendations(user_id, anime_title, alpha=0.5, n=10):
    """
    Returns hybrid recommendations based on:
    - SVD collaborative filtering predictions.
    - Cosine similarity with a given anime title.

    Reads the module-level ``indices``, ``cosine_sim`` and ``anime_df`` and calls
    ``get_svd_recommendations``.

    Parameters:
        user_id: The ID of the user for whom to generate recommendations.
        anime_title: The anime title used as context for content similarity.
        alpha: Weight for collaborative filtering score (0 <= alpha <= 1).
        n: Number of recommendations to return.

    Returns:
        DataFrame with columns ``anime_id``, ``name`` and ``hybrid_score``
        (highest score first), or None (after printing a message) when
        ``anime_title`` is not in ``indices``.
    """
    # Validate the context title before anything else: the SVD pass below
    # predicts a rating for every unseen anime and is the expensive step.
    if anime_title not in indices:
        print("Anime title not found for content-based filtering.")
        return None
    title_idx = indices[anime_title]
    # A name that occurs more than once yields a Series; use the first occurrence.
    if isinstance(title_idx, pd.Series):
        title_idx = title_idx.iloc[0]
    title_similarity = cosine_sim[int(title_idx)]

    # Get collaborative filtering recommendations (many candidates).
    svd_recs = get_svd_recommendations(user_id, n=500)

    # anime_id -> row position in anime_df (first occurrence wins). cosine_sim is
    # addressed by position, so positions are used instead of index labels, and
    # building the map once avoids scanning anime_df for every candidate.
    position_by_id = {}
    for position, anime_id in enumerate(anime_df['anime_id']):
        position_by_id.setdefault(anime_id, position)

    scores = []
    # For each candidate from SVD, compute a hybrid score.
    for _, row in svd_recs.iterrows():
        candidate_pos = position_by_id.get(row['anime_id'])
        if candidate_pos is None:
            # No metadata row, hence no content score: skip the candidate.
            continue
        content_score = title_similarity[candidate_pos]
        # Normalize the collaborative score (assuming ratings range from 1 to 10).
        collab_score = (row['predicted_rating'] - 1) / 9
        hybrid_score = alpha * collab_score + (1 - alpha) * content_score
        scores.append((row['anime_id'], row['name'], hybrid_score))

    # Sort the candidates by the hybrid score (highest first).
    scores.sort(key=lambda x: x[2], reverse=True)
    return pd.DataFrame(scores[:n], columns=['anime_id', 'name', 'hybrid_score'])

# Demo run: blend both recommenders for the sample user, using the sample title as context.
hybrid_header = "Hybrid Recommendations for user_id {} with context '{}':".format(sample_user, sample_title)
print(hybrid_header)
hybrid_recs = hybrid_recommendations(
    user_id=sample_user,
    anime_title=sample_title,
    alpha=0.5,
    n=10,
)
hybrid_recs


# Create a New Test User

In [None]:
def create_test_user(anime_ratings, user_id=None):
    """
    Register a synthetic user by appending hand-picked ratings to the global rating_df.

    Titles are matched exactly against anime_df['name']; titles without a match
    are reported and skipped. Only rating_df is changed here - no model is
    refitted by this function.

    Parameters:
      - anime_ratings (dict): Maps anime title -> rating given by the test user.
      - user_id (int, optional): ID to use for the test user. When omitted, the
        ID is one above the largest user_id in rating_df (or 1 if rating_df is empty).

    Returns:
      - DataFrame: The rows that were appended (columns user_id, anime_id, rating),
        or None when none of the titles could be matched.
    """
    global rating_df  # rebound below to the extended frame

    # Pick the next free user_id unless the caller supplied one.
    if user_id is None:
        user_id = 1 if rating_df.empty else rating_df['user_id'].max() + 1

    collected = []
    for title, rating in anime_ratings.items():
        # anime_id values of every row whose name equals the requested title.
        matching_ids = anime_df.loc[anime_df['name'] == title, 'anime_id']
        if matching_ids.empty:
            print(f"Anime '{title}' not found in the dataset. Skipping...")
            continue

        # First match wins; every collected row carries the same user_id.
        collected.append({
            'user_id': user_id,
            'anime_id': matching_ids.values[0],
            'rating': rating
        })

    if len(collected) == 0:
        print("No valid anime found. No test user created.")
        return None

    new_ratings_df = pd.DataFrame(collected)

    # Extend the shared ratings table with the new rows.
    rating_df = pd.concat([rating_df, new_ratings_df], ignore_index=True)

    print(f"Created test user with user_id {user_id}")
    return new_ratings_df

In [None]:
# Example usage.
# Titles must match anime_df['name'] exactly; create_test_user reports and skips
# any title it cannot find.
# NOTE(review): the Reader in this notebook is declared with rating_scale=(1, 10),
# so the 0 ratings below fall outside that range - confirm whether 0 is intended.
test_anime_ratings = {
    "Naruto": 0,
    "One Piece": 0,
    "Attack on Titan": 8.5,
}

new_test_user_ratings = create_test_user(test_anime_ratings)
# Bare last expression: the DataFrame gets the notebook's rich table display
# instead of the plain-text print() output.
new_test_user_ratings

# Display Final Recommendations




In [None]:
# Show the output of each recommender under its own heading.
final_sections = [
    (
        "SVD (Collaborative Filtering) Recommendations for user_id {}:".format(sample_user),
        svd_recs,
    ),
    (
        "\nContent-Based Recommendations for anime '{}':".format(sample_title),
        content_recs,
    ),
    (
        "\nHybrid Recommendations for user_id {} with context '{}':".format(sample_user, sample_title),
        hybrid_recs,
    ),
]

for heading, table in final_sections:
    print(heading)
    display(table)


# Train Models and Save Them to Google Drive

In [None]:
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
import joblib

# Destination folder for the saved artefacts. No cell in this notebook mounts
# Google Drive, so check that the folder exists up front: failing here is much
# cheaper than failing in joblib.dump after the model has been retrained.
MODEL_DIR = Path('/content/drive/MyDrive')
if not MODEL_DIR.is_dir():
    raise FileNotFoundError(
        f"{MODEL_DIR} does not exist - mount Google Drive before running this cell."
    )

# Train SVD model on the current contents of rating_df (same setup as earlier in the notebook).
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(rating_df[['user_id', 'anime_id', 'rating']], reader)
trainset = data.build_full_trainset()
svd_model = SVD()
svd_model.fit(trainset)

# Compute TF-IDF matrix and cosine similarity (same setup as earlier in the notebook).
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Name -> row position lookup. Positions are stored because cosine_sim is
# addressed by position. Series.drop_duplicates() removes repeated *values*,
# which are all distinct here, so repeated names have to be filtered on the index.
indices = pd.Series(range(len(anime_df)), index=anime_df['name'])
indices = indices[~indices.index.duplicated(keep='first')]

# Save the trained models and components.
artifacts = {
    'svd_model.joblib': svd_model,
    'tfidf_vectorizer.joblib': tfidf,
    'cosine_sim_matrix.joblib': cosine_sim,
    'indices.joblib': indices,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, str(MODEL_DIR / filename))

print("Models and components saved successfully!")