In [12]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# --------------------------------------
# Step 1: Load and Preprocess the Dataset
# --------------------------------------

# Load the dataset
anime_df = pd.read_csv(r'C:\Users\User\Downloads\Recommendation System\Recommendation System\anime.csv')  # Dataset file path

# Inspect the dataset
print("Initial Dataset Info:\n")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
anime_df.info()
print("\nDataset Preview:\n")
print(anime_df.head())

# Handle missing values
anime_df['genre'] = anime_df['genre'].fillna('')  # Replace NaN in genres with an empty string
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())  # Fill missing ratings with the mean
anime_df['members'] = anime_df['members'].fillna(anime_df['members'].mean())  # Fill missing members with the mean

# Split the comma-separated genre string into a list for multi-label encoding.
# An empty string must become an empty list: ''.split(', ') would yield [''],
# which the binarizer would treat as a real genre shared by every anime with a
# missing genre. Tokens are stripped so irregular spacing after commas does not
# create near-duplicate genre labels.
anime_df['genre'] = anime_df['genre'].apply(
    lambda x: [g.strip() for g in x.split(',') if g.strip()] if isinstance(x, str) else []
)

# --------------------------------------
# Step 2: Multi-Label Binarization for Genres
# --------------------------------------

# One-hot encode the per-anime genre lists: one 0/1 column per distinct genre.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_df['genre'])

# The fitted binarizer exposes the genre labels in the same order as the
# columns of the encoded matrix, so they can be used directly as column names.
genre_columns = mlb.classes_
genre_df = pd.DataFrame(data=genre_encoded, columns=genre_columns)

# Append the genre indicator columns to the right of the original columns.
anime_df = pd.concat((anime_df, genre_df), axis='columns')

# --------------------------------------
# Step 3: Normalize Numerical Features
# --------------------------------------

# Rescale 'rating' and 'members' with min-max scaling so that neither column
# dominates the similarity computation purely because of its magnitude.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(anime_df[['rating', 'members']])
anime_df['rating'] = scaled_values[:, 0]
anime_df['members'] = scaled_values[:, 1]

# Replace any NaN still present anywhere in the frame with 0.
anime_df = anime_df.fillna(0)

# --------------------------------------
# Step 4: Compute Cosine Similarity
# --------------------------------------

# Feature set for similarity: every genre indicator column plus the two
# normalized numeric columns.
features = [*genre_columns, 'rating', 'members']
anime_features = anime_df.loc[:, features]

# Pairwise cosine similarity between all rows; entry [i, j] compares anime i
# with anime j.
cosine_sim = cosine_similarity(anime_features)

# --------------------------------------
# Step 5: Recommendation Function
# --------------------------------------

def recommend_anime(anime_index, cosine_sim_matrix, anime_df, top_n=5):
    """
    Recommend the anime most similar to a target anime.

    Parameters:
    - anime_index: Positional (row) index of the target anime. Negative values
      count from the end, as with normal sequence indexing.
    - cosine_sim_matrix: Precomputed similarity matrix; row i holds the
      similarity of anime i to every anime, in the same row order as anime_df.
    - anime_df: DataFrame containing at least the 'name', 'genre', 'rating'
      and 'members' columns.
    - top_n: Maximum number of recommendations to return. Values <= 0 yield an
      empty result.

    Returns:
    - DataFrame with the 'name', 'genre', 'rating' and 'members' columns of the
      recommended anime, most similar first. The target anime is never
      included; fewer than top_n rows are returned if not enough other anime
      exist.
    """
    scores = cosine_sim_matrix[anime_index]
    n_items = len(scores)

    # Normalize a negative index so it can be compared with enumerated positions.
    target = anime_index + n_items if anime_index < 0 else anime_index

    # Exclude the target explicitly instead of dropping the first sorted entry:
    # when another anime ties with the target's self-similarity, the target is
    # not guaranteed to sort first, and it would otherwise recommend itself.
    candidates = (idx for idx in range(n_items) if idx != target)

    # sorted() is stable, so equally similar anime keep their dataset order.
    ranked = sorted(candidates, key=lambda idx: scores[idx], reverse=True)

    # Clamp top_n so a negative value cannot turn into a "from the end" slice.
    similar_anime_indices = ranked[:max(top_n, 0)]

    return anime_df.iloc[similar_anime_indices][['name', 'genre', 'rating', 'members']]

# --------------------------------------
# Example Usage
# --------------------------------------

# Titles column of the processed DataFrame, kept as a convenient handle.
anime_titles = anime_df['name']

# Pick the anime to base recommendations on (row 0 of the dataset) and how
# many recommendations to request.
target_anime_index = 0
top_n_recommendations = 5

# Look up the most similar anime for the chosen target.
recommendations = recommend_anime(
    anime_index=target_anime_index,
    cosine_sim_matrix=cosine_sim,
    anime_df=anime_df,
    top_n=top_n_recommendations,
)

# Show the target anime as a one-row table. Note that 'rating' and 'members'
# hold the normalized values at this point, not the raw dataset values.
target_anime = anime_df[['name', 'genre', 'rating', 'members']].iloc[target_anime_index]
print("\nTarget Anime Details:")
print(target_anime.to_frame().transpose())

# Show the recommendations as a table without the index column.
print("\nRecommended Anime:")
print(recommendations.to_string(index=False))


Initial Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None

Dataset Preview:

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  M