In [35]:
import pandas as pd

# Load the dataset
file_path = 'anime.csv'
anime_df = pd.read_csv(file_path)

# Display the first few rows and the column names exactly as read from the file
print("First few rows of the dataset:\n", anime_df.head())
print("Column names:", anime_df.columns)

# Normalize the column *names* to lowercase BEFORE any column is accessed by
# name, so the lookups below ('genre', 'rating') and in later cells ('name')
# do not depend on the capitalization used in the CSV header.
anime_df.columns = anime_df.columns.str.lower()

# Handle missing values: fill missing 'genre' with an empty string and
# missing 'rating' with the mean of the ratings that are present.
anime_df['genre'] = anime_df['genre'].fillna('')
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

# Verify the changes. DataFrame.info() prints its report itself and returns
# None, so it is called directly instead of being wrapped in print() (which
# would emit an extra "None" line).
anime_df.info()


First few rows of the dataset:
    anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
Column names: Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')
<class

In [36]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

def _to_genre_list(value):
    """Return the genre labels held in *value* as a list of non-empty strings.

    Accepts the raw comma-separated string from the CSV, an already-split
    list (so re-running this cell is harmless), or a missing value.
    """
    if isinstance(value, str):
        tokens = value.split(',')
    elif isinstance(value, (list, tuple, set)):
        tokens = value
    else:
        # NaN / None: no genre information
        tokens = []
    # Drop empty tokens: a missing genre was filled with '' earlier, and
    # ''.split(', ') would yield [''], which MultiLabelBinarizer would turn
    # into a spurious blank-named "genre" column.
    return [token.strip() for token in tokens
            if isinstance(token, str) and token.strip()]


# Extract genres by splitting the string into lists
anime_df['genre'] = anime_df['genre'].apply(_to_genre_list)

# Use MultiLabelBinarizer to convert genres to a binary format
# (one 0/1 column per distinct genre label)
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(anime_df['genre'])

# Combine genre features with ratings. The frame is given anime_df's index so
# the 'rating' assignment below aligns row-for-row with the genre matrix.
features = pd.DataFrame(genre_features, columns=mlb.classes_, index=anime_df.index)
features['rating'] = anime_df['rating']

# Check for any NaN values in the features DataFrame
print("NaN values in features before normalization:\n", features.isna().sum())

# Normalize ratings
scaler = MinMaxScaler()
features['rating'] = scaler.fit_transform(features[['rating']])

# Ensure no NaN values after normalization (assignment instead of
# inplace=True keeps the cell free of in-place mutation of `features`)
features = features.fillna(0)

# Verify no NaN values are present in the features DataFrame
print("NaN values in features after normalization:\n", features.isna().sum())


NaN values in features before normalization:
                  0
Action           0
Adventure        0
Cars             0
Comedy           0
Dementia         0
Demons           0
Drama            0
Ecchi            0
Fantasy          0
Game             0
Harem            0
Hentai           0
Historical       0
Horror           0
Josei            0
Kids             0
Magic            0
Martial Arts     0
Mecha            0
Military         0
Music            0
Mystery          0
Parody           0
Police           0
Psychological    0
Romance          0
Samurai          0
School           0
Sci-Fi           0
Seinen           0
Shoujo           0
Shoujo Ai        0
Shounen          0
Shounen Ai       0
Slice of Life    0
Space            0
Sports           0
Super Power      0
Supernatural     0
Thriller         0
Vampire          0
Yaoi             0
Yuri             0
rating           0
dtype: int64
NaN values in features after normalization:
                  0
Action           0
Adv

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between every pair of rows in `features`.
# NOTE(review): this cell carries an out-of-order execution count (In [32])
# and the following cell repeats the identical computation; it looks like a
# stale leftover - confirm and consider removing it so the notebook runs
# cleanly top to bottom.
cosine_sim = cosine_similarity(features)


In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity for the features.
# The result is a dense square matrix with one row and one column per row of
# `features`; entry [i, j] is the similarity between rows i and j.
# NOTE(review): at 12294 x 12294 entries this is on the order of 1.2 GB if
# stored as float64 - confirm this fits in memory on the target machine.
cosine_sim = cosine_similarity(features)

# Check the shape of the cosine similarity matrix (expected: n_rows x n_rows)
print("Shape of cosine similarity matrix:", cosine_sim.shape)


Shape of cosine similarity matrix: (12294, 12294)


In [54]:
def get_recommendations(title, cosine_sim=cosine_sim, anime_df=anime_df, top_n=10):
    """Return the titles most similar to *title* according to *cosine_sim*.

    Parameters
    ----------
    title : str
        Anime name to look up; matched case-insensitively against
        ``anime_df['name']``. If several rows match, the first one is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column *positions* correspond to
        the row positions of ``anime_df``. The default is bound when this
        function is defined, so re-run this cell after recomputing the matrix.
    anime_df : pandas.DataFrame
        Frame with a ``'name'`` column. The default is likewise bound at
        definition time.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The ``'name'`` values of the ``top_n`` most similar rows, most similar
        first, or the string ``"Title not found in the dataset"`` when no row
        matches (kept for backward compatibility with existing callers).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    title = title.lower()

    # Lower-case the name column once and reuse it for both the existence
    # check and the row lookup.
    names_lower = anime_df['name'].str.lower()

    # Work with row *positions*, not index labels: cosine_sim is addressed
    # positionally, and the result is selected with .iloc below.
    match_positions = (names_lower == title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return "Title not found in the dataset"
    idx = int(match_positions[0])

    # Rank every other row by similarity, highest first. The queried row is
    # excluded explicitly rather than by dropping the first sorted entry: a
    # tie at the top with a lower-positioned title would otherwise leave the
    # queried title in its own recommendations.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Positions of the top_n most similar rows
    anime_indices = [pos for pos, _ in ranked[:top_n]]

    # Return the corresponding titles
    return anime_df['name'].iloc[anime_indices]

# Example usage: the lookup is case-insensitive, so 'Naruto' matches the
# dataset entry regardless of capitalization. The result is either a Series
# of anime names or the "Title not found in the dataset" message string.
recommendations = get_recommendations('Naruto')
print("Recommendations for 'Naruto':\n", recommendations)


Recommendations for 'Naruto':
 615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
7628                              Kyutai Panic Adventure!
Name: name, dtype: object


Detailed Explanation:

Data Loading and Initial Preprocessing:

Load the dataset and display the first few rows and column names to understand its structure.

Handle missing values: fill missing 'genre' entries with an empty string and missing 'rating' entries with the mean rating.

Ensure column names are in lowercase for consistent referencing.

Verify the changes by displaying DataFrame information.

Feature Extraction and Preprocessing:

Split the 'genre' column into lists.

Convert genres into binary features using MultiLabelBinarizer.

Combine genre features with ratings and check for NaN values.

Normalize the ratings with MinMaxScaler, fill any remaining NaN values with 0, and verify that no NaN values remain.

Computing Cosine Similarity:

Compute cosine similarity for the features matrix and verify its shape.

Recommendation Function:

The function checks if the input title exists in the dataset and retrieves its index.

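It then looks up the precomputed cosine-similarity scores between that anime and every other anime in the dataset.
Finally, it sorts these scores in descending order and returns the top 10 most similar anime titles, excluding the input title itself.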



Interview Questions:

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering recommends items by finding similar users to the target user and suggesting items that those similar users liked.

Item-based collaborative filtering recommends items by finding similar items to the ones the target user has liked and suggesting these similar items to the user.

2. What is collaborative filtering, and how does it work?

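Collaborative filtering is a technique used in recommendation systems that predicts a user's interests from the preferences collected from many users. It works by identifying similarities between users or between items based on their past interactions (such as ratings) and using those similarities to make recommendations. Note that the recommender built in this notebook is content-based rather than collaborative: it compares anime by their genres and ratings and uses no per-user interaction data.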




