In [1]:
import numpy as np
import pandas as pd

In [2]:
#loading the data
anime_df = pd.read_csv('anime.csv')  # Assuming anime.csv is in the current directory
anime_df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# Check for missing values in each column
print(anime_df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [4]:
# Handle missing values 
# For numerical columns - Fill with the mean
for col in anime_df.select_dtypes(include=np.number):
    anime_df[col] = anime_df[col].fillna(anime_df[col].mean())
    

In [5]:
# For categorical columns  - Fill with the mode
for col in anime_df.select_dtypes(exclude=np.number):
    anime_df[col] = anime_df[col].fillna(anime_df[col].mode()[0])

In [6]:
# Verify if there are any more missing values 
print(anime_df.isnull().sum())

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [7]:
# Explore the dataset (example)
print("first 5 rows are:",anime_df.head()) # Show first 5 rows
print("*******************************************************************")
print("information of data:",anime_df.info()) # Overview of column data types and non-null values
print("*******************************************************************")
print("description of data:",anime_df.describe()) # Summary statistics for numerical columns
print("*******************************************************************")
print("all unique values of data:",anime_df.nunique()) # Number of unique values in each colum

first 5 rows are:    anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
*******************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entr

In [8]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# One-hot encode the 'Genres' column
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  

encoded_genres = encoder.fit_transform(anime_df[['genre']])
encoded_genres_df = pd.DataFrame(encoded_genres, columns=encoder.get_feature_names_out(['genre']))
anime_df = pd.concat([anime_df, encoded_genres_df], axis=1)
anime_df = anime_df.drop('genre', axis=1)

In [9]:
# Replace 'Unknown' or other non-numeric entries with a default value (e.g., 0)
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', 0).astype(float)

# Normalize numerical features ('rating', 'episodes', 'members')
numerical_cols = ['rating', 'episodes', 'members']
scaler = MinMaxScaler()

# Fit and transform the numerical columns
anime_df[numerical_cols] = scaler.fit_transform(anime_df[numerical_cols])

# Display the first few rows of the transformed DataFrame
anime_df.head()


Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_Action,"genre_Action, Adventure","genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports",...,genre_Slice of Life,"genre_Slice of Life, Space","genre_Slice of Life, Supernatural",genre_Space,genre_Sports,"genre_Super Power, Supernatural, Vampire",genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi
0,32281,Kimi no Na wa.,Movie,0.00055,0.92437,0.197872,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,TV,0.035204,0.911164,0.78277,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28977,Gintama°,TV,0.028053,0.909964,0.112689,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,Steins;Gate,TV,0.013201,0.90036,0.664325,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9969,Gintama&#039;,TV,0.028053,0.89916,0.149186,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime_name, threshold=0.5):
  # Check if the target anime exists in the DataFrame
    if target_anime_name not in anime_df['name'].values:
        return []  # Return empty list if the anime is not found


    # Get the index of the target anime
    target_anime_index = anime_df[anime_df['name'] == target_anime_name].index[0]

    # Select relevant columns for similarity calculation (excluding 'Name' and 'Anime_id')
    similarity_features = anime_df.drop(columns=['name', 'anime_id', 'type'])

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(similarity_features)

    # Get similarity scores for the target anime
    similarity_scores = cosine_sim[target_anime_index]

    # Find anime with similarity scores above the threshold
    recommended_anime = []
    for i, score in enumerate(similarity_scores):
        if i != target_anime_index and score >= threshold:  # Exclude the target anime itself
            recommended_anime.append((anime_df['name'].iloc[i], score))

    # Sort recommendations by similarity score (descending)
    recommended_anime.sort(key=lambda x: x[1], reverse=True)

    return recommended_anime

In [11]:
# Example usage:
recommendations = recommend_anime('Death Note', threshold=0.6)  # Adjust threshold as needed
print(recommendations)

[('Death Note Rewrite', 0.8336396653024045), ('Shingeki no Kyojin', 0.6116679156394609), ('Fullmetal Alchemist: Brotherhood', 0.6031121694100268)]


In [12]:
recommendations = recommend_anime('Naruto', threshold=0.7)  # Adjust threshold as needed
recommendations

[('Naruto: Shippuuden', 0.9914319354223625),
 ('Naruto: Shippuuden Movie 4 - The Lost Tower', 0.9059029784925086),
 ('Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 0.9055639388984658),
 ('Boruto: Naruto the Movie', 0.9019725607847874),
 ('Naruto x UT', 0.8844921352423337),
 ('Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!',
  0.884106752583036),
 ('Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi',
  0.8821099886644828),
 ('Naruto Shippuuden: Sunny Side Battle', 0.8799870829283234)]

In [2]:
# Interview Questions
print("1. Difference between user-based and item-based collaborative filtering")
print("User-based collaborative filtering focuses on finding users with similar preferences, while item-based finds relationships between items. Item-based is generally more stable as user preferences change over time, but item relationships remain consistent.")
print("************************************************************************************************")
print("2. What is collaborative filtering and how does it work?")
print("Collaborative filtering is a recommendation system approach that predicts user preferences based on past behavior. It assumes users with similar preferences will like similar items and can be implemented using user-user or item-item relationships.")

1. Difference between user-based and item-based collaborative filtering
User-based collaborative filtering focuses on finding users with similar preferences, while item-based finds relationships between items. Item-based is generally more stable as user preferences change over time, but item relationships remain consistent.
************************************************************************************************
2. What is collaborative filtering and how does it work?
Collaborative filtering is a recommendation system approach that predicts user preferences based on past behavior. It assumes users with similar preferences will like similar items and can be implemented using user-user or item-item relationships.
