### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
df = pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
### summarize

df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
df_cleaned = df.dropna(subset=['genre', 'type', 'rating']).copy()

In [7]:
# Handle "Unknown" in episodes
df_cleaned['episodes'] = df_cleaned['episodes'].replace('Unknown', np.nan)
df_cleaned['episodes'] = df_cleaned['episodes'].astype(float)
df_cleaned['episodes'] = df_cleaned['episodes'].fillna(df_cleaned['episodes'].median())

### Feature Extraction

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [9]:
# Genre - TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df_cleaned['genre'])

In [10]:
# Type - One-hot encoding
type_dummies = pd.get_dummies(df_cleaned['type'])

In [11]:
# Scaling numerical features (rating, members, episodes)
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(df_cleaned[['rating', 'members', 'episodes']])

In [12]:
# Combine features

import scipy.sparse as sp

# Convert numerical and dummies to sparse matrix to combine with genre_matrix

type_sparse = sp.csr_matrix(type_dummies.values)
numerical_sparse = sp.csr_matrix(numerical_features)

combined_features = sp.hstack([genre_matrix, type_sparse, numerical_sparse])

In [13]:
# Function to get recommendation
def get_recommendations(title, df, features, n_recommendations=10):
    try:
        # Get index of the anime
        idx = df[df['name'].str.contains(title, case=False)].index[0]
        # Calculate cosine similarity for this specific anime against all others
        query_vec = features[idx]
        sim_scores = cosine_similarity(query_vec, features).flatten()
        
        # Get indices of most similar anime
        sim_indices = sim_scores.argsort()[-(n_recommendations+1):-1][::-1]
        
        # Return the results
        return df.iloc[sim_indices][['name', 'genre', 'type', 'rating']]
    except IndexError:
        return "Anime not found."

In [14]:
# Test with a popular anime
test_recommendations = get_recommendations("Kimi no Na wa.", df_cleaned, combined_features)
print("Recommendations for 'Kimi no Na wa.':")
print(test_recommendations)

Recommendations for 'Kimi no Na wa.':
                                                   name  \
1111              Aura: Maryuuin Kouga Saigo no Tatakai   
1494                                           Harmonie   
208                       Kokoro ga Sakebitagatterunda.   
1959                                          Air Movie   
2103                                      Clannad Movie   
60                                   Hotarubi no Mori e   
1697  Zutto Mae kara Suki deshita.: Kokuhaku Jikkou ...   
894                                    Momo e no Tegami   
25                        Suzumiya Haruhi no Shoushitsu   
5796                                   Taifuu no Noruda   

                                                  genre   type  rating  
1111       Comedy, Drama, Romance, School, Supernatural  Movie    7.67  
1494                        Drama, School, Supernatural  Movie    7.52  
208                              Drama, Romance, School  Movie    8.32  
1959                

In [15]:
# Also test another one
test_recommendations_2 = get_recommendations("Fullmetal Alchemist", df_cleaned, combined_features)
print("\nRecommendations for 'Fullmetal Alchemist':")
print(test_recommendations_2)


Recommendations for 'Fullmetal Alchemist':
                                                  name  \
200                                Fullmetal Alchemist   
288                                         Fairy Tail   
268                       Magi: The Labyrinth of Magic   
101                         Magi: The Kingdom of Magic   
255                                  Fairy Tail (2014)   
554         Gate: Jieitai Kanochi nite, Kaku Tatakaeri   
795                     Densetsu no Yuusha no Densetsu   
555  Gate: Jieitai Kanochi nite, Kaku Tatakaeri 2nd...   
374                                        Log Horizon   
290                        Magi: Sinbad no Bouken (TV)   

                                                 genre type  rating  
200  Action, Adventure, Comedy, Drama, Fantasy, Mag...   TV    8.33  
288  Action, Adventure, Comedy, Fantasy, Magic, Sho...   TV    8.22  
268         Action, Adventure, Fantasy, Magic, Shounen   TV    8.24  
101         Action, Adventure, Fantas

### Recommendation System

In [16]:
# Check distribution of similarity scores for a sample anime
query_vec = combined_features[0] # Kimi no Na wa
sim_scores = cosine_similarity(query_vec, combined_features).flatten()

In [17]:
print(f"Max score: {sim_scores.max()}")
print(f"Mean score: {sim_scores.mean()}")
print(f"95th percentile: {np.percentile(sim_scores, 95)}")
print(f"99th percentile: {np.percentile(sim_scores, 99)}")

Max score: 1.0
Mean score: 0.31885040281510313
95th percentile: 0.6683126319581995
99th percentile: 0.7626035603895942


In [18]:
# Experimenting with a threshold (e.g., 0.5)
threshold = 0.5
threshold_indices = np.where(sim_scores > threshold)[0]
print(f"Number of anime with similarity > {threshold}: {len(threshold_indices)}")

Number of anime with similarity > 0.5: 2274


### Interview Question