# Recommendation System

### Project Objective: - Implement a recommendation system using cosine similarity on an anime dataset for a Target anime_id

## Importing libraries

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load dataset

In [115]:
df = pd.read_csv('anime.csv')

## EDA

In [117]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [118]:
df.shape

(12294, 7)

In [119]:
df.duplicated().sum()

0

In [120]:
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [121]:
# Treating the missing values, we use median values to fill null values for 'rating' feature as it's values are in 'int' dtype and can roughly 
# be used in recommending animes while also using all other similarities. 

# Treating 'genre' missing values by generating a new 'Missing' category is appropriate rather than giving a wrong category to any anime.

# We are not treating 'type' as it is a non-significant feature and needs to be dropped for further calculations. 

In [122]:
df['rating'] = df['rating'].fillna(df['rating'].median())

In [123]:
df['genre'].fillna('Missing', inplace=True)

In [124]:
df.isna().sum()

anime_id     0
name         0
genre        0
type        25
episodes     0
rating       0
members      0
dtype: int64

In [125]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [126]:
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

## Feature Extraction

In [128]:
# Features like 'type', 'episodes', and 'members' might not directly contribute to the similarity between items in the way that user ratings 
# or genre would. Dropping these less significant features will help to reduce the dimensionality of the dataset, whcih will further help in 
# improving the efficiency of the recommendation system.

In [129]:
df.drop(columns=['type','episodes','members'], inplace=True)
df

Unnamed: 0,anime_id,name,genre,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16
...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,4.15
12290,5543,Under World,Hentai,4.28
12291,5621,Violence Gekiga David no Hoshi,Hentai,4.88
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,4.98


In [185]:
# Creating dummy variables for 'genre' feature.

In [130]:
genre_dummies = pd.get_dummies(df['genre'], prefix='genre', dtype=int)
genre_dummies

Unnamed: 0,genre_Action,"genre_Action, Adventure","genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports","genre_Action, Adventure, Cars, Sci-Fi","genre_Action, Adventure, Comedy","genre_Action, Adventure, Comedy, Demons, Drama, Ecchi, Horror, Mystery, Romance, Sci-Fi","genre_Action, Adventure, Comedy, Demons, Fantasy, Magic","genre_Action, Adventure, Comedy, Demons, Fantasy, Magic, Romance, Shounen, Supernatural","genre_Action, Adventure, Comedy, Demons, Fantasy, Martial Arts, Shounen, Super Power",...,genre_Slice of Life,"genre_Slice of Life, Space","genre_Slice of Life, Supernatural",genre_Space,genre_Sports,"genre_Super Power, Supernatural, Vampire",genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
# Concatenating both DataFrames

In [132]:
df = pd.concat([df, genre_dummies], axis=1)
df

Unnamed: 0,anime_id,name,genre,rating,genre_Action,"genre_Action, Adventure","genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports","genre_Action, Adventure, Cars, Sci-Fi","genre_Action, Adventure, Comedy",...,genre_Slice of Life,"genre_Slice of Life, Space","genre_Slice of Life, Supernatural",genre_Space,genre_Sports,"genre_Super Power, Supernatural, Vampire",genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,4.15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,Hentai,4.28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,Hentai,4.88,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,4.98,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
# Dropping the original 'genre' feature to avoid redundancy.

In [134]:
df.drop(columns=['genre'], axis=1, inplace=True)

In [135]:
df

Unnamed: 0,anime_id,name,rating,genre_Action,"genre_Action, Adventure","genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports","genre_Action, Adventure, Cars, Sci-Fi","genre_Action, Adventure, Comedy","genre_Action, Adventure, Comedy, Demons, Drama, Ecchi, Horror, Mystery, Romance, Sci-Fi",...,genre_Slice of Life,"genre_Slice of Life, Space","genre_Slice of Life, Supernatural",genre_Space,genre_Sports,"genre_Super Power, Supernatural, Vampire",genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi
0,32281,Kimi no Na wa.,9.37,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,9.26,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,9.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,9.17,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,9.16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,4.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,4.28,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,4.88,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,4.98,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
# Content Representation (using genre dummies and ratings)
feature_columns = genre_dummies.columns.tolist() + ['rating']

In [137]:
anime_features = df[feature_columns].values

In [138]:
anime_features

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 9.37],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 9.26],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 9.25],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 4.88],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 4.98],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 5.46]])

## Model Building

In [140]:
from sklearn.metrics.pairwise import cosine_similarity

In [141]:
# Similarity Calculation

In [142]:
similarity_matrix = cosine_similarity(anime_features)

In [143]:
similarity_matrix

array([[1.        , 0.98860532, 0.98859299, ..., 0.97411139, 0.97489269,
        0.97808412],
       [0.98860532, 1.        , 0.98845998, ..., 0.97398033, 0.97476152,
        0.97795252],
       [0.98859299, 0.98845998, 1.        , ..., 0.97396818, 0.97474936,
        0.97794033],
       ...,
       [0.97411139, 0.97398033, 0.97396818, ..., 1.        , 0.99999219,
        0.99977998],
       [0.97489269, 0.97476152, 0.97474936, ..., 0.99999219, 1.        ,
        0.99985508],
       [0.97808412, 0.97795252, 0.97794033, ..., 0.99977998, 0.99985508,
        1.        ]])

In [178]:
# Checking our results for anime id -> 5543

#### Target anime id = 5543

In [145]:
# User Profile Creation 
target_anime_id = 5543
user_profile_index = df[df['anime_id'] == target_anime_id].index[0]

In [146]:
user_profile = anime_features[user_profile_index]

In [147]:
user_profile

array([0.  , 0.  , 0.  , ..., 0.  , 0.  , 4.28])

In [148]:
# Recommendation Generation

In [149]:
similarity_scores = cosine_similarity(user_profile.reshape(1, -1), anime_features)

In [150]:
similarity_scores

array([[0.96827536, 0.96814508, 0.96813301, ..., 0.99962444, 0.99950833,
        0.99882968]])

In [151]:
recommended_anime_ids_indices = similarity_scores.argsort()[0][::-1]  # Sort by similarity

In [152]:
# Get the actual anime_id values using the indices
recommended_anime_ids = df['anime_id'].iloc[recommended_anime_ids_indices].values

In [153]:
recommended_anime_ids

array([ 5543,  9352,  5541, ...,  3287, 20007, 34476], dtype=int64)

In [154]:
top_n = 10  # Number of recommendations to retrieve
recommended_anime_ids = recommended_anime_ids[1:top_n + 1]

In [155]:
# Get Recommended Anime Names
recommended_anime_names = []
for anime_id in recommended_anime_ids:
    anime_name = df[df['anime_id'] == anime_id]['name'].values[0]  
    recommended_anime_names.append(anime_name)

In [156]:
# Display Recommendations with Names
for anime_id, anime_name in zip(recommended_anime_ids, recommended_anime_names):
    print(f"Recommended: {anime_name} (anime_id: {anime_id})")

Recommended: Tenshi no Habataki Jun (anime_id: 9352)
Recommended: The Satisfaction (anime_id: 5541)
Recommended: Hokenshitsu de Aimashou (anime_id: 9503)
Recommended: Toushindai My Lover: Minami tai Mecha-Minami (anime_id: 9316)
Recommended: Super Erotic Anime (anime_id: 26031)
Recommended: Lovely Series (anime_id: 14207)
Recommended: Milky Gal: Cats Ai (anime_id: 13959)
Recommended: Prima Donna Mai (anime_id: 6903)
Recommended: Sakura no Mori (anime_id: 9504)
Recommended: Original C-V-P Momoko (anime_id: 32713)


In [176]:
# Checking our results for another anime_id -> 5114

#### target_anime_id = 5114

In [158]:
target_anime_id = 5114
user_profile_index = df[df['anime_id'] == target_anime_id].index[0]

In [159]:
user_profile = anime_features[user_profile_index]

In [160]:
user_profile

array([0.  , 0.  , 0.  , ..., 0.  , 0.  , 9.26])

In [161]:
similarity_scores = cosine_similarity(user_profile.reshape(1, -1), anime_features)

In [162]:
similarity_scores

array([[0.98860532, 1.        , 0.98845998, ..., 0.97398033, 0.97476152,
        0.97795252]])

In [163]:
recommended_anime_ids_indices = similarity_scores.argsort()[0][::-1]

In [164]:
recommended_anime_ids = df['anime_id'].iloc[recommended_anime_ids_indices].values

In [165]:
recommended_anime_ids

array([ 5114, 33662, 30120, ...,  3287, 20007, 34476], dtype=int64)

In [166]:
top_n = 10  # Number of recommendations to retrieve
recommended_anime_ids = recommended_anime_ids[1:top_n + 1]

In [167]:
# Get Recommended Anime Names
recommended_anime_names = []
for anime_id in recommended_anime_ids:
    anime_name = df[df['anime_id'] == anime_id]['name'].values[0]  
    recommended_anime_names.append(anime_name)

In [168]:
for anime_id, anime_name in zip(recommended_anime_ids, recommended_anime_names):
    print(f"Recommended: {anime_name} (anime_id: {anime_id})")

Recommended: Taka no Tsume 8: Yoshida-kun no X-Files (anime_id: 33662)
Recommended: Spoon-hime no Swing Kitchen (anime_id: 30120)
Recommended: Mogura no Motoro (anime_id: 23005)
Recommended: Kimi no Na wa. (anime_id: 32281)
Recommended: Kahei no Umi (anime_id: 33607)
Recommended: Yakusoku: Africa Mizu to Midori (anime_id: 26313)
Recommended: Gintama° (anime_id: 28977)
Recommended: Steins;Gate (anime_id: 9253)
Recommended: Gintama&#039; (anime_id: 9969)
Recommended: Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou (anime_id: 32935)
