Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
anime = pd.read_csv('anime.csv')
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
anime.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [5]:
anime.fillna(anime.mean(numeric_only=True),inplace=True)

In [6]:
anime.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,0
members,0


In [7]:
anime.fillna('Unknown',inplace=True)

In [8]:
anime.isna().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [9]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.017096,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


Feature Extraction

In [10]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

In [11]:
mlb = MultiLabelBinarizer()
ss = StandardScaler()

In [12]:
# converting genre to numeric
anime['genre_encoding'] = mlb.fit_transform(anime['genre'].str.split(',')).tolist()
# normalize rating
anime['rating_encoding'] = ss.fit_transform(anime['rating'].values.reshape(-1,1))

In [13]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_encoding,rating_encoding
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.847535
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",2.73938
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",2.729547
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.650889
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",2.641057


Recommendation System

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
# Function to recommend similar anime
def recommend_anime(target_anime,df,threshold=0.8):
    target_index = df[df['name'] == target_anime].index[0]
    similarity_scores = cosine_similarity(df['genre_encoding'].tolist())
    target_similarity = similarity_scores[target_index]
    recommended_indices = np.argsort(target_similarity)[::-1][1:]
    recommended_anime = []
    for idx in recommended_indices:
        if target_similarity[idx] >= threshold:
            recommended_anime.append(df.iloc[idx]['name'])
    return recommended_anime



In [57]:
print(recommend_anime('Gintama',anime))

['Gintama: Yorinuki Gintama-san on Theater 2D', 'Gintama°', 'Gintama&#039;', 'Gintama: Shinyaku Benizakura-hen', 'Gintama: Jump Festa 2014 Special', 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare', 'Gintama&#039;: Enchousen', 'Gintama Movie: Shinyaku Benizakura-hen', 'Gintama (2017)', 'Gintama: Nanigoto mo Saiyo ga Kanjin nano de Tasho Senobisuru Kurai ga Choudoyoi', 'Gintama: Jump Festa 2015 Special', 'Peace Maker Kurogane', 'Peace Maker Kurogane (Shinsaku)', 'Gintama: Shiroyasha Koutan', 'Bobobo-bo Bo-bobo Recap']


Evaluation

In [58]:
def evaluate_recommendations(true_anime_list,recommended_list):
    true_pos = len(set(true_anime_list) & (set(recommended_list)))
    precision = true_pos / len(recommended_list)
    recall = true_pos / len(true_anime_list)
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1

In [71]:
true_anime = ['Under World']
recommended_anime = recommend_anime('Under World',anime)

In [72]:
precision, recall, f1 = evaluate_recommendations(true_anime, recommended_anime)
print(f'Precision: {precision} \nRecall: {recall} \nF1-score: {f1}')

Precision: 0.0012165450121654502 
Recall: 1.0 
F1-score: 0.002430133657351154


In [69]:
true_anime = ['Naruto']
recommended_anime = recommend_anime('Naruto',anime)

In [70]:
precision, recall, f1 = evaluate_recommendations(true_anime, recommended_anime)
print(f'Precision: {precision} \nRecall: {recall} \nF1-score: {f1}')

Precision: 0.043478260869565216 
Recall: 1.0 
F1-score: 0.08333333333333333


 Collaborative Filtering

   Collaborative filtering is a method of making automatic predictions about a user's interests by collecting preferences from many users.
    
 - Collaborative filtering works by analyzing patterns in user-item interactions.
 - Collaborative filtering is widely used in recommendation systems like Netflix, Amazon, and Spotify, where it helps predict what users will like based on their past actions or the actions of others.

1) User-Based Collaborative Filtering

  User-Based collaborative filtering recommends items to a user by finding similar users who have similar preferences.

How it works:
 - It calculates the similarity between users based on their ratings or interactions with items. Common methods include cosine similarity or Pearson correlation.
 - Once similar users sre identified, the system recommends items that those similar users liked but that the target users has not yet interacted with.


Cons:
  - Finding similar users can be computationally expensive, especially in large datasets.
  - When a new user joins, the system has no prior data about them, so it struggles to find similar users.

2) Item-Based Collaborative Filtering

  Item-based collaborative filtering recommends items based on the similarity between items rather than users.

How it Works:
- It calculates the similarity between items based on how users have rated them. For instance, if two items are rated similarly by many users, they are considered similar.
- The system recommends items that are similar to those that the user has already liked or rated highly.


Cons:
- If items are sparsely rated (i.e., not many users have rated them), it may be hard to find reliable similarity scores between items.