In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv("E:\Data Science\Assignments\Recommendation System\Recommendation System\\anime.csv")

In [3]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.shape

(12294, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
# Dropping null values
df.dropna(inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [8]:
new_df = df[['anime_id', 'name','genre']]
new_df.head()

Unnamed: 0,anime_id,name,genre
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural"
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,Steins;Gate,"Sci-Fi, Thriller"
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."


In [9]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [10]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [11]:
new_df['genre'].apply(stem)

0                       drama, romance, school, supernatur
1        action, adventure, drama, fantasy, magic, mili...
2        action, comedy, historical, parody, samurai, s...
3                                         sci-fi, thriller
4        action, comedy, historical, parody, samurai, s...
                               ...                        
12289                                               hentai
12290                                               hentai
12291                                               hentai
12292                                               hentai
12293                                               hentai
Name: genre, Length: 12017, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

In [13]:
vectors = cv.fit_transform(new_df['genre'])

In [14]:
vectors = vectors.toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
cv.get_feature_names_out()

array(['action', 'adventure', 'ai', 'arts', 'cars', 'comedy', 'dementia',
       'demons', 'drama', 'ecchi', 'fantasy', 'fi', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'life', 'magic',
       'martial', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'power', 'psychological', 'romance', 'samurai', 'school',
       'sci', 'seinen', 'shoujo', 'shounen', 'slice', 'space', 'sports',
       'super', 'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype=object)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
similarity = cosine_similarity(vectors)

In [18]:
similarity[0]

array([1.        , 0.18898224, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [19]:
def recommend(movie):
    movie_index = new_df[new_df['name'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]][1])

In [20]:
recommend('Fullmetal Alchemist: Brotherhood')

Fullmetal Alchemist
Fullmetal Alchemist: The Sacred Star of Milos
Fullmetal Alchemist: Brotherhood Specials
Kkomaeosa Ttori
Magi: The Kingdom of Magic


 #### Can you explain the difference between user-based and item-based collaborative filtering?

 #### User-Based Collaborative Filtering
- Concept: Recommends items based on similarities between users.
- Process:
    Find users with similar preferences.
    Recommend items liked by these similar users that the target user hasn't interacted with.
- Advantages:
    Personalized recommendations.
    Effective with a large user base and rich interaction data.
- Disadvantages:
    Scalability issues with many users.
    Cold start problem for new users with little data.
#### Item-Based Collaborative Filtering
- Concept: Recommends items based on similarities between items.
- Process:
    Find items similar to those the user has already interacted with.
    Recommend these similar items.
- Advantages:
    More scalable as it focuses on item similarities.
    Better handles new users if they interact with a few items.
- Disadvantages:
    Less personalized compared to user-based methods.
    Depends on accurate item similarity calculations and interaction data.

#### What is collaborative filtering, and how does it work?

Collaborative filtering is a technique used in recommendation systems to suggest items based on past user interactions and preferences. It works by analyzing patterns in user behavior or item interactions:

- User-Based: Finds users with similar preferences and recommends items they liked.
- Item-Based: Identifies items similar to those a user has interacted with and recommends those.