In [1]:
#pip install scikit-surprise

In [2]:
#pip install --upgrade --force-reinstall numpy==1.26.4

In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy

# Kaggle input locations of the two CSV files this notebook reads.
DATA_DIR = '/kaggle/input/anime-recommendations-database'
path_to_df = DATA_DIR + '/anime.csv'
path_to_rating_df = DATA_DIR + '/rating.csv'

In [4]:
# Read the anime metadata and the user ratings into two DataFrames.
df, rating_df = (
    pd.read_csv(csv_path) for csv_path in (path_to_df, path_to_rating_df)
)

In [5]:
# Preview the anime metadata table (the notebook shows a truncated view).
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [6]:
# Preview the user ratings table; note the -1 values in the rating column.
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [7]:
# Column dtypes, non-null counts and memory usage of both raw tables.
df.info()
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


# Data preparation

In [8]:
# Keep only the titles that have a genre value.
df = df[df['genre'].notna()]

In [9]:
# Replace missing 'type' values with the most frequent type.
# Built with assign() instead of fillna(..., inplace=True) on a column of a
# filtered frame: the in-place form is chained assignment, which triggers
# SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
df = df.assign(type=df['type'].fillna(df['type'].mode()[0]))

In [10]:
# Keep the first row of every title name and renumber the index from 0,
# so that index labels and row positions coincide.
df = df.drop_duplicates(subset='name', ignore_index=True)

In [11]:
# Keep only the ratings whose anime is still present in the cleaned df.
known_anime = rating_df['anime_id'].isin(df['anime_id'])
rating_df = rating_df.loc[known_anime].reset_index(drop=True)

I don't use `rating` or `members`, so let's leave them untouched.

In [12]:
# Re-check non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12230 entries, 0 to 12229
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12230 non-null  int64  
 1   name      12230 non-null  object 
 2   genre     12230 non-null  object 
 3   type      12230 non-null  object 
 4   episodes  12230 non-null  object 
 5   rating    12015 non-null  float64
 6   members   12230 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 669.0+ KB


Rows with a rating of -1 carry no rating information for training, but we can use them for validation.

In [13]:
# Rows with rating == -1 become the validation pairs (user_id, anime_id);
# every other row stays in rating_df.
is_unrated = rating_df['rating'] == -1
valid_data = rating_df.loc[is_unrated, ['user_id', 'anime_id']]
rating_df = rating_df[~is_unrated]

Remove all users who gave the same rating to everything they watched.

In [14]:
# Per-user minimum and maximum rating; a user whose min equals max gave
# one single score to everything they rated.
user_rating_stats = rating_df.groupby("user_id")["rating"].agg(["min", "max"])
gave_single_score = user_rating_stats["min"].eq(user_rating_stats["max"])
users_to_remove = user_rating_stats.index[gave_single_score.to_numpy()]

# Drop those users from both the rated data and the validation pairs.
rating_df, valid_data = (
    frame[~frame["user_id"].isin(users_to_remove)]
    for frame in (rating_df, valid_data)
)
print(f"Deleted {len(users_to_remove)} users")

Deleted 5476 users


In [15]:
# Size of the validation set (user/anime pairs without a rating).
valid_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1328120 entries, 246 to 7813546
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1328120 non-null  int64
 1   anime_id  1328120 non-null  int64
dtypes: int64(2)
memory usage: 30.4 MB


In [16]:
# Size of the rated data that remains for training and testing.
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6304679 entries, 156 to 7813612
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 192.4 MB


# Collaborative Item-Based Filtering

## Train/test

In [17]:
# The rating scale is taken from the observed minimum and maximum rating.
rating_bounds = (rating_df['rating'].min(), rating_df['rating'].max())
reader = Reader(rating_scale=rating_bounds)

# Surprise expects the columns in the order user, item, rating.
rating_columns = ['user_id', 'anime_id', 'rating']
rating_data = Dataset.load_from_df(rating_df[rating_columns], reader)

In [18]:
# Hold out 20% of the ratings for testing; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(
    rating_data,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Item-based k-NN (user_based=False compares items rather than users).
# The former n_jobs=-1 argument was dropped: KNNBasic has no such parameter,
# it was swallowed by **kwargs and had no effect on the fit.
algo_item_based = KNNBasic(sim_options={'user_based': False})
algo_item_based.fit(trainset)

# Evaluate on the held-out ratings; accuracy.rmse prints and returns the RMSE.
predictions_item_based = algo_item_based.test(testset)
rmse_item_based = accuracy.rmse(predictions_item_based)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2133


## Validation

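This is not just a mean fill for anime that users watched but did not rate: each value is the model's predicted rating for that specific user/anime pair. When the model cannot score a pair, Surprise falls back to its default prediction (the global mean rating), which may explain rows that share the same value.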

In [20]:
# Predict a rating for every (user, anime) pair in the validation set.
predictions = [
    algo_item_based.predict(uid, iid)
    for uid, iid in zip(valid_data['user_id'], valid_data['anime_id'])
]

# assign() returns a new frame; writing the column directly into valid_data
# would be chained assignment because valid_data is a filtered slice of rating_df.
valid_data = valid_data.assign(rating=[pred.est for pred in predictions])
print(valid_data.head())

     user_id  anime_id    rating
246        3     30276  8.591805
247        3     30503  8.462386
250        4         6  7.798716
251        4        72  7.798716
252        4       121  7.798716


# Content Based Filtering

Filtering based only on genre is enough for this task.

In [21]:
# TF-IDF vectors of the genre strings, one row per title in df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

# Pairwise dot products of the genre vectors (Y defaults to X),
# giving a titles x titles similarity matrix.
cosine_sim = linear_kernel(tfidf_matrix)

# Hybrid Filtering

A small check that catches invalid inputs.

In [22]:
def info_check(user_id, movie_title, n_outputs, filtering_type='hybrid'):
    '''
    Validate the arguments of a recommendation request.

    The checks run in this order and the first failure is reported with print():
      1. user_id must occur in rating_df['user_id'];
      2. movie_title must occur in df['name'];
      3. filtering_type must be 'collab', 'content' or 'hybrid';
      4. n_outputs must be at least 1 (a zero or negative value would make the
         later [:n_outputs] slice return nothing or cut from the end);
      5. n_outputs must not exceed the number of titles in df.

    Returns 0 when a check fails and None when all checks pass
    (callers compare the result with 0).
    '''
    filters = ('collab', 'content', 'hybrid')
    n_titles = len(df['anime_id'])

    if user_id not in rating_df['user_id'].values:
        print(f"{user_id} is a invalid user id")
        return 0
    if movie_title not in df['name'].values:
        print(f"{movie_title} is a invalid name")
        return 0
    if filtering_type not in filters:
        print(f"{filtering_type} is not a filtering type (try collab, content, hybrid) ")
        return 0
    if n_outputs < 1:
        print(f"{n_outputs} is not a valid number of outputs (must be at least 1)")
        return 0
    if n_outputs > n_titles:
        print(f"{n_outputs} is bigger than number of titles({n_titles} )")
        return 0

    # All checks passed; success is signalled by None, as before.
    return None

There are three functions. `hybrid_filtering` is the main one and uses the other two internally; they can also be used through it by choosing the `filtering_type`. It might have been more correct to add a fourth function and do the final filtering there, but I think it is fine as it is. The docstrings describe each function in more detail.

In [23]:
def collab_filtering(user_id):
    '''
    Collaborative scores for one user over every title in df.

    Calls algo_item_based.predict(user_id, anime_id) for each anime_id in
    df['anime_id'] and keeps the estimated rating (.est).

    Returns an unsorted list [(anime_id, score), ...] in df row order.
    '''
    # Pair each id with its prediction directly; no positional counter or
    # per-row df.iloc lookup is needed to recover the id afterwards.
    return [
        (iid, algo_item_based.predict(user_id, iid).est)
        for iid in df['anime_id']
    ]


def content_filtering(user_id, movie_title):
    '''
    Content-based scores of every title in df relative to movie_title.

    user_id is accepted for a uniform call signature but is not used here.
    movie_title must be a value of df['name']; the first matching row is used
    and an IndexError is raised when there is no match.

    Returns an unsorted list [(anime_id, score), ...] in df row order, where
    score is the cosine_sim entry multiplied by 10.
    '''
    # Row position (not index label) of the title, so the lookup into
    # cosine_sim stays correct even if df does not carry a 0..n-1 index.
    idx = np.flatnonzero((df['name'] == movie_title).to_numpy())[0]

    # To match the collaborative metric, rescale the similarity by a factor of 10.
    scaled_scores = cosine_sim[idx] * 10

    return list(zip(df['anime_id'], scaled_scores))


def hybrid_filtering(user_id, movie_title, filtering_type='hybrid', n_outputs=5, w_col=0.8, w_cont=0.2):
    '''
    Recommend titles for a user, given a reference title.

    Parameters:
      user_id, movie_title - passed on to collab_filtering / content_filtering.
      filtering_type       - 'hybrid' (weighted blend), 'collab' or 'content'.
      n_outputs            - how many recommendations to return.
      w_col, w_cont        - weights of the collaborative and the content score
                             in the blend (only used when filtering_type='hybrid').

    The inputs are validated with info_check first; on failure an empty list is
    returned. The reference title itself is never recommended.

    Prints the recommendations and returns them sorted by descending score as
    [(anime_id, name, score), ...].
    '''
    if info_check(user_id, movie_title, n_outputs, filtering_type) == 0:
        return []

    # Run only the scorers the requested mode needs: the collaborative pass
    # predicts a rating for every title, so it is skipped for 'content'.
    if filtering_type == 'hybrid':
        collab_recs = collab_filtering(user_id)
        content_recs = content_filtering(user_id, movie_title)
        # Both lists follow df row order, so they are blended pairwise.
        filtering_data = [
            (anime_id, collab_score * w_col + content_score * w_cont)
            for (anime_id, collab_score), (_, content_score) in zip(collab_recs, content_recs)
        ]
    elif filtering_type == 'collab':
        filtering_data = collab_filtering(user_id)
    else:
        filtering_data = content_filtering(user_id, movie_title)

    # Remove the reference title from the candidates.
    input_id = df.loc[df['name'] == movie_title, 'anime_id'].values[0]
    candidates = [rec for rec in filtering_data if rec[0] != input_id]

    # Highest score first; keep only the first n_outputs.
    top_recs = sorted(candidates, key=lambda rec: rec[1], reverse=True)[:n_outputs]

    # Attach the title name to every recommended id.
    recommendations = [
        (anime_id, df.loc[df['anime_id'] == anime_id, 'name'].values[0], score)
        for anime_id, score in top_recs
    ]

    # Small printed report for easier use.
    print(f"{n_outputs} {filtering_type} recommendations for user id {user_id} and anime '{movie_title}':")
    for anime_id, name, score in recommendations:
        print(f"- {name} (score: {score:.2f})")

    print('---'*20)

    return recommendations

Now we can test the function under different circumstances, changing inputs.

In [24]:
user_id = [7, 5, 3, 2, 42635]
movie_title = ['One Punch Man', 'One Punch Man', 'One Punch Man','Kimi no Na wa.','Toushindai My Lover: Minami tai Mecha-Minami']
filtering_type=['hybrid','hybrid','collab','content','content']
n_outputs=[5,5,3,5,1]

for i in range(len(user_id)):
    hybrid_filtering(user_id[i], movie_title[i], filtering_type[i], n_outputs[i])

5 hybrid recommendations for user id 7 and anime 'One Punch Man':
- One Punch Man 2 (score: 8.24)
- Kawasaki Frontale x Tentai Senshi Sunred (score: 8.04)
- Hulu Xiongdi (score: 8.00)
- One Punch Man: Road to Hero (score: 7.94)
- Kawasaki Frontale x Tentai Senshi Sunred 2nd Season (score: 7.94)
------------------------------------------------------------
5 hybrid recommendations for user id 5 and anime 'One Punch Man':
- One Punch Man 2 (score: 8.24)
- Himitsukessha Taka no Tsume Countdown (score: 7.71)
- Himitsukessha Taka no Tsume DO (score: 7.71)
- Himitsukessha Taka no Tsume EX (score: 7.71)
- Himitsukessha Taka no Tsume GT (score: 7.71)
------------------------------------------------------------
3 collab recommendations for user id 3 and anime 'One Punch Man':
- Jungle wa Itsumo Hare nochi Guu: Eizou Tokuten - Maboroshi no Pilot Film (score: 10.00)
- 15 Sonyeon Uju Pyoryugi (score: 10.00)
- Sore Ike! Anpanman: Baikinman no Gyakushuu (score: 10.00)
--------------------------------

We can also call it with fewer arguments and rely on the defaults.

In [25]:
filtered_recs = hybrid_filtering(123, 'One Punch Man 2')

5 hybrid recommendations for user id 123 and anime 'One Punch Man 2':
- One Punch Man (score: 9.09)
- Uchuu Kyoudai: Number Zero (score: 8.45)
- Code Geass: Hangyaku no Lelouch R2 (score: 8.33)
- Hunter x Hunter (2011) (score: 8.29)
- Tengen Toppa Gurren Lagann Movie: Lagann-hen (score: 8.28)
------------------------------------------------------------
