In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from src.mapk import *

#Data Preprocessing

In [2]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [3]:
# Чтение файлов
anime_ratings = pd.read_csv(INPUT_DIR + '/animelist.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["user_id", "anime_id","rating"]
                        )
anime_data = pd.read_csv(INPUT_DIR + '/anime.csv',
                        low_memory=False,
                        decimal=','
                        )

In [4]:
# Содержимое anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170,182126,131625,62330,20688,8904,3184,1357,741,1580
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,30043,49201,49505,22632,5805,1877,577,221,109,379
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229,75651,86142,49432,15376,5838,1965,664,316,533


In [5]:
from sklearn.model_selection import train_test_split

# (60% train, 40% test)
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.6, random_state=42)

# (50% train, 50% test)
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

# (50% test, 50% validation)
test_ratings, val_ratings = train_test_split(test_ratings, test_size=0.5, random_state=42)

In [6]:
# anime_ratings
len(anime_ratings)


43689898

In [7]:
# train_ratings
len(train_ratings)

32767424

In [8]:
# val_ratings
len(val_ratings)

16383713

In [9]:
# test_ratings
len(test_ratings)

16383712

In [10]:
# Cодержимое anime_ratings.csv
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
43745086,141897,38472,0
87109504,281586,21863,9
76431813,247610,37579,6


In [11]:
# Пользователь должен оценить минимум 500 аниме (train_ratings)
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings[train_ratings['user_id'].isin(ntrain_ratings[ntrain_ratings >= 500].index)].copy()
len(train_ratings)

3214180

In [12]:
# Пользователь должен оценить минимум 500 аниме (test_ratings)
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(ntest_ratings[ntest_ratings >= 500].index)].copy()
len(test_ratings)

450046

In [13]:
# Пользователь должен оценить минимум 500 аниме (val_ratings)
nval_ratings = val_ratings['user_id'].value_counts()
val_ratings = val_ratings[val_ratings['user_id'].isin(nval_ratings[nval_ratings >= 500].index)].copy()
len(val_ratings)

446409

In [14]:
# Удаление Duplicated Rows
train_ratings = train_ratings.drop_duplicates()
test_ratings = test_ratings.drop_duplicates()
val_ratings = val_ratings.drop_duplicates()

In [15]:
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
30237004,97997,21451,5
91207062,294714,22313,0
63772563,206632,22377,2


In [16]:
# Создание сводной таблицы (pivot table). 
# По горизонтали будут аниме, по вертикали - пользователи, значения - оценки
user_item_matrix_train = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_test = test_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_val = val_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')

user_item_matrix_train.head()

user_id,60,146,172,240,436,446,478,781,794,853,...,352660,352761,352811,352832,352856,352887,352922,352930,353098,353153
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,9.0,,0.0,,,,,,
5,0.0,,,,,,,,,,...,,7.0,,,,,,7.0,,9.0
6,,,,,,,,,0.0,,...,,10.0,,,,,,,,0.0
7,0.0,,,,,,,,,,...,0.0,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# NaN преобразовываю в нули
user_item_matrix_train.fillna(0, inplace = True)
user_item_matrix_test.fillna(0, inplace = True)
user_item_matrix_val.fillna(0, inplace = True)
user_item_matrix_train.head()

user_id,60,146,172,240,436,446,478,781,794,853,...,352660,352761,352811,352832,352856,352887,352922,352930,353098,353153
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,9.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Размерность матрицы "пользователи х anime"
user_item_matrix_train.shape

(17554, 4154)

In [19]:
# Преобразую разреженную матрицу в формат csr
# Метод values передаст функции csr_matrix только значения датафрейма
csr_data_train = csr_matrix(user_item_matrix_train.values)
csr_data_test = csr_matrix(user_item_matrix_test.values)
csr_data_tval = csr_matrix(user_item_matrix_val.values)


In [20]:
user_item_matrix_train.head()

user_id,60,146,172,240,436,446,478,781,794,853,...,352660,352761,352811,352832,352856,352887,352922,352930,353098,353153
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,9.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Сброшу индекс с помощью reset_index()
user_item_matrix_train = user_item_matrix_train.rename_axis(None, axis = 1).reset_index()
user_item_matrix_test = user_item_matrix_test.rename_axis(None, axis = 1).reset_index()
user_item_matrix_val = user_item_matrix_val.rename_axis(None, axis = 1).reset_index()

user_item_matrix_train.head()

Unnamed: 0,anime_id,60,146,172,240,436,446,478,781,794,...,352660,352761,352811,352832,352856,352887,352922,352930,353098,353153
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,9.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#MAPK TOP 10 Recommendations

In [22]:
train_ratings.info

<bound method DataFrame.info of           user_id  anime_id  rating
30237004    97997     21451       5
91207062   294714     22313       0
63772563   206632     22377       2
26690337    86492      3411       4
65371431   211572     16123       0
...           ...       ...     ...
88949353   287350     10928       0
29375064    95212     11005       0
18988729    61445     10863       8
2409693      8074       153       7
64529215   208886      9177       0

[3214180 rows x 3 columns]>

In [23]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = anime_not_rated[:10] # использую топ-10 рекомендаций
    user_recommendations[user_id] = recommended_anime


In [24]:
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))


In [25]:
# Расчет MAPK@10
mapk_test_before_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_before_score)

MAPK@10 score:  0.058339535932786374


#Item Based Collaborative Filtering Recommendation System 

#Top 10 anime recommendations based on user preferences

In [26]:
# Функции, которая возвращает результаты по каждому поисковому аниме отдельно, 
# а также объединенный набор данных с 10 лучшими рекомендациями по всем поисковым аниме на основе наименьшего расстояния:
def get_recommendations(search_words, recommendations=10):
    result = {}
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    knn.fit(csr_data_train)
    all_recoms = []
    for word in search_words:
        anime_search = anime_data[anime_data['Name'].str.contains(word)]
        anime_id = anime_search.iloc[0]['anime_id']
        anime_id = user_item_matrix_train[user_item_matrix_train['anime_id'] == anime_id].index[0]
        distances, indices = knn.kneighbors(csr_data_train[anime_id], n_neighbors=recommendations + 1)
        indices_list = indices.squeeze().tolist()
        distances_list = distances.squeeze().tolist()
        indices_distances = list(zip(indices_list, distances_list))
        indices_distances_sorted = sorted(indices_distances, key=lambda x: x[1], reverse=False)
        indices_distances_sorted = indices_distances_sorted[1:]
        recom_list = []
        for ind_dist in indices_distances_sorted:
            matrix_movie_id = user_item_matrix_train.iloc[ind_dist[0]]['anime_id']
            id = anime_data[anime_data['anime_id'] == matrix_movie_id].index
            Name = anime_data.iloc[id]['Name'].values[0]
            dist = ind_dist[1]
            recom_list.append({'Name': Name, 'Distance': dist})
        recom_df = pd.DataFrame(recom_list, index=range(1, recommendations + 1))
        recom_df.drop(recom_df[recom_df['Name'].isin(search_words)].index, inplace=True)
        result[word] = recom_df
        all_recoms.extend(recom_list)
    all_recoms_df = pd.DataFrame(all_recoms)
    all_recoms_df.drop(all_recoms_df[all_recoms_df['Name'].isin(search_words)].index, inplace=True)
    all_recoms_df = all_recoms_df.sort_values(by=['Distance'], ascending=True).head(recommendations)
    return result, all_recoms_df


In [27]:
search_words = ['Bleach', 'Naruto']
recommendations = 10
results, top_all_recoms = get_recommendations(search_words, recommendations)


In [28]:
print(results['Bleach'])

                                                 Name  Distance
2                                          Soul Eater  0.753275
3   Bleach Movie 3: Fade to Black - Kimi no Na wo ...  0.757773
4                                    Sword Art Online  0.757779
5                                        Angel Beats!  0.759274
6                                           One Piece  0.761856
7                                 Fullmetal Alchemist  0.762074
8                                Fate/Zero 2nd Season  0.762632
9                    Fullmetal Alchemist: Brotherhood  0.763034
10                                        Mirai Nikki  0.763049


In [29]:
print(results['Naruto'])

                                Name  Distance
1                 Naruto: Shippuuden  0.733290
3                         Durarara!!  0.744389
4                        Mirai Nikki  0.745250
5                Fullmetal Alchemist  0.751069
6                         Fairy Tail  0.753146
7   Fullmetal Alchemist: Brotherhood  0.754298
8                          One Piece  0.755778
9                     Ao no Exorcist  0.758908
10                  Sword Art Online  0.759420


In [30]:
# Объединенный набор с 10 лучшими рекомендациями по всем поисковым аниме, не учитывающий выбранные аниме
print(top_all_recoms)

                                                 Name  Distance
10                                 Naruto: Shippuuden  0.733290
12                                         Durarara!!  0.744389
13                                        Mirai Nikki  0.745250
14                                Fullmetal Alchemist  0.751069
15                                         Fairy Tail  0.753146
1                                          Soul Eater  0.753275
16                   Fullmetal Alchemist: Brotherhood  0.754298
17                                          One Piece  0.755778
2   Bleach Movie 3: Fade to Black - Kimi no Na wo ...  0.757773
3                                    Sword Art Online  0.757779


#MAPK TOP 10 recommendations using the K-Nearest Neighbors algorithm (KNN)

In [31]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = []
    for word in search_words:
        recommended_anime.extend(results[word].index.tolist())
    recommended_anime = list(set(recommended_anime))
    user_recommendations[user_id] = recommended_anime[:10] # использую топ-10 рекомендаций
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))
# Расчет MAPK@10
mapk_test_after_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_after_score)


MAPK@10 score:  0.019129662522202485


#Comparison of MAPK

In [32]:
percent_increase = ((mapk_test_before_score - mapk_test_after_score) / mapk_test_after_score) * 100 if mapk_test_after_score != 0 else 100
print(f"mapk_train_score на {percent_increase:.2f}% больше mapk_test_score.")


mapk_train_score на 204.97% больше mapk_test_score.
