In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from src.mapk import *

#Data Preprocessing

In [38]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [39]:
# Чтение файлов
anime_ratings = pd.read_csv(INPUT_DIR + '/rating_complete.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["user_id", "anime_id","rating"]
                        )
anime_data = pd.read_csv(INPUT_DIR + '/anime.csv',
                        low_memory=False,
                        decimal=','
                        )

In [40]:
# Содержимое anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170,182126,131625,62330,20688,8904,3184,1357,741,1580
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,30043,49201,49505,22632,5805,1877,577,221,109,379
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229,75651,86142,49432,15376,5838,1965,664,316,533


In [41]:
from sklearn.model_selection import train_test_split

# (60% train, 40% test)
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.6, random_state=42)

# (50% train, 50% test)
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

In [42]:
# anime_ratings
len(anime_ratings)

23053311

In [43]:
# train_ratings
len(train_ratings)

17289983

In [44]:
# test_ratings
len(test_ratings)

17289984

In [45]:
# Cодержимое anime_ratings.csv
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
40273468,247449,18099,8
24842538,152911,1047,4
38953564,239265,14227,7


In [46]:
# Пользователь должен оценить минимум 500 аниме (train_ratings)
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings[train_ratings['user_id'].isin(ntrain_ratings[ntrain_ratings >= 500].index)].copy()
len(train_ratings)

749513

In [47]:
# Пользователь должен оценить минимум 500 аниме (test_ratings)
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(ntest_ratings[ntest_ratings >= 500].index)].copy()
len(test_ratings)

747721

In [48]:
# Удаление Duplicated Rows
train_ratings = train_ratings.drop_duplicates()
test_ratings = test_ratings.drop_duplicates()

In [49]:
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
50579508,310065,32900,7
1733703,10851,19023,5
16250982,99690,819,4


In [50]:
# Создание сводной таблицы (pivot table). 
# По горизонтали будут аниме, по вертикали - пользователи, значения - оценки
user_item_matrix_train = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,5.0,,8.0,,,,...,,9.0,,,,,,,,
5,,,8.0,,,,,,,8.0,...,,,,8.0,,,9.0,,,
6,,,,,,,,8.0,,,...,,,,5.0,,,,,,
7,,,8.0,,,7.0,,7.0,,,...,,,,,,,,,,
8,,,,,,,,6.0,,,...,,,,,,,,,,


In [51]:
# NaN преобразовываю в нули
user_item_matrix_train.fillna(0, inplace = True)
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Размерность матрицы "пользователи х anime"
user_item_matrix_train.shape

(16422, 1009)

In [53]:
# Преобразую разреженную матрицу в формат csr
# Метод values передаст функции csr_matrix только значения датафрейма
csr_data_train = csr_matrix(user_item_matrix_train.values)

In [54]:
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Сброшу индекс с помощью reset_index()
user_item_matrix_train = user_item_matrix_train.rename_axis(None, axis = 1).reset_index()
user_item_matrix_train.head()

Unnamed: 0,anime_id,781,890,1177,1397,1469,1946,3578,4773,5045,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
0,1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#MAPK TOP 10 Recommendations

In [56]:
train_ratings.info

<bound method DataFrame.info of           user_id  anime_id  rating
50579508   310065     32900       7
1733703     10851     19023       5
16250982    99690       819       4
35925526   220437     35069       2
21140073   130016     36198       6
...           ...       ...     ...
53564279   328195     35883       5
22097197   135624      3553       6
47296549   290134     37621       7
20430462   125646      4475       6
30745335   189037     35425       7

[749513 rows x 3 columns]>

In [57]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = anime_not_rated[:10] # использую топ-10 рекомендаций
    user_recommendations[user_id] = recommended_anime


In [58]:
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))


In [59]:
# Расчет MAPK@10
mapk_test_before_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_before_score)

MAPK@10 score:  0.09820214521452146


#Item Based Collaborative Filtering Recommendation System 

#Top 10 anime recommendations based on user preferences

In [60]:
# Функции, которая возвращает результаты по каждому поисковому аниме отдельно, 
# а также объединенный набор данных с 10 лучшими рекомендациями по всем поисковым аниме на основе наименьшего расстояния:
def get_recommendations(search_words, recommendations=10):
    result = {}
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    knn.fit(csr_data_train)
    all_recoms = []
    for word in search_words:
        anime_search = anime_data[anime_data['Name'].str.contains(word)]
        anime_id = anime_search.iloc[0]['anime_id']
        anime_id = user_item_matrix_train[user_item_matrix_train['anime_id'] == anime_id].index[0]
        distances, indices = knn.kneighbors(csr_data_train[anime_id], n_neighbors=recommendations + 1)
        indices_list = indices.squeeze().tolist()
        distances_list = distances.squeeze().tolist()
        indices_distances = list(zip(indices_list, distances_list))
        indices_distances_sorted = sorted(indices_distances, key=lambda x: x[1], reverse=False)
        indices_distances_sorted = indices_distances_sorted[1:]
        recom_list = []
        for ind_dist in indices_distances_sorted:
            matrix_movie_id = user_item_matrix_train.iloc[ind_dist[0]]['anime_id']
            id = anime_data[anime_data['anime_id'] == matrix_movie_id].index
            Name = anime_data.iloc[id]['Name'].values[0]
            dist = ind_dist[1]
            recom_list.append({'Name': Name, 'Distance': dist})
        recom_df = pd.DataFrame(recom_list, index=range(1, recommendations + 1))
        recom_df.drop(recom_df[recom_df['Name'].isin(search_words)].index, inplace=True)
        result[word] = recom_df
        all_recoms.extend(recom_list)
    all_recoms_df = pd.DataFrame(all_recoms)
    all_recoms_df.drop(all_recoms_df[all_recoms_df['Name'].isin(search_words)].index, inplace=True)
    all_recoms_df = all_recoms_df.sort_values(by=['Distance'], ascending=True).head(recommendations)
    return result, all_recoms_df


In [61]:
search_words = ['Bleach', 'Naruto']
recommendations = 10
results, top_all_recoms = get_recommendations(search_words, recommendations)


In [62]:
print(results['Bleach'])

                                 Name  Distance
1     Bleach: The Sealed Sword Frenzy  0.702827
2          Magi: The Kingdom of Magic  0.703215
3                     No Game No Life  0.705768
4   Ao no Exorcist: Kyoto Fujouou-hen  0.712984
5      Hentai Ouji to Warawanai Neko.  0.718021
6                         Mirai Nikki  0.718024
7        Bleach: Memories in the Rain  0.718626
8                  Ansatsu Kyoushitsu  0.719248
9                   Psycho-Pass Movie  0.719696
10                         To LOVE-Ru  0.720498


In [63]:
print(results['Naruto'])

                                                 Name  Distance
1                 Naruto: Shippuuden Movie 2 - Kizuna  0.681908
2                    Fullmetal Alchemist: Brotherhood  0.694093
3               Steins;Gate: Oukoubakko no Poriomania  0.695952
4                              Tonari no Kaibutsu-kun  0.708977
5                                            Baccano!  0.713177
6                               Angel Beats! Specials  0.713390
7                    Black Lagoon: The Second Barrage  0.714528
8        Highschool of the Dead: Drifters of the Dead  0.716419
9                             Neon Genesis Evangelion  0.718488
10  Yahari Ore no Seishun Love Comedy wa Machigatt...  0.721374


In [64]:
# Объединенный набор с 10 лучшими рекомендациями по всем поисковым аниме, не учитывающий выбранные аниме
print(top_all_recoms)

                                     Name  Distance
10    Naruto: Shippuuden Movie 2 - Kizuna  0.681908
11       Fullmetal Alchemist: Brotherhood  0.694093
12  Steins;Gate: Oukoubakko no Poriomania  0.695952
0         Bleach: The Sealed Sword Frenzy  0.702827
1              Magi: The Kingdom of Magic  0.703215
2                         No Game No Life  0.705768
13                 Tonari no Kaibutsu-kun  0.708977
3       Ao no Exorcist: Kyoto Fujouou-hen  0.712984
14                               Baccano!  0.713177
15                  Angel Beats! Specials  0.713390


#MAPK TOP 10 recommendations using the K-Nearest Neighbors algorithm (KNN)

In [65]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = []
    for word in search_words:
        recommended_anime.extend(results[word].index.tolist())
    recommended_anime = list(set(recommended_anime))
    user_recommendations[user_id] = recommended_anime[:10] # использую топ-10 рекомендаций
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))
# Расчет MAPK@10
mapk_test_after_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_after_score)


MAPK@10 score:  0.04166171617161716


#Comparison of MAPK

In [66]:
percent_increase = ((mapk_test_before_score - mapk_test_after_score) / mapk_test_after_score) * 100 if mapk_test_after_score != 0 else 100
print(f"mapk (топ 10 найденных с помощью KNN) на {percent_increase:.2f}% больше mapk (топ 10 наиболее популярных)")

mapk (топ 10 найденных с помощью KNN) на 135.71% больше mapk (топ 10 наиболее популярных)
