In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from src.mapk import *

# Data Preprocessing

In [2]:
# Input files
# Directory that holds animelist.csv and anime.csv. It can be overridden with
# the ANIME_DATASET_DIR environment variable; the previous hard-coded location
# stays as the fallback so existing setups keep working unchanged.
INPUT_DIR = os.environ.get('ANIME_DATASET_DIR', 'C:/Dataset')

In [3]:
# Read the input files from INPUT_DIR.
# NOTE(review): decimal=',' is passed for both files while the previewed Score
# values use '.', so non-integer columns may not be parsed as floats - confirm.

# Ratings: only the three columns used by the rest of the notebook are loaded.
anime_ratings = pd.read_csv(
    INPUT_DIR + '/animelist.csv',
    usecols=["user_id", "anime_id", "rating"],
    decimal=',',
    low_memory=False,
)

# Anime catalogue: all columns are kept.
anime_data = pd.read_csv(
    INPUT_DIR + '/anime.csv',
    decimal=',',
    low_memory=False,
)

In [4]:
# Preview the first three rows loaded from anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170,182126,131625,62330,20688,8904,3184,1357,741,1580
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,30043,49201,49505,22632,5805,1877,577,221,109,379
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229,75651,86142,49432,15376,5838,1965,664,316,533


In [5]:
from sklearn.model_selection import train_test_split

# First split: 60% of all rows stay bound to `anime_ratings`; the remaining
# 40% is bound to the name `train_ratings`.
# NOTE(review): the 60% part is not used as training data in the cells shown
# here - confirm that naming the 40% part `train_ratings` is intended.
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.4, random_state=42)

# Second split: that 40% part is halved, so `train_ratings` and `test_ratings`
# each hold about 20% of all rows.
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

# Third split: the 20% test part is halved again into `test_ratings` and
# `val_ratings` (about 10% of all rows each).
test_ratings, val_ratings = train_test_split(test_ratings, test_size=0.5, random_state=42)

In [6]:
# Number of rows left in anime_ratings (the 60% part of the first split)
len(anime_ratings)


65534848

In [7]:
# Number of rows in train_ratings
len(train_ratings)

21844949

In [8]:
# Number of rows in val_ratings
len(val_ratings)

10922475

In [9]:
# Number of rows in test_ratings
len(test_ratings)

10922475

In [10]:
# Preview the first three rows of anime_ratings
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
70013597,226554,37675,0
91229590,294769,889,8
51336492,166388,24439,8


In [11]:
# Keep only users that have at least 500 rows in train_ratings
ntrain_ratings = train_ratings['user_id'].value_counts()
train_has_enough = ntrain_ratings >= 500
train_users_kept = ntrain_ratings[train_has_enough].index
train_row_mask = train_ratings['user_id'].isin(train_users_kept)
train_ratings = train_ratings[train_row_mask].copy()
len(train_ratings)

1012937

In [12]:
# Keep only users that have at least 500 rows in test_ratings
ntest_ratings = test_ratings['user_id'].value_counts()
test_has_enough = ntest_ratings >= 500
test_users_kept = ntest_ratings[test_has_enough].index
test_row_mask = test_ratings['user_id'].isin(test_users_kept)
test_ratings = test_ratings[test_row_mask].copy()
len(test_ratings)

153948

In [13]:
# Baseline recommendations: for every test user, pick 10 anime that occur in
# train_ratings but for which this user has no row there.
# The candidates come from an unordered set, so these are 10 arbitrary
# not-yet-seen titles rather than a ranked top-10.
user_recommendations = {}

# Loop-invariant: every anime_id that occurs in train_ratings.
all_train_anime = set(train_ratings['anime_id'].unique())

# One pass over train_ratings instead of a full scan per user:
# user_id -> set of anime_id the user has in train_ratings.
seen_by_user = {
    uid: set(ids.unique())
    for uid, ids in train_ratings.groupby('user_id')['anime_id']
}

for user_id in test_ratings['user_id'].unique():
    # Users without any train rows get an empty "seen" set.
    anime_not_rated = list(all_train_anime - seen_by_user.get(user_id, set()))
    recommended_anime = anime_not_rated[:10]  # keep 10 recommendations
    user_recommendations[user_id] = recommended_anime


In [14]:
# Ground truth for every test user: user_id -> list of anime_id the user has
# in test_ratings (row order preserved).
# A single groupby pass replaces one full-frame scan per user; sort=False
# keeps users in order of first appearance, matching unique().
actual_ratings = {
    uid: list(ids)
    for uid, ids in test_ratings.groupby('user_id', sort=False)['anime_id']
}

In [15]:
# Align recommended and actual items as two parallel lists, one entry per
# user, following the key order of user_recommendations.
recommended_ratings = list(user_recommendations.values())
actual_ratings_list = [actual_ratings[uid] for uid in user_recommendations]

In [16]:
# Score the baseline recommendations against the test-set items with mapk
# (imported from src.mapk) at k=10
mapk_test_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_score)

MAPK@10 score:  0.037910192147034255


---

---

In [17]:
# Remove fully duplicated rows from train_ratings
train_ratings = train_ratings.drop_duplicates()


In [18]:
# Preview the first three rows of the filtered, de-duplicated train_ratings
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
81856148,264980,19111,9
25511137,82737,30740,7
17186823,55748,9201,8


In [19]:
# Build a pivot table from train_ratings.
# Rows (index) are anime_id, columns are user_id, cell values are ratings;
# (anime, user) pairs without a row in train_ratings become NaN.
user_item_matrix = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,,,5.0,,,,,8.0,,...,,,,0.0,,,,,,
5,,,,,,,,,,,...,8.0,,8.0,0.0,8.0,8.0,,,,
6,,,,,,,9.0,,8.0,,...,,,,,,,,,9.0,
7,,,7.0,,0.0,,,,,,...,,,,,,,,,,
8,,,,,,,,,6.0,,...,,,,,0.0,,,,,


In [20]:
# Replace the NaN cells (no rating row) with 0.
# After this step a missing rating and a stored rating of 0 look the same.
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Shape of the matrix: (number of anime, number of users)
user_item_matrix.shape

(17544, 1294)

In [22]:
# Convert the mostly-zero matrix to SciPy's CSR sparse format.
# .values hands csr_matrix only the cell values (no index or column labels),
# so row i of csr_data is row i of user_item_matrix (one row per anime).
csr_data = csr_matrix(user_item_matrix.values)

In [23]:
# Preview the matrix again before its index is reset
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Drop the 'user_id' name of the column axis and reset the index:
# anime_id becomes a regular column and rows get a fresh default index.
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()
user_item_matrix.head()

Unnamed: 0,anime_id,781,890,1397,1469,3021,3160,3578,4132,4773,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
0,1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
3,7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Item-Based Collaborative Filtering Recommendation System

---

In [25]:
def get_recommendations(search_words, recommendations=10):
    """Find the anime most similar to each searched title (item-based k-NN).

    A brute-force cosine k-NN model is fitted on the rows of the module-level
    ``csr_data`` matrix. For every entry of ``search_words`` the query is the
    first title in ``anime_data`` whose ``Name`` matches and that also has a
    row in ``user_item_matrix``.

    Args:
        search_words: Iterable of patterns matched against
            ``anime_data['Name']`` with ``Series.str.contains`` (pandas
            interprets them as regular expressions by default).
        recommendations: Maximum number of neighbours returned per word.

    Returns:
        dict mapping each search word to a DataFrame with the columns
        ``Name`` and ``Distance`` (cosine distance, ascending), indexed
        from 1.

    Raises:
        IndexError: If no title matching a word has a row in the matrix.
    """
    # n_neighbors is supplied per query below, so it is not set here.
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
    knn.fit(csr_data)

    matrix_anime_ids = user_item_matrix['anime_id']
    # One extra neighbour accounts for the query itself; kneighbors() rejects
    # a request larger than the number of fitted rows, so clamp it.
    n_neighbors = min(recommendations + 1, csr_data.shape[0])

    result = {}
    for word in search_words:
        # na=False: rows with a missing Name count as "no match" instead of
        # producing NaN entries that cannot be used as a boolean mask.
        name_matches = anime_data['Name'].str.contains(word, na=False)
        candidate_ids = anime_data.loc[name_matches, 'anime_id']
        # Only titles that have a row in the rating matrix can be queried.
        candidate_ids = candidate_ids[candidate_ids.isin(matrix_anime_ids)]
        if candidate_ids.empty:
            raise IndexError(
                f"No anime matching {word!r} is present in the rating matrix"
            )
        query_anime_id = candidate_ids.iloc[0]

        # Positional row of the query in csr_data, independent of whatever
        # index labels user_item_matrix carries.
        query_row = int(
            np.flatnonzero((matrix_anime_ids == query_anime_id).to_numpy())[0]
        )

        distances, indices = knn.kneighbors(csr_data[query_row], n_neighbors=n_neighbors)
        neighbours = sorted(
            zip(indices[0].tolist(), distances[0].tolist()),
            key=lambda pair: pair[1],
        )
        # Remove the query itself explicitly: with tied distances it is not
        # guaranteed to be the first neighbour.
        neighbours = [pair for pair in neighbours if pair[0] != query_row]
        neighbours = neighbours[:recommendations]

        recom_list = []
        for row_pos, dist in neighbours:
            neighbour_id = matrix_anime_ids.iloc[row_pos]
            name = anime_data.loc[anime_data['anime_id'] == neighbour_id, 'Name'].iloc[0]
            recom_list.append({'Name': name, 'Distance': dist})

        # Size the 1-based index from the rows actually found, so fewer
        # neighbours than requested do not raise.
        result[word] = pd.DataFrame(
            recom_list,
            columns=['Name', 'Distance'],
            index=range(1, len(recom_list) + 1),
        )
    return result

# Top 10 anime recommendations based on user preferences

In [26]:
# Query the item-based recommender for two titles, 10 neighbours each.
# `results` maps each search word to its recommendation DataFrame.
search_words = ['Bleach', 'Naruto']
recommendations = 10
results = get_recommendations(search_words, recommendations)

In [27]:
# Show the recommendations returned for the 'Bleach' query
print(results['Bleach'])

                                               Name  Distance
1                      Bleach: Memories in the Rain  0.768444
2                                    Lovely★Complex  0.786597
3   Dragon Ball Z Movie 01: Ora no Gohan wo Kaese!!  0.793919
4                              High School DxD Hero  0.794663
5                  Black Lagoon: The Second Barrage  0.796375
6                                 Mekakucity Actors  0.796650
7                               Fullmetal Alchemist  0.798496
8                  Sora no Otoshimono: Project Pink  0.802480
9                                          Hellsing  0.803193
10                               Shingeki no Kyojin  0.803194


In [28]:
# Show the recommendations returned for the 'Naruto' query
print(results['Naruto'])

                                    Name  Distance
1       Black Lagoon: The Second Barrage  0.769668
2                        Soul Eater NOT!  0.777317
3                            Psycho-Pass  0.789136
4   Fate/kaleid liner Prisma☆Illya 2wei!  0.789171
5                   Nekomonogatari: Kuro  0.791616
6                                  K-On!  0.793678
7                             Death Note  0.794274
8                              Yuru Yuri  0.795366
9            Kuroko no Basket 2nd Season  0.795392
10  Blood Lad: Wagahai wa Neko de wa Nai  0.796539


---

# Расчет MAP@10 для item-based collaborative filtering на основе алгоритма K-Nearest Neighbors (KNN)

In [29]:
# Items each user has in the TRAINING set: user_id -> list of anime_id taken
# from train_ratings (row order preserved).
# A single groupby pass replaces one full-frame scan per user; sort=False
# keeps users in order of first appearance, matching unique().
actualtrain_ratings = {
    uid: list(ids)
    for uid, ids in train_ratings.groupby('user_id', sort=False)['anime_id']
}

In [30]:
# Build the list of "actual" items, one entry per user of test_ratings,
# looked up in actualtrain_ratings (i.e. the users' training-set items).
# NOTE(review): a test user that is absent from actualtrain_ratings raises
# KeyError here; also confirm that training items are the intended ground truth.
actual = [actualtrain_ratings[user_id] for user_id in test_ratings['user_id'].unique()]


In [31]:
# Build the predictions: every test user receives the same list - the anime
# returned for the 'Bleach' query - so the names are resolved to ids once
# instead of once per user.
# NOTE(review): the list is not personalised per user - confirm this is the
# intended evaluation.
bleach_names = results['Bleach']['Name'].head(10)
bleach_anime_ids = [
    anime_data[anime_data['Name'] == anime_name].iloc[0]['anime_id']
    for anime_name in bleach_names
]

# Each user gets an independent copy of the list.
predicted = [list(bleach_anime_ids) for _ in test_ratings['user_id'].unique()]


In [32]:
# Score the item-based list (`predicted`) against `actual`, which holds the
# users' training-set items, with mapk at k=10
mapk_train_score = mapk(actual, predicted, k=10)
print("MAPK@10 score: ", mapk_train_score)

MAPK@10 score:  0.06710714285714287


In [33]:
# Relative difference of mapk_train_score over mapk_test_score, in percent;
# falls back to 100 when mapk_test_score is 0 to avoid dividing by zero.
# NOTE(review): the two scores were computed against different ground truths
# (test-set items vs. training-set items) - confirm they are comparable.
mapk_changes = ((mapk_train_score - mapk_test_score) / mapk_test_score) * 100 if mapk_test_score != 0 else 100
print(f"mapk_train_score на {mapk_changes:.2f}% больше mapk_test_score.")

mapk_train_score на 77.02% больше mapk_test_score.
