In [106]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from src.mapk import *

# Data Preprocessing

In [107]:
# Directory that holds the input CSV files (animelist.csv, anime.csv).
# It can be overridden with the ANIME_DATA_DIR environment variable so the notebook
# is not tied to a single machine; without it the original local path is used.
INPUT_DIR = os.environ.get('ANIME_DATA_DIR', 'C:/Dataset')

In [108]:
# Load the input files: the per-user ratings (only the three columns used below)
# and the anime catalogue.
# NOTE(review): decimal=',' tells pandas that ',' is the decimal separator, while the
# anime.csv preview shows dot decimals (e.g. Score 8.78) — confirm this is intended.
anime_ratings = pd.read_csv(
    f"{INPUT_DIR}/animelist.csv",
    usecols=["user_id", "anime_id", "rating"],
    decimal=',',
    low_memory=False,
)
anime_data = pd.read_csv(
    f"{INPUT_DIR}/anime.csv",
    decimal=',',
    low_memory=False,
)

In [109]:
# Preview the contents of anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170,182126,131625,62330,20688,8904,3184,1357,741,1580
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,30043,49201,49505,22632,5805,1877,577,221,109,379
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229,75651,86142,49432,15376,5838,1965,664,316,533


In [110]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.model_selection import train_test_split

# Step 1: 60% / 40% split. The 60% part overwrites anime_ratings and the 40% part is
# bound to train_ratings, so re-running this cell alone keeps shrinking the data.
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.4, random_state=42)

# Step 2: halve the 40% part -> train_ratings and test_ratings each hold ~20% of all rows.
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

# Step 3: halve test_ratings again -> test_ratings and val_ratings each hold ~10% of all rows.
# NOTE(review): the earlier comments described 95/5 and 80/20 splits, which do not match
# these test_size values — confirm which proportions are actually intended.
test_ratings, val_ratings = train_test_split(test_ratings, test_size=0.5, random_state=42)

In [111]:
# Rows left in anime_ratings after the first split (~60% of the original ratings, not 100%)
len(anime_ratings)


65534848

In [112]:
# Rows in train_ratings (~20% of the original ratings)
len(train_ratings)

21844949

In [113]:
# Rows in val_ratings (~10% of the original ratings)
len(val_ratings)

10922475

In [114]:
# Rows in test_ratings (~10% of the original ratings)
len(test_ratings)

10922475

In [115]:
# Preview anime_ratings (rows read from animelist.csv that remain after the first split)
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
70013597,226554,37675,0
91229590,294769,889,8
51336492,166388,24439,8


In [116]:
# Keep only users that have at least 500 rows in train_ratings
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings.loc[
    train_ratings['user_id'].map(ntrain_ratings).ge(500)
].copy()
len(train_ratings)

1012937

In [117]:
# Keep only users that have at least 500 rows in test_ratings
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings.loc[
    test_ratings['user_id'].map(ntest_ratings).ge(500)
].copy()
len(test_ratings)

153948

In [118]:
# Baseline recommender: for every test user pick 10 titles from the training
# catalogue for which the user has no row in train_ratings.
# NOTE(review): the 10 titles are taken in set-iteration order, i.e. they are not
# ranked by any score — confirm this is the intended baseline.
user_recommendations = {}

# Loop-invariant work is done once instead of once per user: the set of all anime
# seen in training, and each user's rated anime_ids (a single groupby replaces a
# full boolean scan of train_ratings on every iteration).
train_catalogue = set(train_ratings['anime_id'].unique())
rated_by_user = train_ratings.groupby('user_id')['anime_id'].unique()

for user_id in test_ratings['user_id'].unique():
    # A user absent from train_ratings has rated nothing there -> empty default.
    anime_not_rated = list(train_catalogue - set(rated_by_user.get(user_id, ())))
    recommended_anime = anime_not_rated[:10]  # first 10 unrated titles
    user_recommendations[user_id] = recommended_anime


In [119]:
# Ground truth per test user: the anime_ids the user has rows for in test_ratings
actual_ratings = {
    user_id: test_ratings.loc[test_ratings['user_id'] == user_id, 'anime_id'].tolist()
    for user_id in test_ratings['user_id'].unique()
}

In [120]:
# Put recommendations and ground truth into two lists that share the same user order
# (the iteration order of user_recommendations), as expected by mapk
recommended_ratings = list(user_recommendations.values())
actual_ratings_list = [actual_ratings[user_id] for user_id in user_recommendations]

In [121]:
# Compute MAPK@10 for the baseline recommendations built above
# (mapk is presumably provided by the star import of src.mapk — verify)
mapk_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_score)

MAPK@10 score:  0.037910192147034255


---

---

In [122]:
# Keep a single rating per (user_id, anime_id) pair.
# DataFrame.pivot (used below to build the rating matrix) raises ValueError when an
# index/column pair occurs more than once, and a plain drop_duplicates() only removes
# rows identical in every column — the same pair with two different ratings would
# slip through. De-duplicating on the key columns keeps the first row of each pair;
# exact duplicate rows are still removed as before.
train_ratings = train_ratings.drop_duplicates(subset=['user_id', 'anime_id'])


In [123]:
# Preview train_ratings after de-duplication
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
81856148,264980,19111,9
25511137,82737,30740,7
17186823,55748,9201,8


In [124]:
# Build the pivot table of ratings:
# rows (index) are anime_id, columns are user_id, cell values are the rating.
user_item_matrix = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,,,5.0,,,,,8.0,,...,,,,0.0,,,,,,
5,,,,,,,,,,,...,8.0,,8.0,0.0,8.0,8.0,,,,
6,,,,,,,9.0,,8.0,,...,,,,,,,,,9.0,
7,,,7.0,,0.0,,,,,,...,,,,,,,,,,
8,,,,,,,,,6.0,,...,,,,,0.0,,,,,


In [125]:
# Replace NaN (no row for that anime/user pair) with 0.
# NOTE(review): explicit 0 ratings already occur in the data, so after this step they
# cannot be told apart from missing entries — confirm that is acceptable.
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# Shape of the rating matrix: (number of anime, number of users) — rows are anime, columns are users
user_item_matrix.shape

(17544, 1294)

In [127]:
# Convert the mostly-zero rating matrix to SciPy CSR format.
# .values hands csr_matrix only the cell values (no index/column labels), so each
# CSR row is one anime, in the same order as the rows of user_item_matrix.
csr_data = csr_matrix(user_item_matrix.values)

In [128]:
# Preview user_item_matrix once more
user_item_matrix.head()

user_id,781,890,1397,1469,3021,3160,3578,4132,4773,5045,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
# Drop the 'user_id' name from the column axis and turn the anime_id index into a
# regular column, leaving a default 0..n-1 row index.
# csr_data was built before this step, so it holds the rating columns only.
# NOTE(review): not idempotent — re-running this cell resets the index again and
# adds one more column.
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()
user_item_matrix.head()

Unnamed: 0,anime_id,781,890,1397,1469,3021,3160,3578,4132,4773,...,350166,350286,350902,350981,351069,351361,351801,352301,352583,352811
0,1,7.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,8.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
3,7,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# User preferences

In [130]:
# Number of recommendations to return
recommendations = 10

# Title to search for (matched as a substring of anime_data['Name'])
search_word = 'Bleach'

# Item Based Collaborative Filtering

In [131]:
# NearestNeighbors with cosine distance and brute-force search, parallelised with n_jobs = -1.
# n_neighbors = 20 is only the default; the query further down passes its own n_neighbors.
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20, n_jobs = -1)

# Fit the model: every row of csr_data (one anime) is a point
knn.fit(csr_data)

In [132]:
# Look up the query title in anime_data.
# regex=False matches search_word literally (titles may contain characters such as
# '?', '(' or '+', which are regex metacharacters); na=False treats missing names as
# non-matches instead of putting NaN into the boolean mask.
anime_search = anime_data[anime_data['Name'].str.contains(search_word, regex=False, na=False)]
anime_search.head(5)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
245,269,Bleach,7.8,"Action, Adventure, Comedy, Super Power, Supern...",Bleach,BLEACH - ブリーチ -,TV,366,"Oct 5, 2004 to Mar 27, 2012",Fall 2004,...,116063,136177,192980,174721,81327,42155,16961,6766,3102,2906
697,762,Bleach: Memories in the Rain,7.15,"Action, Adventure, Supernatural, Drama, Shounen",Bleach:Memories in the Rain,BLEACH Memories in the Rain,Special,1,18-Dec-04,Unknown,...,5593,6434,12020,17902,10193,4975,1556,540,264,230
753,834,Bleach: The Sealed Sword Frenzy,6.98,"Action, Adventure, Comedy, Super Power, Supern...",Unknown,BLEACH The Sealed Sword Frenzy,Special,1,23-Mar-06,Unknown,...,4581,5181,10120,16039,10501,5437,1765,719,353,243
1533,1686,Bleach Movie 1: Memories of Nobody,7.45,"Action, Adventure, Comedy, Super Power, Supern...",Bleach the Movie:Memories of Nobody,劇場版 BLEACH MEMORIES OF NOBODY,Movie,1,16-Dec-06,Unknown,...,14687,20285,35194,40166,19111,8400,2558,960,425,264
2645,2889,Bleach Movie 2: The DiamondDust Rebellion - Mo...,7.45,"Action, Adventure, Comedy, Super Power, Supern...",Bleach the Movie:The DiamondDust Rebellion,劇場版 BLEACH The DiamondDust Rebellion もう一つの氷輪丸,Movie,1,22-Dec-07,Unknown,...,13103,17374,30291,34381,15917,7215,2403,920,421,236


In [133]:
# anime_id (catalogue id, column 'anime_id') of the first search hit
anime_id = anime_search['anime_id'].iloc[0]

# Re-bind anime_id to the row label of that title in user_item_matrix; with the
# default 0..n-1 index this is also its row position in csr_data
anime_id = user_item_matrix.index[(user_item_matrix['anime_id'] == anime_id).to_numpy()][0]
anime_id

245

In [134]:
# Query the fitted model for the titles closest to the selected anime.
# One extra neighbour is requested (recommendations + 1) because the query row itself
# is expected to come back as its own nearest neighbour and is removed later.
distances, indices = knn.kneighbors(csr_data[anime_id], n_neighbors = recommendations + 1)

In [135]:
# Row indices of the returned neighbours
indices

array([[  245,   697,  1854,   809, 12337,  1379,  8496,   100,  5176,
          246,  7449]], dtype=int64)

In [136]:
# Cosine distances to those neighbours
distances

array([[9.99200722e-16, 7.68444039e-01, 7.86597475e-01, 7.93919418e-01,
        7.94663293e-01, 7.96375020e-01, 7.96650192e-01, 7.98495936e-01,
        8.02479940e-01, 8.03193181e-01, 8.03194243e-01]])

In [137]:
# Flatten the (1, k) result arrays into plain Python lists.
# ravel() is used instead of squeeze(): squeeze() collapses a (1, 1) result to a 0-d
# array whose tolist() is a bare scalar, which zip() below cannot iterate.
indices_list = indices.ravel().tolist()
distances_list = distances.ravel().tolist()

# Pair every neighbour's row index with its distance as (index, distance) tuples
indices_distances = list(zip(indices_list, distances_list))
print(type(indices_distances[0]))

# First three (index, distance) pairs
print(indices_distances[:3])

<class 'tuple'>
[(245, 9.992007221626409e-16), (697, 0.7684440387703864), (1854, 0.7865974750300192)]


In [138]:
# Sort the (row index, distance) pairs by distance (second element), ascending
indices_distances_sorted = sorted(indices_distances, key = lambda x: x[1], reverse = False)

# Remove the query itself by its row index rather than by position: when another
# title has an identical rating vector the query is not guaranteed to come first,
# and blindly dropping element 0 would discard a real neighbour and keep the query.
indices_distances_sorted = [pair for pair in indices_distances_sorted if pair[0] != anime_id]

# If the query was not among the returned neighbours nothing was removed above,
# so cap the list at the requested number of recommendations.
indices_distances_sorted = indices_distances_sorted[:recommendations]
indices_distances_sorted

[(697, 0.7684440387703864),
 (1854, 0.7865974750300192),
 (809, 0.7939194182395339),
 (12337, 0.7946632925697469),
 (1379, 0.7963750196112722),
 (8496, 0.7966501920527922),
 (100, 0.7984959359589737),
 (5176, 0.8024799404885401),
 (246, 0.8031931809683993),
 (7449, 0.8031942427891764)]

In [139]:
# Collect the title and distance of every recommended anime
recom_list = []

# Walk through the (row position, distance) tuples one by one
for row_pos, dist in indices_distances_sorted:

    # Map the matrix row position back to its anime_id. The column is selected first
    # so the id keeps its own dtype (taking the whole row would upcast it to float
    # together with the ratings).
    matrix_movie_id = user_item_matrix['anime_id'].iloc[row_pos]

    # Look the title up by value with .loc. The previous code passed index *labels*
    # to .iloc, which is only correct while anime_data has a default 0..n-1 index,
    # and kept them in a variable that shadowed the built-in id().
    Name = anime_data.loc[anime_data['anime_id'] == matrix_movie_id, 'Name'].iloc[0]

    # Each title/distance pair becomes one dict, i.e. one future DataFrame row
    recom_list.append({'Name' : Name, 'Distance' : dist})

In [140]:
# First recommendation
recom_list[0]

{'Name': 'Bleach: Memories in the Rain', 'Distance': 0.7684440387703864}

# Top 10 anime recommendations based on user preferences

In [141]:
# Turn the list into a DataFrame with a 1-based index so it reads as a ranking.
# The index length follows recom_list itself: hard-coding `recommendations` fails
# whenever the list is shorter or longer than that setting (e.g. after
# `recommendations` is edited without re-running the cells above).
recom_df = pd.DataFrame(recom_list, index = range(1, len(recom_list) + 1))
recom_df

Unnamed: 0,Name,Distance
1,Bleach: Memories in the Rain,0.768444
2,Lovely★Complex,0.786597
3,Dragon Ball Z Movie 01: Ora no Gohan wo Kaese!!,0.793919
4,High School DxD Hero,0.794663
5,Black Lagoon: The Second Barrage,0.796375
6,Mekakucity Actors,0.79665
7,Fullmetal Alchemist,0.798496
8,Sora no Otoshimono: Project Pink,0.80248
9,Hellsing,0.803193
10,Shingeki no Kyojin,0.803194


---

In [None]:
# TODO: build the dataset of found (model-based) recommendations for each user


In [None]:
# TODO: build the dataset of actual ratings for each user from the training set


In [None]:
# Compute MAPK@10
# NOTE(review): the two cells above are still empty placeholders, so
# actual_ratings_list and recommended_ratings here are presumably the baseline lists
# built earlier in the notebook, not KNN results — confirm before trusting this score.
mapk_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_score)