In [158]:
import pandas as pd
import numpy as np

# модуль sparse библиотеки scipy используется 
# для работы с разреженными матрицами
from scipy.sparse import csr_matrix

# из sklearn использую алгоритм k-ближайших соседей
from sklearn.neighbors import NearestNeighbors

In [159]:
# Read the external CSV files into DataFrames
animeTest = pd.read_csv('dataset/animeTest.csv')
ratingsTest = pd.read_csv('dataset/ratingsTest.csv')

In [160]:
# Preview the first three rows loaded from animeTest.csv
animeTest.head(3)

Unnamed: 0,MAL_ID,Name,Episodes,Duration
0,1,Cowboy Bebop,26,24 min. per ep.
1,5,Cowboy Bebop: Tengoku no Tobira,1,1 hr. 55 min.
2,6,Trigun,26,24 min. per ep.


In [161]:
# Preview ratingsTest.csv after discarding the two columns the recommender does not use.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so that the cell
# is idempotent: re-running it once the columns are already gone no longer raises KeyError.
ratingsTest = ratingsTest.drop(columns = ['watching_status', 'watched_episodes'], errors = 'ignore')
ratingsTest.head(3)

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10


In [162]:
# Build a pivot table of the ratings.
# Rows are anime (anime_id), columns are users (user_id), cell values are the ratings
user_item_matrix = ratingsTest.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,9.0,,,6.0,,0.0,,...,,,0.0,,10.0,10.0,7.0,0.0,10.0,10.0
5,,,,,,,0.0,,,,...,,,,,,,,,,10.0
6,,,,,,,0.0,,,,...,,,9.0,,,,9.0,,8.0,
7,,,,,,,0.0,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [163]:
# Replace the missing ratings (NaN) with zeros
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,9.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,10.0,7.0,0.0,10.0,10.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,9.0,0.0,8.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
# Shape of the rating matrix: (number of anime rows, number of user columns)
user_item_matrix.shape

(12401, 985)

In [165]:
# Number of ratings submitted by each user
users_votes = ratingsTest.groupby('user_id')['rating'].count()

# Number of ratings received by each anime
anime_votes = ratingsTest.groupby('anime_id')['rating'].count()

In [166]:
# Build the filters: labels of users with more than 50 ratings
# and of anime with more than 10 ratings
user_mask = users_votes.loc[users_votes.gt(50)].index
anime_mask = anime_votes.loc[anime_votes.gt(10)].index

In [167]:
# Keep only the anime rows and the user columns selected by the two filters
user_item_matrix = user_item_matrix.loc[anime_mask, user_mask]

In [168]:
# Shape of the matrix after filtering: (anime rows, user columns)
user_item_matrix.shape

(4604, 856)

In [169]:
# Convert the rating matrix to SciPy's CSR sparse format.
# to_numpy() hands csr_matrix only the values of the DataFrame, without its labels
csr_data = csr_matrix(user_item_matrix.to_numpy())

# Print the stored entries of the first 2 rows x 5 columns to compare with the table above
print(csr_data[:2, :5])

  (0, 3)	9.0


In [170]:
# Clear the name of the column axis and turn the anime_id index into a regular
# column with reset_index(), so rows get a fresh 0..n-1 index
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()
user_item_matrix.head()

Unnamed: 0,anime_id,0,1,2,3,4,5,6,7,8,...,1063,1064,1065,1066,1068,1069,1070,1071,1072,1073
0,1,0.0,0.0,0.0,9.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,10.0,7.0,0.0,10.0,10.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.0,0.0,9.0,0.0,8.0,0.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
# Use the NearestNeighbors class to find distances between rows:
# cosine metric, brute-force search, 20 neighbours unless a query overrides it
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20, n_jobs = -1)

# Fit the model on the sparse rating matrix
knn.fit(csr_data)

In [172]:
# Number of recommendations to produce
recommendations = 30

# Title (or part of a title) of the anime to search for
search_word = 'Sword Art Online'

In [173]:
# Look up the anime in the animeTest DataFrame by title substring.
# regex = False: the query is matched literally, so titles containing regex
#   metacharacters such as ( ) + ? * are not misinterpreted or rejected as a bad pattern.
# na = False: rows with a missing Name count as "no match" instead of producing
#   NaN in the mask, which boolean indexing cannot use.
anime_search = animeTest[animeTest['Name'].str.contains(search_word, regex = False, na = False)]
anime_search

Unnamed: 0,MAL_ID,Name,Episodes,Duration
6614,11757,Sword Art Online,25,23 min. per ep.
7380,16099,Sword Art Online: Sword Art Offline,9,11 min. per ep.
8206,20021,Sword Art Online: Extra Edition,1,1 hr. 41 min.
8550,21879,Sword Art Online: Sword Art Offline - Extra Ed...,1,13 min.
8551,21881,Sword Art Online II,24,23 min. per ep.
9663,27891,Sword Art Online II: Debriefing,1,24 min.
9702,28063,Sword Art Online II: Sword Art Offline II,9,13 min. per ep.
11110,31765,Sword Art Online Movie: Ordinal Scale,1,1 hr. 59 min.
13637,36439,Sword Art Online Movie: Ordinal Scale - Sword ...,1,14 min.
13652,36474,Sword Art Online: Alicization,24,24 min. per ep.


In [174]:
# Take the MAL_ID of the first title returned by the search.
# Fail with an explicit message when the search matched nothing
# (the exception type stays IndexError, as with the bare iloc[0]).
if len(anime_search) == 0:
    raise IndexError(f'No anime title contains {search_word!r}')
query_mal_id = anime_search['MAL_ID'].iloc[0]

# Map that MAL_ID to the row of the preference matrix holding the same anime.
# An anime that did not survive the vote-count filter has no row there,
# so report that clearly instead of an opaque "index 0 is out of bounds".
matching_rows = user_item_matrix[user_item_matrix['anime_id'] == query_mal_id].index
if len(matching_rows) == 0:
    raise IndexError(f'Anime with MAL_ID {query_mal_id} is not present in the preference matrix')

# NOTE: despite its name, anime_id now holds the row index in the matrix, not a MAL_ID
anime_id = matching_rows[0]
anime_id

2267

In [175]:
# Find the row indices of, and the distances to, the anime most similar to the query
# using kneighbors(); recommendations + 1 neighbours are requested
distances, indices = knn.kneighbors(csr_data[anime_id], n_neighbors = recommendations + 1)

In [176]:
# Row indices of the neighbours returned by kneighbors()
indices

array([[2267, 2886, 2588, 2907, 2764, 1695, 2154, 3237,  896, 2902, 2028,
        3119, 1841, 2787, 1954, 3432, 3326, 1448, 2989, 3133, 1538, 2255,
        2537, 1715,  918, 2182, 2469,    9, 2217, 3136, 2720]],
      dtype=int64)

In [177]:
# Distances to those neighbours
distances

array([[0.        , 0.18417551, 0.2308626 , 0.25549963, 0.26669002,
        0.27604024, 0.28452597, 0.31918086, 0.32264486, 0.33823867,
        0.33921378, 0.3408711 , 0.34808592, 0.35451971, 0.35460821,
        0.35542082, 0.35720427, 0.3579342 , 0.36234497, 0.36280493,
        0.36524392, 0.36610036, 0.36672097, 0.37794065, 0.38312939,
        0.38426183, 0.38488353, 0.38803554, 0.39182285, 0.39220067,
        0.39318657]])

In [178]:
# Flatten the result arrays and convert them to plain Python lists with tolist().
# ravel() is used rather than squeeze(): squeeze() collapses a single-neighbour
# result of shape (1, 1) into a 0-d array whose tolist() is a bare scalar,
# which zip() below cannot iterate over.
indices_list = indices.ravel().tolist()
distances_list = distances.ravel().tolist()

# Combine the two lists into a list of (index, distance) tuples with zip and list
indices_distances = list(zip(indices_list, distances_list))
print(type(indices_distances[0]))

# First three pairs/tuples
print(indices_distances[:3])

<class 'tuple'>
[(2267, 0.0), (2886, 0.1841755135600205), (2588, 0.23086260137213277)]


In [179]:
# Sort the pairs by distance (second tuple element) in ascending order
indices_distances_sorted = sorted(indices_distances, key = lambda pair: pair[1])

# Remove the query itself from the results.
# It is identified by its row index (anime_id) rather than by assuming it is the
# first element: another anime with an identical rating vector also sits at
# distance 0 and may be ranked ahead of the query.
# The slice keeps at most `recommendations` entries even if the query was not
# among the returned neighbours.
indices_distances_sorted = [
    pair for pair in indices_distances_sorted if pair[0] != anime_id
][:recommendations]
indices_distances_sorted

[(2886, 0.1841755135600205),
 (2588, 0.23086260137213277),
 (2907, 0.25549962714078467),
 (2764, 0.2666900203551239),
 (1695, 0.2760402369485355),
 (2154, 0.28452596871045965),
 (3237, 0.31918086419913283),
 (896, 0.322644864550088),
 (2902, 0.33823867147131526),
 (2028, 0.33921378430937676),
 (3119, 0.34087109556604567),
 (1841, 0.34808591629399466),
 (2787, 0.354519708527955),
 (1954, 0.3546082080126478),
 (3432, 0.3554208162594982),
 (3326, 0.35720426934681015),
 (1448, 0.357934197205257),
 (2989, 0.36234496888400447),
 (3133, 0.3628049292512352),
 (1538, 0.3652439190903395),
 (2255, 0.36610036273342395),
 (2537, 0.3667209705412614),
 (1715, 0.3779406534992419),
 (918, 0.3831293888741376),
 (2182, 0.38426182812113174),
 (2469, 0.38488352708218276),
 (9, 0.38803553728180884),
 (2217, 0.39182285495040925),
 (3136, 0.39220066965327394),
 (2720, 0.393186574826362)]

In [180]:
# MAL_ID -> Name lookup, built once instead of rescanning animeTest on every iteration.
# drop_duplicates keeps the first row per MAL_ID, i.e. the row the previous
# "first match" logic selected. Looking names up by MAL_ID also avoids feeding
# index labels to the positional indexer iloc.
name_by_mal_id = animeTest.drop_duplicates(subset = 'MAL_ID').set_index('MAL_ID')['Name']

# anime_id column of the preference matrix; kneighbors results index into its rows
matrix_anime_ids = user_item_matrix['anime_id']

# List that collects the title of each recommended anime and the distance to it
recom_list = []

# Walk through the (row index, distance) tuples in order
for row_pos, dist in indices_distances_sorted:

    # anime_id stored in that row of the preference matrix
    matrix_movie_id = matrix_anime_ids.iloc[row_pos]

    # Title of that anime; an id that is absent from animeTest gets a placeholder
    # instead of aborting the whole loop with an IndexError
    Name = name_by_mal_id.get(matrix_movie_id, f'Unknown (MAL_ID={matrix_movie_id})')

    # Each pair goes into a dict, which becomes one element of recom_list
    recom_list.append({'Name' : Name, 'Distance' : dist})

In [181]:
# First element of the recommendation list
recom_list[0]

{'Name': 'Sword Art Online II', 'Distance': 0.1841755135600205}

In [182]:
# Convert the list of dicts into a DataFrame.
# The index starts at 1, as a ranking should. Its length is taken from recom_list
# itself rather than from `recommendations`, so the constructor does not raise a
# length-mismatch ValueError when fewer rows were collected than requested.
# Explicit columns keep the frame's layout stable even for an empty list.
recom_df = pd.DataFrame(recom_list, columns = ['Name', 'Distance'], index = range(1, len(recom_list) + 1))
recom_df

Unnamed: 0,Name,Distance
1,Sword Art Online II,0.184176
2,Shingeki no Kyojin,0.230863
3,Tokyo Ghoul,0.2555
4,No Game No Life,0.26669
5,Angel Beats!,0.27604
6,Mirai Nikki,0.284526
7,One Punch Man,0.319181
8,Death Note,0.322645
9,Akame ga Kill!,0.338239
10,Ao no Exorcist,0.339214
