In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import json
from src.mapk import *

In [36]:
# Directory that contains the dataset CSV files
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
INPUT_DIR = 'C:/Dataset'

In [37]:
# Load the source tables.
# anime_ratings: only the user_id / anime_id / rating columns of animelist.csv
# anime_data:    the full anime.csv catalogue
# NOTE(review): decimal=',' is kept as-is, but the anime.csv preview shows '.'
# as the decimal point (e.g. Score 8.78) — confirm this option is intended.
anime_ratings = pd.read_csv(
    f'{INPUT_DIR}/animelist.csv',
    usecols=["user_id", "anime_id", "rating"],
    decimal=',',
    low_memory=False,
)
anime_data = pd.read_csv(
    f'{INPUT_DIR}/anime.csv',
    decimal=',',
    low_memory=False,
)

In [38]:
# Preview the first rows loaded from anime.csv
anime_data.head(3)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all rating rows as the test set (80% remain in train_ratings)
train_ratings, test_ratings = train_test_split(anime_ratings, test_size=0.20, random_state=42)

# Split the remaining 80% again: 25% of it (= 20% of the total) becomes the
# validation set, leaving 60% of the total for training
train_ratings, val_ratings = train_test_split(train_ratings, test_size=0.25, random_state=42)

In [40]:
# Total number of rating rows (100%)
len(anime_ratings)

109224747

In [41]:
# Training split size (60% of all rows)
len(train_ratings)

65534847

In [42]:
# Validation split size (20% of all rows)
len(val_ratings)

21844950

In [43]:
# Test split size (20% of all rows)
len(test_ratings)

21844950

In [44]:
# Preview of the ratings table (anime_ratings, loaded from animelist.csv)
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10


In [45]:
# Keep only users who have at least 500 ratings in train_ratings
# n_ratings: number of training rows per user_id
n_ratings = train_ratings['user_id'].value_counts()
# .copy() detaches the filtered frame from the original one
train_ratings = train_ratings[train_ratings['user_id'].isin(n_ratings[n_ratings >= 500].index)].copy()
len(train_ratings)

20425705

In [46]:
# Keep only users who have at least 500 ratings in val_ratings
# NOTE(review): the count is taken on the validation split itself, so the users
# kept here are not necessarily the users kept in train_ratings — confirm intended
n_ratings = val_ratings['user_id'].value_counts()
val_ratings = val_ratings[val_ratings['user_id'].isin(n_ratings[n_ratings >= 500].index)].copy()
len(val_ratings)

1019968

In [47]:
# Build a baseline recommendation list for every user of the validation set:
# 10 anime that occur in train_ratings and that the user has not rated there.
# NOTE(review): the 10 items are simply the first 10 in set-iteration order,
# i.e. they are not ranked by any score.
user_recommendations = {}

# Loop-invariant: the catalogue of anime present in the training data
all_train_anime = set(train_ratings['anime_id'].unique())

# Group the training rows once instead of scanning train_ratings per user
val_user_ids = val_ratings['user_id'].unique()
train_of_val_users = train_ratings[train_ratings['user_id'].isin(val_user_ids)]
rated_by_user = {
    uid: set(anime_ids.unique())
    for uid, anime_ids in train_of_val_users.groupby('user_id')['anime_id']
}

for user_id in val_user_ids:
    # Users without any training rows have rated nothing -> empty set
    anime_not_rated = list(all_train_anime - rated_by_user.get(user_id, set()))
    recommended_anime = anime_not_rated[:10]  # keep 10 recommendations
    user_recommendations[user_id] = recommended_anime


In [48]:
# Ground truth per user: the anime ids each user actually rated in the
# validation set (val_ratings), in row order.
# A single groupby pass replaces one full scan of val_ratings per user;
# sort=False keeps the users in order of first appearance.
actual_ratings = {
    user_id: anime_ids.tolist()
    for user_id, anime_ids in val_ratings.groupby('user_id', sort=False)['anime_id']
}

In [49]:
# Turn the two per-user dicts into parallel lists (same user order in both):
# position i of each list belongs to the same user.
recommended_ratings = list(user_recommendations.values())
actual_ratings_list = [actual_ratings[user_id] for user_id in user_recommendations]

In [50]:
# Compute MAP@K with k=10 over all users
# (mapk is presumably provided by the star import from src.mapk — verify)
mapk_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_score)

MAPK@10 score:  0.12652542835315944


In [55]:
# Remove duplicated (user_id, anime_id) pairs, keeping the first occurrence.
# The pivot() call further down needs every (anime_id, user_id) pair to be unique;
# de-duplicating whole rows would still leave pairs that differ only in rating.
train_ratings = train_ratings.drop_duplicates(subset=['user_id', 'anime_id'])


In [56]:
# Preview the filtered, de-duplicated training ratings
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
41016262,133003,16498,8
94975070,306958,13367,8
66606738,215568,1818,8


In [57]:
# Build the pivot table.
# Rows are anime (index='anime_id'), columns are users (columns='user_id'),
# cell values are ratings; pairs without a rating become NaN
user_item_matrix = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix.head()

user_id,17,19,42,47,60,111,121,145,146,147,...,353302,353304,353311,353324,353325,353326,353365,353390,353395,353398
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,8.0,,10.0,,,,,,8.0,...,0.0,7.0,,10.0,,,,0.0,,
5,,6.0,,0.0,0.0,,,,,,...,,,,,,,,0.0,8.0,
6,,,0.0,,0.0,9.0,,,,10.0,...,,7.0,,,8.0,,,,9.0,
7,,,,,0.0,0.0,,,,,...,,,,,9.0,,,,0.0,
8,,,,,,,,,,,...,,,,,,0.0,,,,


In [58]:
# Replace NaN (no rating) with zeros, modifying the matrix in place
user_item_matrix.fillna(0, inplace = True)
user_item_matrix.head()

user_id,17,19,42,47,60,111,121,145,146,147,...,353302,353304,353311,353324,353325,353326,353365,353390,353395,353398
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,7.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
6,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,10.0,...,0.0,7.0,0.0,0.0,8.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# Shape of the matrix: (number of anime, number of users)
user_item_matrix.shape

(17559, 25326)

In [60]:
# Number of (non-null) ratings per user
users_votes = train_ratings.groupby('user_id')['rating'].count()

# Number of (non-null) ratings per anime
anime_votes = train_ratings.groupby('anime_id')['rating'].count()

In [61]:
# Convert the mostly-zero matrix to CSR (compressed sparse row) format
# .values passes only the cell values of the DataFrame to csr_matrix,
# so row i of csr_data corresponds to row i of user_item_matrix
csr_data = csr_matrix(user_item_matrix.values)

In [62]:
# Preview the dense matrix once more
user_item_matrix.head()

user_id,17,19,42,47,60,111,121,145,146,147,...,353302,353304,353311,353324,353325,353326,353365,353390,353395,353398
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,7.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
6,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,10.0,...,0.0,7.0,0.0,0.0,8.0,0.0,0.0,0.0,9.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Remove the name of the columns axis and reset the index with reset_index(),
# which turns anime_id into a regular column and gives the rows a 0..n-1 index
user_item_matrix = user_item_matrix.rename_axis(None, axis = 1).reset_index()
user_item_matrix.head()

Unnamed: 0,anime_id,17,19,42,47,60,111,121,145,146,...,353302,353304,353311,353324,353325,353326,353365,353390,353395,353398
0,1,0.0,8.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,8.0,0.0,0.0,0.0,9.0,0.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Collaborative Filtering

In [66]:
# Use NearestNeighbors with cosine distance to measure similarity between
# the rows of csr_data (one row per anime)
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20, n_jobs = -1)

# Fit the model
knn.fit(csr_data)

In [80]:
# Number of recommendations to return
recommendations = 30

# Title (substring) of the anime to look up
search_word = 'Naruto'

In [81]:
# Find the anime in anime_data whose Name contains search_word (case-sensitive).
# regex=False: the title is matched literally, so characters such as '(' or '?'
#              in a title are not interpreted as a regular expression.
# na=False:    rows with a missing Name count as "no match" instead of NaN,
#              which would break the boolean filter.
anime_search = anime_data[anime_data['Name'].str.contains(search_word, regex=False, na=False)]
anime_search

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
10,20,Naruto,7.91,"Action, Adventure, Comedy, Super Power, Martia...",Naruto,ナルト,TV,220,"Oct 3, 2002 to Feb 8, 2007",Fall 2002,...,216866.0,234481.0,345563.0,286175.0,108155.0,46886.0,15477.0,6098.0,3582.0,5310.0
414,442,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shin...,7.1,"Adventure, Comedy, Historical, Supernatural, D...",Naruto the Movie:Ninja Clash in the Land of Snow,劇場版　NARUTO　大活劇！雪姫忍法帖だってばよ!!,Movie,1,"Aug 21, 2004",Unknown,...,11155.0,13254.0,27363.0,41911.0,23643.0,11356.0,3685.0,1415.0,655.0,476.0
557,594,Naruto: Takigakure no Shitou - Ore ga Eiyuu Da...,6.76,"Action, Adventure, Comedy, Shounen, Super Power",Naruto:The Lost Story - Mission:Protect the Wa...,滝隠れの死闘　オレが英雄だってばよ!,Special,1,"Dec 20, 2003",Unknown,...,3335.0,3129.0,6460.0,11881.0,9383.0,5289.0,1735.0,643.0,333.0,257.0
696,761,Naruto: Akaki Yotsuba no Clover wo Sagase,6.52,"Adventure, Comedy, Shounen",Naruto:Find the Crimson Four-leaf Clover!,ナルト 紅き四つ葉のクローバーを探せ,Special,1,"May 24, 2003",Unknown,...,2833.0,2371.0,4720.0,9395.0,9249.0,5684.0,2187.0,843.0,453.0,317.0
848,936,Naruto Movie 2: Dai Gekitotsu! Maboroshi no Ch...,6.88,"Adventure, Comedy, Drama, Fantasy, Shounen, Su...",Naruto the Movie 2:Legend of the Stone of Gelel,劇場版　NARUTO　大激突！幻の地底遺跡だってばよ,Movie,1,"Aug 6, 2005",Unknown,...,8004.0,8333.0,17468.0,30890.0,20956.0,10962.0,3894.0,1435.0,698.0,467.0
975,1074,Naruto Narutimate Hero 3: Tsuini Gekitotsu! Jo...,6.77,"Game, Adventure, Comedy, Shounen",Unknown,NARUTO ナルティメットヒーロー3 ついに激突! 上忍VS下忍!! 無差別大乱戦大会開催!!,OVA,1,"Dec 22, 2005",Unknown,...,3099.0,2846.0,5250.0,9292.0,7766.0,4344.0,1512.0,662.0,317.0,228.0
1574,1735,Naruto: Shippuuden,8.16,"Action, Adventure, Comedy, Super Power, Martia...",Naruto:Shippuden,ナルト- 疾風伝,TV,500,"Feb 15, 2007 to Mar 23, 2017",Winter 2007,...,249262.0,213677.0,239113.0,175685.0,75970.0,36221.0,15913.0,6661.0,3383.0,4245.0
1963,2144,Naruto Movie 3: Dai Koufun! Mikazuki Jima no A...,6.89,"Action, Adventure",Naruto the Movie 3:Guardians of the Crescent M...,劇場版 NARUTO -ナルト- 大興奮!みかづき島のアニマル騒動だってばよ,Movie,1,"Aug 5, 2006",Unknown,...,7495.0,8053.0,16024.0,26166.0,17858.0,9526.0,3693.0,1565.0,856.0,530.0
2058,2248,Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houj...,6.87,"Action, Comedy, Sports, Martial Arts, Fantasy,...",Hidden Leaf Village Grand Sports Festival,木ノ葉の里と大うん動会,Special,1,"Aug 21, 2004",Unknown,...,4028.0,3564.0,6059.0,9089.0,6890.0,4047.0,1664.0,845.0,515.0,458.0
2267,2472,Naruto: Shippuuden Movie 1,7.29,"Action, Adventure, Comedy, Fantasy, Shounen",Naruto:Shippuden the Movie,劇場版NARUTO -ナルト- 疾風伝,Movie,1,"Aug 4, 2007",Unknown,...,12595.0,14304.0,28143.0,38664.0,19234.0,8470.0,2822.0,964.0,470.0,385.0


In [82]:
# Take the MAL_ID of the first search hit
if anime_search.empty:
    raise IndexError(f"no anime title contains {search_word!r}")
mal_id = anime_search.iloc[0]['MAL_ID']

# Translate the MAL_ID into the row position of that anime in the preference
# matrix; the matrix has a 0..n-1 index, so the label is also the row of csr_data
matrix_rows = user_item_matrix.index[(user_item_matrix['anime_id'] == mal_id).to_numpy()]
if len(matrix_rows) == 0:
    raise IndexError(f"anime with MAL_ID {mal_id} is not present in user_item_matrix")

# anime_id holds the matrix row position from here on (not the MAL_ID)
anime_id = matrix_rows[0]
anime_id

10

In [83]:
# Find the row indices and distances of the anime most similar to the query
# using kneighbors(); one extra neighbour is requested because the query row
# itself is part of the fitted data and is removed from the result later
distances, indices = knn.kneighbors(csr_data[anime_id], n_neighbors = recommendations + 1)

In [84]:
# Matrix row indices of the returned neighbours
indices

array([[   10,  1574,  7449,   245,  6614,  1393, 10451, 11185,  3971,
         5975,  1431,  8646,  4636,  8148,  8292,  4707,  5221,  3564,
         8551,  9011,  6295,  3155,  2656,  5683, 11914,  4798,  7323,
         8625, 10876,   100,  9732]], dtype=int64)

In [85]:
# Cosine distances to those neighbours
distances

array([[0.        , 0.48484735, 0.53506474, 0.53525505, 0.54115287,
        0.54116725, 0.55329588, 0.5554523 , 0.55630542, 0.55755639,
        0.5615995 , 0.56354719, 0.56367479, 0.56376526, 0.56453727,
        0.56479626, 0.56833868, 0.57490752, 0.57576918, 0.57698407,
        0.57781759, 0.57859137, 0.5822861 , 0.58315825, 0.58406771,
        0.58407734, 0.58534013, 0.58558506, 0.58578919, 0.58606276,
        0.58655089]])

In [86]:
# kneighbors() returned 2-D arrays with one row per query; there is a single
# query, so take row 0 and convert it to a plain list with tolist().
# (Indexing row 0 also works when only one neighbour is requested, where
# squeeze() would collapse the array to a scalar.)
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# Pair every row index with its distance: list of (index, distance) tuples
indices_distances = list(zip(indices_list, distances_list))
print(type(indices_distances[0]))

# First three pairs
print(indices_distances[:3])

<class 'tuple'>
[(10, 0.0), (1574, 0.4848473549381255), (7449, 0.5350647362053083)]


In [87]:
# Sort the pairs by distance (second tuple element), closest first
indices_distances_sorted = sorted(indices_distances, key = lambda x: x[1], reverse = False)

# Remove the query itself. It is identified by its row index (anime_id) rather
# than by position, because with tied distances the query is not guaranteed to
# be the first entry. The slice keeps at most `recommendations` results.
indices_distances_sorted = [
    pair for pair in indices_distances_sorted if pair[0] != anime_id
][:recommendations]
indices_distances_sorted

[(1574, 0.4848473549381255),
 (7449, 0.5350647362053083),
 (245, 0.5352550536419549),
 (6614, 0.5411528727499593),
 (1393, 0.5411672493817973),
 (10451, 0.5532958817644472),
 (11185, 0.5554523027139571),
 (3971, 0.556305416792122),
 (5975, 0.5575563883906058),
 (1431, 0.5615994962609439),
 (8646, 0.5635471917514312),
 (4636, 0.5636747917861982),
 (8148, 0.563765264055373),
 (8292, 0.5645372683370697),
 (4707, 0.5647962626848138),
 (5221, 0.568338681947981),
 (3564, 0.5749075202643115),
 (8551, 0.5757691789868328),
 (9011, 0.5769840682879737),
 (6295, 0.5778175926209432),
 (3155, 0.5785913671898735),
 (2656, 0.5822860959921571),
 (5683, 0.5831582494299337),
 (11914, 0.5840677089282889),
 (4798, 0.5840773435473401),
 (7323, 0.5853401271693071),
 (8625, 0.5855850606199466),
 (10876, 0.5857891920154702),
 (100, 0.5860627577434689),
 (9732, 0.5865508892135117)]

In [88]:
# List that receives one {'Name', 'Distance'} record per recommended anime
recom_list = []

# MAL_ID -> title lookup, built once; if a MAL_ID occurs more than once the
# first row wins. Label-based lookup does not depend on anime_data's row index.
name_by_mal_id = anime_data.drop_duplicates(subset='MAL_ID').set_index('MAL_ID')['Name']

# Selecting the 'anime_id' column first keeps its dtype and avoids
# materialising a full matrix row just to read a single value
matrix_anime_ids = user_item_matrix['anime_id']

# Walk through the (matrix row, distance) pairs, closest first
for row_pos, dist in indices_distances_sorted:

    # MAL_ID stored in that row of the preference matrix
    mal_id = matrix_anime_ids.iloc[row_pos]

    # Title of the anime; a visible placeholder is used if anime_data has no such MAL_ID
    Name = name_by_mal_id.get(mal_id, f'Unknown (MAL_ID={mal_id})')

    # Each record is a dict that becomes one element of recom_list
    recom_list.append({'Name' : Name, 'Distance' : dist})

In [89]:
# First (closest) recommendation
recom_list[0]

{'Name': 'Naruto: Shippuuden', 'Distance': 0.4848473549381255}

In [90]:
# Convert the list of records to a DataFrame.
# The index starts at 1 so it reads as a rank; it is sized from the list itself,
# so the frame is still built when fewer records than `recommendations` exist.
recom_df = pd.DataFrame(recom_list, index = range(1, len(recom_list) + 1))
recom_df

Unnamed: 0,Name,Distance
1,Naruto: Shippuuden,0.484847
2,Shingeki no Kyojin,0.535065
3,Bleach,0.535255
4,Sword Art Online,0.541153
5,Death Note,0.541167
6,One Punch Man,0.553296
7,Boku no Hero Academia,0.555452
8,Fullmetal Alchemist: Brotherhood,0.556305
9,Ao no Exorcist,0.557556
10,Code Geass: Hangyaku no Lelouch,0.561599
